From d7f7cb9586d235de9fb7072cdeffff069551c3b8 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Mon, 19 Jun 2023 17:05:05 -0400 Subject: [PATCH] Bug fixes - Tag relation - Inner whitespace of cleaned text - Deferred properties --- lib/Parser.php | 51 ++++--- tests/cases/StandardTest.php | 40 ++++-- tests/cases/json/phpmf2/hentry/fberriman.html | 44 ++++++ tests/cases/json/phpmf2/hentry/fberriman.json | 136 ++++++++++++++++++ 4 files changed, 242 insertions(+), 29 deletions(-) create mode 100644 tests/cases/json/phpmf2/hentry/fberriman.html create mode 100644 tests/cases/json/phpmf2/hentry/fberriman.json diff --git a/lib/Parser.php b/lib/Parser.php index 1e0b63d..2763ac9 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -120,6 +120,7 @@ class Parser { /** @var array The list of link relations which are backward-compatibility property markers. The format is the same as for backcompat classes */ protected const BACKCOMPAT_RELATIONS = [ // h-review and h-review-agregate also include "self bookmark", but this requires special processing + // the tag relation also requires special processing 'bookmark' => ['h-entry' => ["u", "url"]], 'tag' => ['h-entry' => ["p", "category", [], true], 'h-feed' => ["p", "category"], 'h-review' => ["p", "category"], 'h-review-aggregate' => ["p", "category"]], 'author' => ['h-entry' => ["u", "author", [], true]], @@ -392,7 +393,7 @@ class Parser { return array_values($out); } - protected function matchPropertiesBackcompat(array $classes, array $types, \DOMElement $node): array { + protected function matchPropertiesBackcompat(array &$classes, array $types, \DOMElement $node): array { $props = []; $out = []; foreach ($types as $t) { @@ -420,9 +421,11 @@ class Parser { } } // filter the list of properties for uniqueness by name + // while we're at it we'll also add extra roots where needed foreach ($props as $map) { $prefix = $map[0]; $name = $map[1]; + $extraRoots = $map[2] ?? []; if ( // property with this name has not been seen yet !isset($out[$name]) @@ -430,6 +433,11 @@ class Parser { || (static::PREFIX_RANK[$prefix] > static::PREFIX_RANK[$out[$name][0]] && !($map[3] ?? false)) ) { $out[$name] = $map; + foreach ($extraRoots as $r) { + if (!in_array($r, $classes)) { + $classes[] = $r; + } + } } } return array_values($out); @@ -466,6 +474,13 @@ class Parser { while ($node = $this->nextElement($node ?? $root, $root, !($child = $child ?? false))) { $child = null; $classes = $this->parseTokens($node, "class"); + if ($backcompat) { + # if parsing a backcompat root, parse child element class name(s) for backcompat properties + $properties = $this->matchPropertiesBackcompat($classes, $types, $node); + } else { + # else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*" + $properties = $this->matchPropertiesMf2($classes); + } # parse a child element for microformats (recurse) // NOTE: We do this in a different order from the spec because this seems to be what is actually required if ($childTypes = $this->matchRootsMf2($classes)) { @@ -475,13 +490,6 @@ class Parser { $child = $this->parseMicroformat($node, $childTypes, true); $hasChild = true; } - if ($backcompat) { - # if parsing a backcompat root, parse child element class name(s) for backcompat properties - $properties = $this->matchPropertiesBackcompat($classes, $types, $node); - } else { - # else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*" - $properties = $this->matchPropertiesMf2($classes); - } # [if the element is a microformat and it has no properties] add # found elements that are microformats to the "children" array if ($child && !$properties) { @@ -535,18 +543,16 @@ class Parser { } $out['properties'][$key][] = $value; } - // now add any extra roots to the element's class list; this only ever occurs during backcompat processing - foreach ($extraRoots ?? [] as $r) { - if (!in_array($r, $classes)) { - $classes[] = $r; - } - } } } // add any deferred properties + $known = array_keys($out['properties']); foreach ($deferred as [$key, $value]) { - if (!isset($out['properties'][$key])) { - $out['properties'][$key] = [$value]; + if (!in_array($key, $known)) { + if (!isset($out['properties'][$key])) { + $out['properties'][$key] = []; + } + $out['properties'][$key][] = $value; } } # imply properties for the found microformat @@ -702,6 +708,14 @@ class Parser { } elseif (in_array($node->localName, ["img", "area"]) && $node->hasAttribute("alt")) { # else if img.p-x[alt] or area.p-x[alt], then return the alt attribute return $node->getAttribute("alt"); + } elseif (in_array($node->localName, ["a", "area", "link"]) && array_intersect($backcompatTypes, array_keys(static::BACKCOMPAT_RELATIONS['tag'])) && preg_match('/\btag\b/', $node->getAttribute("rel"))) { + // we have encountered a tag relation during backcompat processing + // https://microformats.org/wiki/rel-tag#Abstract + // we are required to retrieve the last component of the URL path and use that + if (preg_match('#([^/]*)/?$#', URL::fromString($this->normalizeUrl($node->getAttribute("href")))->getPath(), $match)) { + return $match[1]; + } + return ""; } # else return the textContent of the element after [cleaning] return $this->getCleanText($node, $prefix); @@ -804,7 +818,7 @@ class Parser { $classes = $this->parseTokens($node, "class"); $candidate = null; if (!array_intersect(["value", "value-title"], $classes) && ( - ($backcompatTypes && ($this->matchRootsBackcompat($classes) || $this->matchPropertiesBackcompat($classes, $backcompatTypes, $node))) + ($backcompatTypes && ($this->matchPropertiesBackcompat($classes, $backcompatTypes, $node) || $this->matchRootsBackcompat($classes))) || ($this->matchRootsMf2($classes) || $this->matchPropertiesMf2($classes)) )) { // only consider elements which are not themselves properties or roots, unless they have a value @@ -1053,7 +1067,8 @@ class Parser { $e->parentNode->replaceChild($e->ownerDocument->createTextNode($attr), $e); } # removing all leading/trailing spaces - return trim($copy->textContent); + // NOTE: Also remove extraneous spaces within the text; this aligns with most mature implementations + return preg_replace('/\s{2,}/s', " ", trim($copy->textContent)); } protected function getBaseUrl(\DOMElement $root, string $base): string { diff --git a/tests/cases/StandardTest.php b/tests/cases/StandardTest.php index fc0d936..1b689c9 100644 --- a/tests/cases/StandardTest.php +++ b/tests/cases/StandardTest.php @@ -11,7 +11,6 @@ use MensBeam\HTML\DOMParser; /** @covers MensBeam\Microformats\Parser */ class StandardTest extends \PHPUnit\Framework\TestCase { - protected const BASE = \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"; protected const SUPPRESSED = [ 'microformats-v1/hcard/multiple' => "whether vcard keys are p- or u- is unclear", 'microformats-v1/includes/hcarditemref' => "include pattern not implemented", @@ -23,13 +22,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase { ]; /** @dataProvider provideStandardTests */ - public function testStandardTests(string $test): void { - if (isset(self::SUPPRESSED[$test])) { - $this->markTestIncomplete(self::SUPPRESSED[$test]); + public function testStandardTests(string $name, string $path): void { + if (isset(self::SUPPRESSED[$name])) { + $this->markTestIncomplete(self::SUPPRESSED[$name]); } // read data - $exp = json_decode(file_get_contents(self::BASE.$test.".json"), true); - $html = file_get_contents(self::BASE.$test.".html"); + $exp = json_decode(file_get_contents($path.".json"), true); + $html = file_get_contents($path.".html"); // fix up expectation where necessary array_walk_recursive($exp, function(&$v) { // URLs differ trivially from output of our normalization library @@ -38,7 +37,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase { } }); // perform some further monkey-patching on specific tests - $exp = $this->fixTests($exp, $test); + $exp = $this->fixTests($exp, $name); // parse input $dom = new DOMParser; $parser = new Parser; @@ -48,14 +47,31 @@ class StandardTest extends \PHPUnit\Framework\TestCase { $this->ksort($exp); $this->ksort($act); // run comparison + if (!$exp) { + echo json_encode($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES | \JSON_UNESCAPED_UNICODE); + exit; + } $this->assertSame($exp, $act); } public function provideStandardTests(): \Generator { - foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) { - $path = str_replace(self::BASE, "", $path->getPathname()); - $path = preg_replace('/\.json$/', '', $path); - yield $path => [$path]; + return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"); + } + + protected function provideTestList(): \Generator { + $tests = [ + //\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests + \MensBeam\Microformats\BASE."tests/cases/json/", // additional tests + ]; + foreach ($tests as $base) { + $base = strtr($base, "\\", "/"); + foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) { + $path = $file->getPathname(); + $path = preg_replace('/\.json$/', '', $path); + $name = strtr($path, "\\", "/"); + $name = str_replace(strtr($base, "\\", "/"), "", $name); + yield $name => [$name, $path]; + } } } @@ -91,6 +107,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase { $this->fixDates($exp['items'][0]['properties']['bday']); $this->fixDates($exp['items'][0]['properties']['rev']); break; + case "phpmf2/hentry/fberriman": + $this->fixDates($exp['items'][0]['properties']['published']); } return $exp; } diff --git a/tests/cases/json/phpmf2/hentry/fberriman.html b/tests/cases/json/phpmf2/hentry/fberriman.html new file mode 100644 index 0000000..5eaf90e --- /dev/null +++ b/tests/cases/json/phpmf2/hentry/fberriman.html @@ -0,0 +1,44 @@ + +
+
+

+ April recap – TXJS & Front-Trends +

+ + +
+ +
+

April was pretty decent. I got to attend two very good conferences and I got to speak at them.

+
+ + +
\ No newline at end of file diff --git a/tests/cases/json/phpmf2/hentry/fberriman.json b/tests/cases/json/phpmf2/hentry/fberriman.json new file mode 100644 index 0000000..e3e190a --- /dev/null +++ b/tests/cases/json/phpmf2/hentry/fberriman.json @@ -0,0 +1,136 @@ +{ + "items": [ + { + "type": [ + "h-entry" + ], + "properties": { + "name": [ + "April recap \u2013 TXJS & Front-Trends" + ], + "published": [ + "2013-05-14T11:54:06+0000" + ], + "category": [ + "speaking", + "web-dev", + "conferences", + "front-trends", + "fronttrends", + "speaking", + "txjs" + ], + "url": [ + "http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/", + "http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" + ], + "content": [ + { + "html": "

April was pretty decent. I got to attend two very good conferences and I got to speak at them.

", + "value": "April was pretty decent. I got to attend two very good conferences and I got to speak at them." + } + ], + "author": [ + { + "type": [ + "h-card" + ], + "properties": { + "name": [ + "Frances" + ], + "url": [ + "http://fberriman.com/author/admin/" + ] + }, + "value": "Frances" + } + ] + }, + "id": "post-976" + } + ], + "rels": { + "bookmark": [ + "http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" + ], + "category": [ + "http://fberriman.com/category/speaking/", + "http://fberriman.com/category/web-dev/" + ], + "tag": [ + "http://fberriman.com/category/speaking/", + "http://fberriman.com/category/web-dev/", + "http://fberriman.com/tag/conferences/", + "http://fberriman.com/tag/front-trends/", + "http://fberriman.com/tag/fronttrends/", + "http://fberriman.com/tag/speaking/", + "http://fberriman.com/tag/txjs/" + ], + "author": [ + "http://fberriman.com/author/admin/" + ] + }, + "rel-urls": { + "http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/": { + "title": "Permalink to April recap \u2013 TXJS & Front-Trends", + "text": "April recap \u2013 TXJS & Front-Trends", + "rels": [ + "bookmark" + ] + }, + "http://fberriman.com/category/speaking/": { + "title": "View all posts in Speaking", + "text": "Speaking", + "rels": [ + "category", + "tag" + ] + }, + "http://fberriman.com/category/web-dev/": { + "title": "View all posts in Web Dev", + "text": "Web Dev", + "rels": [ + "category", + "tag" + ] + }, + "http://fberriman.com/tag/conferences/": { + "text": "conferences", + "rels": [ + "tag" + ] + }, + "http://fberriman.com/tag/front-trends/": { + "text": "front-trends", + "rels": [ + "tag" + ] + }, + "http://fberriman.com/tag/fronttrends/": { + "text": "fronttrends", + "rels": [ + "tag" + ] + }, + "http://fberriman.com/tag/speaking/": { + "text": "Speaking", + "rels": [ + "tag" + ] + }, + "http://fberriman.com/tag/txjs/": { + "text": "txjs", + "rels": [ + "tag" + ] + }, + "http://fberriman.com/author/admin/": { + "title": "View all posts by Frances", + "text": "Frances", + "rels": [ + "author" + ] + } + } +} \ No newline at end of file