Bug fixes

- Tag relation
- Inner whitespace of cleaned text
- Deferred properties
This commit is contained in:
J. King 2023-06-19 17:05:05 -04:00
parent 4eee308b22
commit d7f7cb9586
4 changed files with 242 additions and 29 deletions

View file

@ -120,6 +120,7 @@ class Parser {
/** @var array The list of link relations which are backward-compatibility property markers. The format is the same as for backcompat classes */
protected const BACKCOMPAT_RELATIONS = [
// h-review and h-review-aggregate also include "self bookmark", but this requires special processing
// the tag relation also requires special processing
'bookmark' => ['h-entry' => ["u", "url"]],
'tag' => ['h-entry' => ["p", "category", [], true], 'h-feed' => ["p", "category"], 'h-review' => ["p", "category"], 'h-review-aggregate' => ["p", "category"]],
'author' => ['h-entry' => ["u", "author", [], true]],
@ -392,7 +393,7 @@ class Parser {
return array_values($out);
}
protected function matchPropertiesBackcompat(array $classes, array $types, \DOMElement $node): array {
protected function matchPropertiesBackcompat(array &$classes, array $types, \DOMElement $node): array {
$props = [];
$out = [];
foreach ($types as $t) {
@ -420,9 +421,11 @@ class Parser {
}
}
// filter the list of properties for uniqueness by name
// while we're at it we'll also add extra roots where needed
foreach ($props as $map) {
$prefix = $map[0];
$name = $map[1];
$extraRoots = $map[2] ?? [];
if (
// property with this name has not been seen yet
!isset($out[$name])
@ -430,6 +433,11 @@ class Parser {
|| (static::PREFIX_RANK[$prefix] > static::PREFIX_RANK[$out[$name][0]] && !($map[3] ?? false))
) {
$out[$name] = $map;
foreach ($extraRoots as $r) {
if (!in_array($r, $classes)) {
$classes[] = $r;
}
}
}
}
return array_values($out);
@ -466,6 +474,13 @@ class Parser {
while ($node = $this->nextElement($node ?? $root, $root, !($child = $child ?? false))) {
$child = null;
$classes = $this->parseTokens($node, "class");
if ($backcompat) {
# if parsing a backcompat root, parse child element class name(s) for backcompat properties
$properties = $this->matchPropertiesBackcompat($classes, $types, $node);
} else {
# else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
$properties = $this->matchPropertiesMf2($classes);
}
# parse a child element for microformats (recurse)
// NOTE: We do this in a different order from the spec because this seems to be what is actually required
if ($childTypes = $this->matchRootsMf2($classes)) {
@ -475,13 +490,6 @@ class Parser {
$child = $this->parseMicroformat($node, $childTypes, true);
$hasChild = true;
}
if ($backcompat) {
# if parsing a backcompat root, parse child element class name(s) for backcompat properties
$properties = $this->matchPropertiesBackcompat($classes, $types, $node);
} else {
# else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
$properties = $this->matchPropertiesMf2($classes);
}
# [if the element is a microformat and it has no properties] add
# found elements that are microformats to the "children" array
if ($child && !$properties) {
@ -535,18 +543,16 @@ class Parser {
}
$out['properties'][$key][] = $value;
}
// now add any extra roots to the element's class list; this only ever occurs during backcompat processing
foreach ($extraRoots ?? [] as $r) {
if (!in_array($r, $classes)) {
$classes[] = $r;
}
}
}
}
// add any deferred properties
$known = array_keys($out['properties']);
foreach ($deferred as [$key, $value]) {
if (!isset($out['properties'][$key])) {
$out['properties'][$key] = [$value];
if (!in_array($key, $known)) {
if (!isset($out['properties'][$key])) {
$out['properties'][$key] = [];
}
$out['properties'][$key][] = $value;
}
}
# imply properties for the found microformat
@ -702,6 +708,14 @@ class Parser {
} elseif (in_array($node->localName, ["img", "area"]) && $node->hasAttribute("alt")) {
# else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
return $node->getAttribute("alt");
} elseif (in_array($node->localName, ["a", "area", "link"]) && array_intersect($backcompatTypes, array_keys(static::BACKCOMPAT_RELATIONS['tag'])) && preg_match('/\btag\b/', $node->getAttribute("rel"))) {
// we have encountered a tag relation during backcompat processing
// https://microformats.org/wiki/rel-tag#Abstract
// we are required to retrieve the last component of the URL path and use that
if (preg_match('#([^/]*)/?$#', URL::fromString($this->normalizeUrl($node->getAttribute("href")))->getPath(), $match)) {
return $match[1];
}
return "";
}
# else return the textContent of the element after [cleaning]
return $this->getCleanText($node, $prefix);
@ -804,7 +818,7 @@ class Parser {
$classes = $this->parseTokens($node, "class");
$candidate = null;
if (!array_intersect(["value", "value-title"], $classes) && (
($backcompatTypes && ($this->matchRootsBackcompat($classes) || $this->matchPropertiesBackcompat($classes, $backcompatTypes, $node)))
($backcompatTypes && ($this->matchPropertiesBackcompat($classes, $backcompatTypes, $node) || $this->matchRootsBackcompat($classes)))
|| ($this->matchRootsMf2($classes) || $this->matchPropertiesMf2($classes))
)) {
// only consider elements which are not themselves properties or roots, unless they have a value
@ -1053,7 +1067,8 @@ class Parser {
$e->parentNode->replaceChild($e->ownerDocument->createTextNode($attr), $e);
}
# removing all leading/trailing spaces
return trim($copy->textContent);
// NOTE: Also remove extraneous spaces within the text; this aligns with most mature implementations
return preg_replace('/\s{2,}/s', " ", trim($copy->textContent));
}
protected function getBaseUrl(\DOMElement $root, string $base): string {

View file

@ -11,7 +11,6 @@ use MensBeam\HTML\DOMParser;
/** @covers MensBeam\Microformats\Parser */
class StandardTest extends \PHPUnit\Framework\TestCase {
protected const BASE = \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/";
protected const SUPPRESSED = [
'microformats-v1/hcard/multiple' => "whether vcard keys are p- or u- is unclear",
'microformats-v1/includes/hcarditemref' => "include pattern not implemented",
@ -23,13 +22,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
];
/** @dataProvider provideStandardTests */
public function testStandardTests(string $test): void {
if (isset(self::SUPPRESSED[$test])) {
$this->markTestIncomplete(self::SUPPRESSED[$test]);
public function testStandardTests(string $name, string $path): void {
if (isset(self::SUPPRESSED[$name])) {
$this->markTestIncomplete(self::SUPPRESSED[$name]);
}
// read data
$exp = json_decode(file_get_contents(self::BASE.$test.".json"), true);
$html = file_get_contents(self::BASE.$test.".html");
$exp = json_decode(file_get_contents($path.".json"), true);
$html = file_get_contents($path.".html");
// fix up expectation where necessary
array_walk_recursive($exp, function(&$v) {
// URLs differ trivially from output of our normalization library
@ -38,7 +37,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
}
});
// perform some further monkey-patching on specific tests
$exp = $this->fixTests($exp, $test);
$exp = $this->fixTests($exp, $name);
// parse input
$dom = new DOMParser;
$parser = new Parser;
@ -48,14 +47,31 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$this->ksort($exp);
$this->ksort($act);
// run comparison
if (!$exp) {
echo json_encode($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES | \JSON_UNESCAPED_UNICODE);
exit;
}
$this->assertSame($exp, $act);
}
public function provideStandardTests(): \Generator {
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) {
$path = str_replace(self::BASE, "", $path->getPathname());
$path = preg_replace('/\.json$/', '', $path);
yield $path => [$path];
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
}
/**
 * Yields one data set per JSON test expectation found beneath the listed test directories.
 *
 * Every directory is searched recursively for files ending in ".json".
 * Each data set is keyed by the test name and contains two strings:
 *  - the test name: the file's path with backslashes converted to forward
 *    slashes, the directory prefix removed, and the ".json" suffix removed
 *  - the path: the file's path as reported by the iterator, with only the
 *    ".json" suffix removed
 *
 * @return \Generator<string, array{0: string, 1: string}>
 */
protected function provideTestList(): \Generator {
    $roots = [
        //\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
        \MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
    ];
    foreach ($roots as $root) {
        // use forward slashes throughout so that the prefix can be matched against file names below
        $root = strtr($root, "\\", "/");
        $files = new \RegexIterator(
            new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($root)),
            '/\.json$/'
        );
        foreach ($files as $entry) {
            // the path retains whatever directory separators the iterator reported
            $path = preg_replace('/\.json$/', '', $entry->getPathname());
            // the name is the separator-normalized path with the directory prefix dropped
            $name = str_replace($root, "", strtr($path, "\\", "/"));
            yield $name => [$name, $path];
        }
    }
}
@ -91,6 +107,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$this->fixDates($exp['items'][0]['properties']['bday']);
$this->fixDates($exp['items'][0]['properties']['rev']);
break;
case "phpmf2/hentry/fberriman":
$this->fixDates($exp['items'][0]['properties']['published']);
}
return $exp;
}

View file

@ -0,0 +1,44 @@
<!--
The php-mf2 library tests only the "tag" link relation with this file.
The JSON output is a synthesis of what most implementations do, along
with the addition of the <article> element's ID (as per the general
parsing steps), as well as the transformation of the "entry-date"
class, which is more obscure.
-->
<article id="post-976" class="post-976 post type-post status-publish format-standard hentry category-speaking category-web-dev tag-conferences tag-front-trends tag-fronttrends tag-speaking tag-txjs">
<header class="entry-header">
<h1 class="entry-title">
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" rel="bookmark">April recap &#8211; TXJS &#038; Front-Trends</a>
</h1>
<div class="entry-meta">
<span class="date">
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" title="Permalink to April recap &#8211; TXJS &amp; Front-Trends" rel="bookmark">
<time class="entry-date" datetime="2013-05-14T11:54:06+00:00">May 14, 2013</time>
</a>
</span>
<span class="categories-links">
<a href="http://fberriman.com/category/speaking/" title="View all posts in Speaking" rel="category tag">Speaking</a>,
<a href="http://fberriman.com/category/web-dev/" title="View all posts in Web Dev" rel="category tag">Web Dev</a>
</span>
<span class="tags-links">
<a href="http://fberriman.com/tag/conferences/" rel="tag">conferences</a>,
<a href="http://fberriman.com/tag/front-trends/" rel="tag">front-trends</a>,
<a href="http://fberriman.com/tag/fronttrends/" rel="tag">fronttrends</a>,
<a href="http://fberriman.com/tag/speaking/" rel="tag">Speaking</a>,
<a href="http://fberriman.com/tag/txjs/" rel="tag">txjs</a>
</span>
<span class="author vcard"><a class="url fn n" href="http://fberriman.com/author/admin/" title="View all posts by Frances" rel="author">Frances</a></span> </div>
</header>
<div class="entry-content">
<p>April was pretty decent. I got to attend two very good conferences <strong>and</strong> I got to speak at them.</p>
</div>
<footer class="entry-meta">
<div class="comments-link">
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/#respond" title="Comment on April recap &#8211; TXJS &amp; Front-Trends"><span class="leave-reply">Leave a comment</span></a>
</div>
</footer><!-- .entry-meta -->
</article><!-- #post -->

View file

@ -0,0 +1,136 @@
{
"items": [
{
"type": [
"h-entry"
],
"properties": {
"name": [
"April recap \u2013 TXJS & Front-Trends"
],
"published": [
"2013-05-14T11:54:06+0000"
],
"category": [
"speaking",
"web-dev",
"conferences",
"front-trends",
"fronttrends",
"speaking",
"txjs"
],
"url": [
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/",
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/"
],
"content": [
{
"html": "<p>April was pretty decent. I got to attend two very good conferences <strong>and</strong> I got to speak at them.</p>",
"value": "April was pretty decent. I got to attend two very good conferences and I got to speak at them."
}
],
"author": [
{
"type": [
"h-card"
],
"properties": {
"name": [
"Frances"
],
"url": [
"http://fberriman.com/author/admin/"
]
},
"value": "Frances"
}
]
},
"id": "post-976"
}
],
"rels": {
"bookmark": [
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/"
],
"category": [
"http://fberriman.com/category/speaking/",
"http://fberriman.com/category/web-dev/"
],
"tag": [
"http://fberriman.com/category/speaking/",
"http://fberriman.com/category/web-dev/",
"http://fberriman.com/tag/conferences/",
"http://fberriman.com/tag/front-trends/",
"http://fberriman.com/tag/fronttrends/",
"http://fberriman.com/tag/speaking/",
"http://fberriman.com/tag/txjs/"
],
"author": [
"http://fberriman.com/author/admin/"
]
},
"rel-urls": {
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/": {
"title": "Permalink to April recap \u2013 TXJS & Front-Trends",
"text": "April recap \u2013 TXJS & Front-Trends",
"rels": [
"bookmark"
]
},
"http://fberriman.com/category/speaking/": {
"title": "View all posts in Speaking",
"text": "Speaking",
"rels": [
"category",
"tag"
]
},
"http://fberriman.com/category/web-dev/": {
"title": "View all posts in Web Dev",
"text": "Web Dev",
"rels": [
"category",
"tag"
]
},
"http://fberriman.com/tag/conferences/": {
"text": "conferences",
"rels": [
"tag"
]
},
"http://fberriman.com/tag/front-trends/": {
"text": "front-trends",
"rels": [
"tag"
]
},
"http://fberriman.com/tag/fronttrends/": {
"text": "fronttrends",
"rels": [
"tag"
]
},
"http://fberriman.com/tag/speaking/": {
"text": "Speaking",
"rels": [
"tag"
]
},
"http://fberriman.com/tag/txjs/": {
"text": "txjs",
"rels": [
"tag"
]
},
"http://fberriman.com/author/admin/": {
"title": "View all posts by Frances",
"text": "Frances",
"rels": [
"author"
]
}
}
}