Bug fixes
- Tag relation - Inner whitespace of cleaned text - Deferred properties
This commit is contained in:
parent
4eee308b22
commit
d7f7cb9586
4 changed files with 242 additions and 29 deletions
|
@ -120,6 +120,7 @@ class Parser {
|
|||
/** @var array The list of link relations which are backward-compatibility property markers. The format is the same as for backcompat classes */
|
||||
protected const BACKCOMPAT_RELATIONS = [
|
||||
// h-review and h-review-aggregate also include "self bookmark", but this requires special processing
|
||||
// the tag relation also requires special processing
|
||||
'bookmark' => ['h-entry' => ["u", "url"]],
|
||||
'tag' => ['h-entry' => ["p", "category", [], true], 'h-feed' => ["p", "category"], 'h-review' => ["p", "category"], 'h-review-aggregate' => ["p", "category"]],
|
||||
'author' => ['h-entry' => ["u", "author", [], true]],
|
||||
|
@ -392,7 +393,7 @@ class Parser {
|
|||
return array_values($out);
|
||||
}
|
||||
|
||||
protected function matchPropertiesBackcompat(array $classes, array $types, \DOMElement $node): array {
|
||||
protected function matchPropertiesBackcompat(array &$classes, array $types, \DOMElement $node): array {
|
||||
$props = [];
|
||||
$out = [];
|
||||
foreach ($types as $t) {
|
||||
|
@ -420,9 +421,11 @@ class Parser {
|
|||
}
|
||||
}
|
||||
// filter the list of properties for uniqueness by name
|
||||
// while we're at it we'll also add extra roots where needed
|
||||
foreach ($props as $map) {
|
||||
$prefix = $map[0];
|
||||
$name = $map[1];
|
||||
$extraRoots = $map[2] ?? [];
|
||||
if (
|
||||
// property with this name has not been seen yet
|
||||
!isset($out[$name])
|
||||
|
@ -430,6 +433,11 @@ class Parser {
|
|||
|| (static::PREFIX_RANK[$prefix] > static::PREFIX_RANK[$out[$name][0]] && !($map[3] ?? false))
|
||||
) {
|
||||
$out[$name] = $map;
|
||||
foreach ($extraRoots as $r) {
|
||||
if (!in_array($r, $classes)) {
|
||||
$classes[] = $r;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return array_values($out);
|
||||
|
@ -466,6 +474,13 @@ class Parser {
|
|||
while ($node = $this->nextElement($node ?? $root, $root, !($child = $child ?? false))) {
|
||||
$child = null;
|
||||
$classes = $this->parseTokens($node, "class");
|
||||
if ($backcompat) {
|
||||
# if parsing a backcompat root, parse child element class name(s) for backcompat properties
|
||||
$properties = $this->matchPropertiesBackcompat($classes, $types, $node);
|
||||
} else {
|
||||
# else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
|
||||
$properties = $this->matchPropertiesMf2($classes);
|
||||
}
|
||||
# parse a child element for microformats (recurse)
|
||||
// NOTE: We do this in a different order from the spec because this seems to be what is actually required
|
||||
if ($childTypes = $this->matchRootsMf2($classes)) {
|
||||
|
@ -475,13 +490,6 @@ class Parser {
|
|||
$child = $this->parseMicroformat($node, $childTypes, true);
|
||||
$hasChild = true;
|
||||
}
|
||||
if ($backcompat) {
|
||||
# if parsing a backcompat root, parse child element class name(s) for backcompat properties
|
||||
$properties = $this->matchPropertiesBackcompat($classes, $types, $node);
|
||||
} else {
|
||||
# else parse a child element class for property class name(s) "p-*,u-*,dt-*,e-*"
|
||||
$properties = $this->matchPropertiesMf2($classes);
|
||||
}
|
||||
# [if the element is a microformat and it has no properties] add
|
||||
# found elements that are microformats to the "children" array
|
||||
if ($child && !$properties) {
|
||||
|
@ -535,18 +543,16 @@ class Parser {
|
|||
}
|
||||
$out['properties'][$key][] = $value;
|
||||
}
|
||||
// now add any extra roots to the element's class list; this only ever occurs during backcompat processing
|
||||
foreach ($extraRoots ?? [] as $r) {
|
||||
if (!in_array($r, $classes)) {
|
||||
$classes[] = $r;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// add any deferred properties
|
||||
$known = array_keys($out['properties']);
|
||||
foreach ($deferred as [$key, $value]) {
|
||||
if (!isset($out['properties'][$key])) {
|
||||
$out['properties'][$key] = [$value];
|
||||
if (!in_array($key, $known)) {
|
||||
if (!isset($out['properties'][$key])) {
|
||||
$out['properties'][$key] = [];
|
||||
}
|
||||
$out['properties'][$key][] = $value;
|
||||
}
|
||||
}
|
||||
# imply properties for the found microformat
|
||||
|
@ -702,6 +708,14 @@ class Parser {
|
|||
} elseif (in_array($node->localName, ["img", "area"]) && $node->hasAttribute("alt")) {
|
||||
# else if img.p-x[alt] or area.p-x[alt], then return the alt attribute
|
||||
return $node->getAttribute("alt");
|
||||
} elseif (in_array($node->localName, ["a", "area", "link"]) && array_intersect($backcompatTypes, array_keys(static::BACKCOMPAT_RELATIONS['tag'])) && preg_match('/\btag\b/', $node->getAttribute("rel"))) {
|
||||
// we have encountered a tag relation during backcompat processing
|
||||
// https://microformats.org/wiki/rel-tag#Abstract
|
||||
// we are required to retrieve the last component of the URL path and use that
|
||||
if (preg_match('#([^/]*)/?$#', URL::fromString($this->normalizeUrl($node->getAttribute("href")))->getPath(), $match)) {
|
||||
return $match[1];
|
||||
}
|
||||
return "";
|
||||
}
|
||||
# else return the textContent of the element after [cleaning]
|
||||
return $this->getCleanText($node, $prefix);
|
||||
|
@ -804,7 +818,7 @@ class Parser {
|
|||
$classes = $this->parseTokens($node, "class");
|
||||
$candidate = null;
|
||||
if (!array_intersect(["value", "value-title"], $classes) && (
|
||||
($backcompatTypes && ($this->matchRootsBackcompat($classes) || $this->matchPropertiesBackcompat($classes, $backcompatTypes, $node)))
|
||||
($backcompatTypes && ($this->matchPropertiesBackcompat($classes, $backcompatTypes, $node) || $this->matchRootsBackcompat($classes)))
|
||||
|| ($this->matchRootsMf2($classes) || $this->matchPropertiesMf2($classes))
|
||||
)) {
|
||||
// only consider elements which are not themselves properties or roots, unless they have a value
|
||||
|
@ -1053,7 +1067,8 @@ class Parser {
|
|||
$e->parentNode->replaceChild($e->ownerDocument->createTextNode($attr), $e);
|
||||
}
|
||||
# removing all leading/trailing spaces
|
||||
return trim($copy->textContent);
|
||||
// NOTE: Also remove extraneous spaces within the text; this aligns with most mature implementations
|
||||
return preg_replace('/\s{2,}/s', " ", trim($copy->textContent));
|
||||
}
|
||||
|
||||
protected function getBaseUrl(\DOMElement $root, string $base): string {
|
||||
|
|
|
@ -11,7 +11,6 @@ use MensBeam\HTML\DOMParser;
|
|||
|
||||
/** @covers MensBeam\Microformats\Parser */
|
||||
class StandardTest extends \PHPUnit\Framework\TestCase {
|
||||
protected const BASE = \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/";
|
||||
protected const SUPPRESSED = [
|
||||
'microformats-v1/hcard/multiple' => "whether vcard keys are p- or u- is unclear",
|
||||
'microformats-v1/includes/hcarditemref' => "include pattern not implemented",
|
||||
|
@ -23,13 +22,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
];
|
||||
|
||||
/** @dataProvider provideStandardTests */
|
||||
public function testStandardTests(string $test): void {
|
||||
if (isset(self::SUPPRESSED[$test])) {
|
||||
$this->markTestIncomplete(self::SUPPRESSED[$test]);
|
||||
public function testStandardTests(string $name, string $path): void {
|
||||
if (isset(self::SUPPRESSED[$name])) {
|
||||
$this->markTestIncomplete(self::SUPPRESSED[$name]);
|
||||
}
|
||||
// read data
|
||||
$exp = json_decode(file_get_contents(self::BASE.$test.".json"), true);
|
||||
$html = file_get_contents(self::BASE.$test.".html");
|
||||
$exp = json_decode(file_get_contents($path.".json"), true);
|
||||
$html = file_get_contents($path.".html");
|
||||
// fix up expectation where necessary
|
||||
array_walk_recursive($exp, function(&$v) {
|
||||
// URLs differ trivially from output of our normalization library
|
||||
|
@ -38,7 +37,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
}
|
||||
});
|
||||
// perform some further monkey-patching on specific tests
|
||||
$exp = $this->fixTests($exp, $test);
|
||||
$exp = $this->fixTests($exp, $name);
|
||||
// parse input
|
||||
$dom = new DOMParser;
|
||||
$parser = new Parser;
|
||||
|
@ -48,14 +47,31 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
$this->ksort($exp);
|
||||
$this->ksort($act);
|
||||
// run comparison
|
||||
if (!$exp) {
|
||||
echo json_encode($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES | \JSON_UNESCAPED_UNICODE);
|
||||
exit;
|
||||
}
|
||||
$this->assertSame($exp, $act);
|
||||
}
|
||||
|
||||
public function provideStandardTests(): \Generator {
|
||||
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) {
|
||||
$path = str_replace(self::BASE, "", $path->getPathname());
|
||||
$path = preg_replace('/\.json$/', '', $path);
|
||||
yield $path => [$path];
|
||||
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
|
||||
}
|
||||
|
||||
protected function provideTestList(): \Generator {
|
||||
$tests = [
|
||||
//\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
|
||||
\MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
|
||||
];
|
||||
foreach ($tests as $base) {
|
||||
$base = strtr($base, "\\", "/");
|
||||
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
|
||||
$path = $file->getPathname();
|
||||
$path = preg_replace('/\.json$/', '', $path);
|
||||
$name = strtr($path, "\\", "/");
|
||||
$name = str_replace(strtr($base, "\\", "/"), "", $name);
|
||||
yield $name => [$name, $path];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -91,6 +107,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
$this->fixDates($exp['items'][0]['properties']['bday']);
|
||||
$this->fixDates($exp['items'][0]['properties']['rev']);
|
||||
break;
|
||||
case "phpmf2/hentry/fberriman":
|
||||
$this->fixDates($exp['items'][0]['properties']['published']);
|
||||
}
|
||||
return $exp;
|
||||
}
|
||||
|
|
44
tests/cases/json/phpmf2/hentry/fberriman.html
Normal file
44
tests/cases/json/phpmf2/hentry/fberriman.html
Normal file
|
@ -0,0 +1,44 @@
|
|||
<!--
|
||||
The php-mf2 library tests only the "tag" link relation with this file.
|
||||
The JSON output is a synthesis of what most implementations do, along
|
||||
with the addition of the <article> element's ID (as per the general
|
||||
parsing steps), as well as the transformation of the "entry-date"
|
||||
class, which is more obscure.
|
||||
-->
|
||||
<article id="post-976" class="post-976 post type-post status-publish format-standard hentry category-speaking category-web-dev tag-conferences tag-front-trends tag-fronttrends tag-speaking tag-txjs">
|
||||
<header class="entry-header">
|
||||
<h1 class="entry-title">
|
||||
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" rel="bookmark">April recap – TXJS & Front-Trends</a>
|
||||
</h1>
|
||||
|
||||
<div class="entry-meta">
|
||||
<span class="date">
|
||||
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/" title="Permalink to April recap – TXJS & Front-Trends" rel="bookmark">
|
||||
<time class="entry-date" datetime="2013-05-14T11:54:06+00:00">May 14, 2013</time>
|
||||
</a>
|
||||
</span>
|
||||
<span class="categories-links">
|
||||
<a href="http://fberriman.com/category/speaking/" title="View all posts in Speaking" rel="category tag">Speaking</a>,
|
||||
<a href="http://fberriman.com/category/web-dev/" title="View all posts in Web Dev" rel="category tag">Web Dev</a>
|
||||
</span>
|
||||
<span class="tags-links">
|
||||
<a href="http://fberriman.com/tag/conferences/" rel="tag">conferences</a>,
|
||||
<a href="http://fberriman.com/tag/front-trends/" rel="tag">front-trends</a>,
|
||||
<a href="http://fberriman.com/tag/fronttrends/" rel="tag">fronttrends</a>,
|
||||
<a href="http://fberriman.com/tag/speaking/" rel="tag">Speaking</a>,
|
||||
<a href="http://fberriman.com/tag/txjs/" rel="tag">txjs</a>
|
||||
</span>
|
||||
<span class="author vcard"><a class="url fn n" href="http://fberriman.com/author/admin/" title="View all posts by Frances" rel="author">Frances</a></span> </div>
|
||||
</header>
|
||||
|
||||
<div class="entry-content">
|
||||
<p>April was pretty decent. I got to attend two very good conferences <strong>and</strong> I got to speak at them.</p>
|
||||
</div>
|
||||
|
||||
<footer class="entry-meta">
|
||||
<div class="comments-link">
|
||||
<a href="http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/#respond" title="Comment on April recap – TXJS & Front-Trends"><span class="leave-reply">Leave a comment</span></a>
|
||||
</div>
|
||||
|
||||
</footer><!-- .entry-meta -->
|
||||
</article><!-- #post -->
|
136
tests/cases/json/phpmf2/hentry/fberriman.json
Normal file
136
tests/cases/json/phpmf2/hentry/fberriman.json
Normal file
|
@ -0,0 +1,136 @@
|
|||
{
|
||||
"items": [
|
||||
{
|
||||
"type": [
|
||||
"h-entry"
|
||||
],
|
||||
"properties": {
|
||||
"name": [
|
||||
"April recap \u2013 TXJS & Front-Trends"
|
||||
],
|
||||
"published": [
|
||||
"2013-05-14T11:54:06+0000"
|
||||
],
|
||||
"category": [
|
||||
"speaking",
|
||||
"web-dev",
|
||||
"conferences",
|
||||
"front-trends",
|
||||
"fronttrends",
|
||||
"speaking",
|
||||
"txjs"
|
||||
],
|
||||
"url": [
|
||||
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/",
|
||||
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/"
|
||||
],
|
||||
"content": [
|
||||
{
|
||||
"html": "<p>April was pretty decent. I got to attend two very good conferences <strong>and</strong> I got to speak at them.</p>",
|
||||
"value": "April was pretty decent. I got to attend two very good conferences and I got to speak at them."
|
||||
}
|
||||
],
|
||||
"author": [
|
||||
{
|
||||
"type": [
|
||||
"h-card"
|
||||
],
|
||||
"properties": {
|
||||
"name": [
|
||||
"Frances"
|
||||
],
|
||||
"url": [
|
||||
"http://fberriman.com/author/admin/"
|
||||
]
|
||||
},
|
||||
"value": "Frances"
|
||||
}
|
||||
]
|
||||
},
|
||||
"id": "post-976"
|
||||
}
|
||||
],
|
||||
"rels": {
|
||||
"bookmark": [
|
||||
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/"
|
||||
],
|
||||
"category": [
|
||||
"http://fberriman.com/category/speaking/",
|
||||
"http://fberriman.com/category/web-dev/"
|
||||
],
|
||||
"tag": [
|
||||
"http://fberriman.com/category/speaking/",
|
||||
"http://fberriman.com/category/web-dev/",
|
||||
"http://fberriman.com/tag/conferences/",
|
||||
"http://fberriman.com/tag/front-trends/",
|
||||
"http://fberriman.com/tag/fronttrends/",
|
||||
"http://fberriman.com/tag/speaking/",
|
||||
"http://fberriman.com/tag/txjs/"
|
||||
],
|
||||
"author": [
|
||||
"http://fberriman.com/author/admin/"
|
||||
]
|
||||
},
|
||||
"rel-urls": {
|
||||
"http://fberriman.com/2013/05/14/april-recap-txjs-front-trends/": {
|
||||
"title": "Permalink to April recap \u2013 TXJS & Front-Trends",
|
||||
"text": "April recap \u2013 TXJS & Front-Trends",
|
||||
"rels": [
|
||||
"bookmark"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/category/speaking/": {
|
||||
"title": "View all posts in Speaking",
|
||||
"text": "Speaking",
|
||||
"rels": [
|
||||
"category",
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/category/web-dev/": {
|
||||
"title": "View all posts in Web Dev",
|
||||
"text": "Web Dev",
|
||||
"rels": [
|
||||
"category",
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/tag/conferences/": {
|
||||
"text": "conferences",
|
||||
"rels": [
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/tag/front-trends/": {
|
||||
"text": "front-trends",
|
||||
"rels": [
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/tag/fronttrends/": {
|
||||
"text": "fronttrends",
|
||||
"rels": [
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/tag/speaking/": {
|
||||
"text": "Speaking",
|
||||
"rels": [
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/tag/txjs/": {
|
||||
"text": "txjs",
|
||||
"rels": [
|
||||
"tag"
|
||||
]
|
||||
},
|
||||
"http://fberriman.com/author/admin/": {
|
||||
"title": "View all posts by Frances",
|
||||
"text": "Frances",
|
||||
"rels": [
|
||||
"author"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue