Browse Source

Still more fixes

master
J. King 1 year ago
parent
commit
ff4ccff3a8
  1. 12
      lib/Parser.php
  2. 7
      tests/cases/StandardTest.php

12
lib/Parser.php

@@ -272,7 +272,7 @@ class Parser {
$out['rel-urls'][$url][$attr] = trim($link->getAttribute($attr));
}
}
if (strlen($text = $this->getCleanText($link, "p"))) {
if (!isset($out['rel-urls'][$url]['text']) && strlen($text = $this->getCleanText($link, "p"))) {
$out['rel-urls'][$url]['text'] = $text;
}
# if there is no "rels" key in that hash, add it with an empty array value
@@ -979,6 +979,10 @@ class Parser {
foreach ($copy->getElementsByTagName("style") as $e) {
$e->parentNode->removeChild($e);
}
// also drop templates; their contents would not normally be included in textContent
foreach ($copy->getElementsByTagName("template") as $e) {
$e->parentNode->removeChild($e);
}
# replacing any nested <img> elements with their alt attribute, if
# present; otherwise their src attribute, if present, adding a
# space at the beginning and end, resolving the URL if it’s
@@ -1016,7 +1020,7 @@ class Parser {
* @param bool $considerChildren Whether or not child nodes are valid next nodes
*/
protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement {
if ($considerChildren && $node->localName !== "template" && $node->hasChildNodes()) {
if ($considerChildren && $node->hasChildNodes()) {
$node = $node->firstChild;
$next = $node;
} elseif ($node->isSameNode($root)) {
@@ -1024,7 +1028,7 @@ class Parser {
} else {
$next = $node->nextSibling;
}
while ($next && !$next instanceof \DOMElement) {
while ($next && (!$next instanceof \DOMElement || $next->localName === "template")) {
$next = $next->nextSibling;
}
while (!$next) {
@@ -1033,7 +1037,7 @@ class Parser {
return null;
}
$next = $node->nextSibling;
while ($next and !$next instanceof \DOMElement) {
while ($next && (!$next instanceof \DOMElement || $next->localName === "template")) {
$next = $next->nextSibling;
}
}

7
tests/cases/StandardTest.php

@@ -45,9 +45,6 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
"microformats-v2/h-event/time",
"microformats-v2/h-review-aggregate/hevent",
"microformats-v2/h-review-aggregate/simpleproperties",
"microformats-v2/mixed/ignoretemplate",
"microformats-v2/rel/duplicate-rels",
"microformats-v2/rel/varying-text-duplicate-rels",
];
/** @dataProvider provideStandardTests */
@@ -64,6 +61,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
if (preg_match('#^https?://[^/]+$#', $v)) {
$v .= "/";
}
// at least one test has spurious whitespace
$v = trim($v);
});
// parse input
$dom = new DOMParser;
@@ -81,7 +80,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) {
$path = str_replace(self::BASE, "", $path->getPathname());
$path = preg_replace('/\.json$/', '', $path);
yield [$path];
yield $path => [$path];
}
}

Loading…
Cancel
Save