Browse Source

Still more fixes

master
J. King 1 year ago
parent
commit
ff4ccff3a8
  1. 12
      lib/Parser.php
  2. 7
      tests/cases/StandardTest.php

12
lib/Parser.php

@ -272,7 +272,7 @@ class Parser {
$out['rel-urls'][$url][$attr] = trim($link->getAttribute($attr)); $out['rel-urls'][$url][$attr] = trim($link->getAttribute($attr));
} }
} }
if (strlen($text = $this->getCleanText($link, "p"))) { if (!isset($out['rel-urls'][$url]['text']) && strlen($text = $this->getCleanText($link, "p"))) {
$out['rel-urls'][$url]['text'] = $text; $out['rel-urls'][$url]['text'] = $text;
} }
# if there is no "rels" key in that hash, add it with an empty array value # if there is no "rels" key in that hash, add it with an empty array value
@ -979,6 +979,10 @@ class Parser {
foreach ($copy->getElementsByTagName("style") as $e) { foreach ($copy->getElementsByTagName("style") as $e) {
$e->parentNode->removeChild($e); $e->parentNode->removeChild($e);
} }
// also drop templates; their contents would not normally be included in textContent
foreach ($copy->getElementsByTagName("template") as $e) {
$e->parentNode->removeChild($e);
}
# replacing any nested <img> elements with their alt attribute, if # replacing any nested <img> elements with their alt attribute, if
# present; otherwise their src attribute, if present, adding a # present; otherwise their src attribute, if present, adding a
# space at the beginning and end, resolving the URL if it’s # space at the beginning and end, resolving the URL if it’s
@ -1016,7 +1020,7 @@ class Parser {
* @param bool $considerChildren Whether or not child nodes are valid next nodes * @param bool $considerChildren Whether or not child nodes are valid next nodes
*/ */
protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement { protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement {
if ($considerChildren && $node->localName !== "template" && $node->hasChildNodes()) { if ($considerChildren && $node->hasChildNodes()) {
$node = $node->firstChild; $node = $node->firstChild;
$next = $node; $next = $node;
} elseif ($node->isSameNode($root)) { } elseif ($node->isSameNode($root)) {
@ -1024,7 +1028,7 @@ class Parser {
} else { } else {
$next = $node->nextSibling; $next = $node->nextSibling;
} }
while ($next && !$next instanceof \DOMElement) { while ($next && (!$next instanceof \DOMElement || $next->localName === "template")) {
$next = $next->nextSibling; $next = $next->nextSibling;
} }
while (!$next) { while (!$next) {
@ -1033,7 +1037,7 @@ class Parser {
return null; return null;
} }
$next = $node->nextSibling; $next = $node->nextSibling;
while ($next and !$next instanceof \DOMElement) { while ($next && (!$next instanceof \DOMElement || $next->localName === "template")) {
$next = $next->nextSibling; $next = $next->nextSibling;
} }
} }

7
tests/cases/StandardTest.php

@ -45,9 +45,6 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
"microformats-v2/h-event/time", "microformats-v2/h-event/time",
"microformats-v2/h-review-aggregate/hevent", "microformats-v2/h-review-aggregate/hevent",
"microformats-v2/h-review-aggregate/simpleproperties", "microformats-v2/h-review-aggregate/simpleproperties",
"microformats-v2/mixed/ignoretemplate",
"microformats-v2/rel/duplicate-rels",
"microformats-v2/rel/varying-text-duplicate-rels",
]; ];
/** @dataProvider provideStandardTests */ /** @dataProvider provideStandardTests */
@ -64,6 +61,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
if (preg_match('#^https?://[^/]+$#', $v)) { if (preg_match('#^https?://[^/]+$#', $v)) {
$v .= "/"; $v .= "/";
} }
// at least one test has spurious whitespace
$v = trim($v);
}); });
// parse input // parse input
$dom = new DOMParser; $dom = new DOMParser;
@ -81,7 +80,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) { foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(self::BASE)), '/\.json$/') as $path) {
$path = str_replace(self::BASE, "", $path->getPathname()); $path = str_replace(self::BASE, "", $path->getPathname());
$path = preg_replace('/\.json$/', '', $path); $path = preg_replace('/\.json$/', '', $path);
yield [$path]; yield $path => [$path];
} }
} }

Loading…
Cancel
Save