Implement textContent parsing properly

This commit is contained in:
J. King 2023-06-20 21:20:26 -04:00
parent 650d1c7e98
commit 65bbf61579
2 changed files with 80 additions and 41 deletions

View file

@ -313,7 +313,7 @@ class Parser {
} }
// sort and clean rel microformats // sort and clean rel microformats
foreach ($out['rels'] as $k => $v) { foreach ($out['rels'] as $k => $v) {
$out['rels'][$k] = array_unique($v); $out['rels'][$k] = array_values(array_unique($v));
} }
foreach ($out['rel-urls'] as $k => $v) { foreach ($out['rel-urls'] as $k => $v) {
$out['rel-urls'][$k]['rels'] = array_unique($v['rels']); $out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
@ -1040,7 +1040,19 @@ class Parser {
if ($this->options['basicTrim']) { if ($this->options['basicTrim']) {
return $this->getCleanTextBasic($node, $prefix); return $this->getCleanTextBasic($node, $prefix);
} else { } else {
return $this->getCleanTextThorough($node, $prefix); // https://microformats.org/wiki/textcontent-parsing
# Plain text of element
# To get the plain text for an Element input:
# Let output be the result of running [Element to string] on input
$output = $this->getCleanTextThorough($node, $prefix);
# Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
$output = preg_replace('/^\s+|\s+$/m', "", $output);
# Strip leading and trailing ASCII whitespace from output
$output = trim($output);
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
$output = preg_replace('/ {2,}/', " ", $output);
# Return output
return $output;
} }
} }
@ -1060,7 +1072,7 @@ class Parser {
// NOTE: This ought to include FORM FEED characters // NOTE: This ought to include FORM FEED characters
$value = strtr($value, "\t\n\r\f", " "); $value = strtr($value, "\t\n\r\f", " ");
# Append value to output # Append value to output
$output .= $value; $output[] = $value;
} elseif ($n instanceof \DOMElement) { } elseif ($n instanceof \DOMElement) {
# If child is an Element, switch on its tagName: # If child is an Element, switch on its tagName:
// NOTE: we switch on localName instead to avoid silly case folding // NOTE: we switch on localName instead to avoid silly case folding
@ -1068,37 +1080,59 @@ class Parser {
case "script": case "script":
case "style": case "style":
case "template": case "template":
# SCRIPT # SCRIPT
# STYLE # STYLE
// TEMPLATE as well // TEMPLATE as well
# Continue # Continue
continue 2; continue 2;
case "img":
# IMG # IMG
# If child has an alt attribute, then: if ($n->hasAttribute("alt")) {
# Let value be the contents of the alt attribute # If child has an alt attribute, then:
# Strip leading and trailing ASCII whitespace from value # Let value be the contents of the alt attribute
# Else if child has a src attribute, then: # Strip leading and trailing ASCII whitespace from value
# Let value be the contents of the src attribute $value = trim($n->getAttribute("alt"));
# Strip leading and trailing ASCII whitespace from value } elseif ($n->hasAttribute("src")) {
# Set value to the absolute URL created by resolving value following the containing document's language's rules # Else if child has a src attribute, then:
# Else continue # Let value be the contents of the src attribute
# Append and prepend a single U+0020 SPACE code point to value # Strip leading and trailing ASCII whitespace from value
# Append value to output $value = trim($n->getAttribute("src"));
# BR # Set value to the absolute URL created by resolving value following the containing document's language's rules
# Append a string containing a single U+000A LF code point to output $value = $this->normalizeUrl($value);
# P } else {
# Let value be the result of running this algorithm on child # Else continue
# Prepend a single U+000A LF code point to value continue 2;
# Append value to output }
# Any other value # Append and prepend a single U+0020 SPACE code point to value
# Let value be the result of running this algorithm on child # Append value to output
# Append value to output $output[] = " ".$value." ";
break;
case "br":
# BR
# Append a string containing a single U+000A LF code point to output
$output[] = "\n";
break;
case "p":
# P
# Let value be the result of running this algorithm on child
# Prepend a single U+000A LF code point to value
# Append value to output
$output[] = "\n".$this->getCleanTextThorough($n, $prefix);
break;
default:
# Any other value
# Let value be the result of running this algorithm on child
# Append value to output
$output[] = $this->getCleanTextThorough($n, $prefix);
break;
} }
# Else continue } else {
# Else continue
continue;
} }
# Return the concatenation of output
} }
# Return the concatenation of output
return implode("", $output);
} }
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string { protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {

View file

@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
]; ];
/** @dataProvider provideStandardTests */ /** @dataProvider provideStandardTests */
public function testStandardTests(string $name, string $path): void { public function testStandardTests(string $name, string $path, $options): void {
if (isset(self::SUPPRESSED[$name])) { if (isset(self::SUPPRESSED[$name])) {
$this->markTestIncomplete(self::SUPPRESSED[$name]); $this->markTestIncomplete(self::SUPPRESSED[$name]);
} }
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
// fix up expectation where necessary // fix up expectation where necessary
array_walk_recursive($exp, function(&$v) { array_walk_recursive($exp, function(&$v) {
// URLs differ trivially from output of our normalization library // URLs differ trivially from output of our normalization library
if (preg_match('#^https?://[^/]+$#', $v)) { $v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
$v .= "/";
}
}); });
// URLs also need fixing as keys in rel-urls
foreach ($exp['rel-urls'] as $k => $v) {
$fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
$exp['rel-urls'][$fixed] = $v;
if ($fixed !== $k) {
unset($exp['rel-urls'][$k]);
}
}
// perform some further monkey-patching on specific tests // perform some further monkey-patching on specific tests
$exp = $this->fixTests($exp, $name); $exp = $this->fixTests($exp, $name);
// parse input // parse input
$dom = new DOMParser; $dom = new DOMParser;
$parser = new Parser; $parser = new Parser;
$doc = $dom->parseFromString($html, "text/html; charset=UTF-8"); $doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
$act = $parser->parseElement($doc->documentElement, "http://example.com"); $act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
// sort both arrays // sort both arrays
$this->ksort($exp); $this->ksort($exp);
$this->ksort($act); $this->ksort($act);
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
} }
public function provideStandardTests(): \Generator { public function provideStandardTests(): \Generator {
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"); // the standard tests
yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
// tests from php-mf2
yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
} }
protected function provideTestList(): \Generator { protected function provideTestList(array $tests, ?array $options = null): \Generator {
$tests = [
\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
\MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
];
foreach ($tests as $base) { foreach ($tests as $base) {
$base = strtr($base, "\\", "/"); $base = strtr($base, "\\", "/");
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) { foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$path = preg_replace('/\.json$/', '', $path); $path = preg_replace('/\.json$/', '', $path);
$name = strtr($path, "\\", "/"); $name = strtr($path, "\\", "/");
$name = str_replace(strtr($base, "\\", "/"), "", $name); $name = str_replace(strtr($base, "\\", "/"), "", $name);
yield $name => [$name, $path]; yield $name => [$name, $path, $options];
} }
} }
} }