Implement textContent parsing properly

This commit is contained in:
J. King 2023-06-20 21:20:26 -04:00
parent 650d1c7e98
commit 65bbf61579
2 changed files with 80 additions and 41 deletions

View file

@ -313,7 +313,7 @@ class Parser {
}
// sort and clean rel microformats
foreach ($out['rels'] as $k => $v) {
$out['rels'][$k] = array_unique($v);
$out['rels'][$k] = array_values(array_unique($v));
}
foreach ($out['rel-urls'] as $k => $v) {
$out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
@ -1040,7 +1040,19 @@ class Parser {
if ($this->options['basicTrim']) {
return $this->getCleanTextBasic($node, $prefix);
} else {
return $this->getCleanTextThorough($node, $prefix);
// https://microformats.org/wiki/textcontent-parsing
# Plain text of element
# To get the plain text for an Element input:
# Let output be the result of running [Element to string] on input
$output = $this->getCleanTextThorough($node, $prefix);
# Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
$output = preg_replace('/^\s+|\s+$/m', "", $output);
# Strip leading and trailing ASCII whitespace from output
$output = trim($output);
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
$output = preg_replace('/ {2,}/', " ", $output);
# Return output
return $output;
}
}
@ -1060,7 +1072,7 @@ class Parser {
// NOTE: This ought to include FORM FEED characters
$value = strtr($value, "\t\n\r\f", " ");
# Append value to output
$output .= $value;
$output[] = $value;
} elseif ($n instanceof \DOMElement) {
# If child is an Element, switch on its tagName:
// NOTE: we switch on localName instead to avoid silly case folding
@ -1068,37 +1080,59 @@ class Parser {
case "script":
case "style":
case "template":
# SCRIPT
# STYLE
// TEMPLATE as well
# SCRIPT
# STYLE
// TEMPLATE as well
# Continue
continue 2;
# IMG
# If child has an alt attribute, then:
# Let value be the contents of the alt attribute
# Strip leading and trailing ASCII whitespace from value
# Else if child has a src attribute, then:
# Let value be the contents of the src attribute
# Strip leading and trailing ASCII whitespace from value
# Set value to the absolute URL created by resolving value following the containing documents languages rules
# Else continue
# Append and prepend a single U+0020 SPACE code point to value
# Append value to output
# BR
# Append a string containing a single U+000A LF code point to output
# P
# Let value be the result of running this algorithm on child
# Prepend a single U+000A LF code point to value
# Append value to output
# Any other value
# Let value be the result of running this algorithm on child
# Append value to output
case "img":
# IMG
if ($n->hasAttribute("alt")) {
# If child has an alt attribute, then:
# Let value be the contents of the alt attribute
# Strip leading and trailing ASCII whitespace from value
$value = trim($n->getAttribute("alt"));
} elseif ($n->hasAttribute("src")) {
# Else if child has a src attribute, then:
# Let value be the contents of the src attribute
# Strip leading and trailing ASCII whitespace from value
$value = trim($n->getAttribute("src"));
# Set value to the absolute URL created by resolving value following the containing document's language's rules
$value = $this->normalizeUrl($value);
} else {
# Else continue
continue 2;
}
# Append and prepend a single U+0020 SPACE code point to value
# Append value to output
$output[] = " ".$value." ";
break;
case "br":
# BR
# Append a string containing a single U+000A LF code point to output
$output[] = "\n";
break;
case "p":
# P
# Let value be the result of running this algorithm on child
# Prepend a single U+000A LF code point to value
# Append value to output
$output[] = "\n".$this->getCleanTextThorough($n, $prefix);
break;
default:
# Any other value
# Let value be the result of running this algorithm on child
# Append value to output
$output[] = $this->getCleanTextThorough($n, $prefix);
break;
}
# Else continue
} else {
# Else continue
continue;
}
# Return the concatenation of output
}
# Return the concatenation of output
return implode("", $output);
}
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {

View file

@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
];
/** @dataProvider provideStandardTests */
public function testStandardTests(string $name, string $path): void {
public function testStandardTests(string $name, string $path, $options): void {
if (isset(self::SUPPRESSED[$name])) {
$this->markTestIncomplete(self::SUPPRESSED[$name]);
}
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
// fix up expectation where necessary
array_walk_recursive($exp, function(&$v) {
// URLs differ trivially from output of our normalization library
if (preg_match('#^https?://[^/]+$#', $v)) {
$v .= "/";
}
$v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
});
// URLs also need fixing as keys in rel-urls
foreach ($exp['rel-urls'] as $k => $v) {
$fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
$exp['rel-urls'][$fixed] = $v;
if ($fixed !== $k) {
unset($exp['rel-urls'][$k]);
}
}
// perform some further monkey-patching on specific tests
$exp = $this->fixTests($exp, $name);
// parse input
$dom = new DOMParser;
$parser = new Parser;
$doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
$act = $parser->parseElement($doc->documentElement, "http://example.com");
$act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
// sort both arrays
$this->ksort($exp);
$this->ksort($act);
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
}
public function provideStandardTests(): \Generator {
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
// the standard tests
yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
// tests from php-mf2
yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
}
protected function provideTestList(): \Generator {
$tests = [
\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
\MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
];
protected function provideTestList(array $tests, ?array $options = null): \Generator {
foreach ($tests as $base) {
$base = strtr($base, "\\", "/");
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$path = preg_replace('/\.json$/', '', $path);
$name = strtr($path, "\\", "/");
$name = str_replace(strtr($base, "\\", "/"), "", $name);
yield $name => [$name, $path];
yield $name => [$name, $path, $options];
}
}
}