Implement textContent parsing properly
This commit is contained in:
parent
650d1c7e98
commit
65bbf61579
2 changed files with 80 additions and 41 deletions
|
@ -313,7 +313,7 @@ class Parser {
|
||||||
}
|
}
|
||||||
// sort and clean rel microformats
|
// sort and clean rel microformats
|
||||||
foreach ($out['rels'] as $k => $v) {
|
foreach ($out['rels'] as $k => $v) {
|
||||||
$out['rels'][$k] = array_unique($v);
|
$out['rels'][$k] = array_values(array_unique($v));
|
||||||
}
|
}
|
||||||
foreach ($out['rel-urls'] as $k => $v) {
|
foreach ($out['rel-urls'] as $k => $v) {
|
||||||
$out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
|
$out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
|
||||||
|
@ -1040,7 +1040,19 @@ class Parser {
|
||||||
if ($this->options['basicTrim']) {
|
if ($this->options['basicTrim']) {
|
||||||
return $this->getCleanTextBasic($node, $prefix);
|
return $this->getCleanTextBasic($node, $prefix);
|
||||||
} else {
|
} else {
|
||||||
return $this->getCleanTextThorough($node, $prefix);
|
// https://microformats.org/wiki/textcontent-parsing
|
||||||
|
# Plain text of element
|
||||||
|
# To get the plain text for an Element input:
|
||||||
|
# Let output be the result of running [Element to string] on input
|
||||||
|
$output = $this->getCleanTextThorough($node, $prefix);
|
||||||
|
# Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
|
||||||
|
$output = preg_replace('/^\s+|\s+$/m', "", $output);
|
||||||
|
# Strip leading and trailing ASCII whitespace from output
|
||||||
|
$output = trim($output);
|
||||||
|
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
|
||||||
|
$output = preg_replace('/ {2,}/', " ", $output);
|
||||||
|
# Return output
|
||||||
|
return $output;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1060,7 +1072,7 @@ class Parser {
|
||||||
// NOTE: This ought to include FORM FEED characters
|
// NOTE: This ought to include FORM FEED characters
|
||||||
$value = strtr($value, "\t\n\r\f", " ");
|
$value = strtr($value, "\t\n\r\f", " ");
|
||||||
# Append value to output
|
# Append value to output
|
||||||
$output .= $value;
|
$output[] = $value;
|
||||||
} elseif ($n instanceof \DOMElement) {
|
} elseif ($n instanceof \DOMElement) {
|
||||||
# If child is an Element, switch on its tagName:
|
# If child is an Element, switch on its tagName:
|
||||||
// NOTE: we switch on localName instead to avoid silly case folding
|
// NOTE: we switch on localName instead to avoid silly case folding
|
||||||
|
@ -1068,37 +1080,59 @@ class Parser {
|
||||||
case "script":
|
case "script":
|
||||||
case "style":
|
case "style":
|
||||||
case "template":
|
case "template":
|
||||||
# SCRIPT
|
# SCRIPT
|
||||||
# STYLE
|
# STYLE
|
||||||
// TEMPLATE as well
|
// TEMPLATE as well
|
||||||
# Continue
|
# Continue
|
||||||
continue 2;
|
continue 2;
|
||||||
|
case "img":
|
||||||
# IMG
|
# IMG
|
||||||
# If child has an alt attribute, then:
|
if ($n->hasAttribute("alt")) {
|
||||||
# Let value be the contents of the alt attribute
|
# If child has an alt attribute, then:
|
||||||
# Strip leading and trailing ASCII whitespace from value
|
# Let value be the contents of the alt attribute
|
||||||
# Else if child has a src attribute, then:
|
# Strip leading and trailing ASCII whitespace from value
|
||||||
# Let value be the contents of the src attribute
|
$value = trim($n->getAttribute("alt"));
|
||||||
# Strip leading and trailing ASCII whitespace from value
|
} elseif ($n->hasAttribute("src")) {
|
||||||
# Set value to the absolute URL created by resolving value following the containing document’s language’s rules
|
# Else if child has a src attribute, then:
|
||||||
# Else continue
|
# Let value be the contents of the src attribute
|
||||||
# Append and prepend a single U+0020 SPACE code point to value
|
# Strip leading and trailing ASCII whitespace from value
|
||||||
# Append value to output
|
$value = trim($n->getAttribute("src"));
|
||||||
# BR
|
# Set value to the absolute URL created by resolving value following the containing document’s language’s rules
|
||||||
# Append a string containing a single U+000A LF code point to output
|
$value = $this->normalizeUrl($value);
|
||||||
# P
|
} else {
|
||||||
# Let value be the result of running this algorithm on child
|
# Else continue
|
||||||
# Prepend a single U+000A LF code point to value
|
continue 2;
|
||||||
# Append value to output
|
}
|
||||||
# Any other value
|
# Append and prepend a single U+0020 SPACE code point to value
|
||||||
# Let value be the result of running this algorithm on child
|
# Append value to output
|
||||||
# Append value to output
|
$output[] = " ".$value." ";
|
||||||
|
break;
|
||||||
|
case "br":
|
||||||
|
# BR
|
||||||
|
# Append a string containing a single U+000A LF code point to output
|
||||||
|
$output[] = "\n";
|
||||||
|
break;
|
||||||
|
case "p":
|
||||||
|
# P
|
||||||
|
# Let value be the result of running this algorithm on child
|
||||||
|
# Prepend a single U+000A LF code point to value
|
||||||
|
# Append value to output
|
||||||
|
$output[] = "\n".$this->getCleanTextThorough($n, $prefix);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
# Any other value
|
||||||
|
# Let value be the result of running this algorithm on child
|
||||||
|
# Append value to output
|
||||||
|
$output[] = $this->getCleanTextThorough($n, $prefix);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
# Else continue
|
} else {
|
||||||
|
# Else continue
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
# Return the concatenation of output
|
|
||||||
}
|
}
|
||||||
|
# Return the concatenation of output
|
||||||
|
return implode("", $output);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
|
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
|
||||||
|
|
|
@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
||||||
];
|
];
|
||||||
|
|
||||||
/** @dataProvider provideStandardTests */
|
/** @dataProvider provideStandardTests */
|
||||||
public function testStandardTests(string $name, string $path): void {
|
public function testStandardTests(string $name, string $path, $options): void {
|
||||||
if (isset(self::SUPPRESSED[$name])) {
|
if (isset(self::SUPPRESSED[$name])) {
|
||||||
$this->markTestIncomplete(self::SUPPRESSED[$name]);
|
$this->markTestIncomplete(self::SUPPRESSED[$name]);
|
||||||
}
|
}
|
||||||
|
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
||||||
// fix up expectation where necessary
|
// fix up expectation where necessary
|
||||||
array_walk_recursive($exp, function(&$v) {
|
array_walk_recursive($exp, function(&$v) {
|
||||||
// URLs differ trivially from output of our normalization library
|
// URLs differ trivially from output of our normalization library
|
||||||
if (preg_match('#^https?://[^/]+$#', $v)) {
|
$v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
|
||||||
$v .= "/";
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
// URLs also need fixing as keys in rel-urls
|
||||||
|
foreach ($exp['rel-urls'] as $k => $v) {
|
||||||
|
$fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
|
||||||
|
$exp['rel-urls'][$fixed] = $v;
|
||||||
|
if ($fixed !== $k) {
|
||||||
|
unset($exp['rel-urls'][$k]);
|
||||||
|
}
|
||||||
|
}
|
||||||
// perform some further monkey-patching on specific tests
|
// perform some further monkey-patching on specific tests
|
||||||
$exp = $this->fixTests($exp, $name);
|
$exp = $this->fixTests($exp, $name);
|
||||||
// parse input
|
// parse input
|
||||||
$dom = new DOMParser;
|
$dom = new DOMParser;
|
||||||
$parser = new Parser;
|
$parser = new Parser;
|
||||||
$doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
|
$doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
|
||||||
$act = $parser->parseElement($doc->documentElement, "http://example.com");
|
$act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
|
||||||
// sort both arrays
|
// sort both arrays
|
||||||
$this->ksort($exp);
|
$this->ksort($exp);
|
||||||
$this->ksort($act);
|
$this->ksort($act);
|
||||||
|
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public function provideStandardTests(): \Generator {
|
public function provideStandardTests(): \Generator {
|
||||||
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
|
// the standard tests
|
||||||
|
yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
|
||||||
|
// tests from php-mf2
|
||||||
|
yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function provideTestList(): \Generator {
|
protected function provideTestList(array $tests, ?array $options = null): \Generator {
|
||||||
$tests = [
|
|
||||||
\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
|
|
||||||
\MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
|
|
||||||
];
|
|
||||||
foreach ($tests as $base) {
|
foreach ($tests as $base) {
|
||||||
$base = strtr($base, "\\", "/");
|
$base = strtr($base, "\\", "/");
|
||||||
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
|
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
|
||||||
|
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
||||||
$path = preg_replace('/\.json$/', '', $path);
|
$path = preg_replace('/\.json$/', '', $path);
|
||||||
$name = strtr($path, "\\", "/");
|
$name = strtr($path, "\\", "/");
|
||||||
$name = str_replace(strtr($base, "\\", "/"), "", $name);
|
$name = str_replace(strtr($base, "\\", "/"), "", $name);
|
||||||
yield $name => [$name, $path];
|
yield $name => [$name, $path, $options];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue