Implement textContent parsing properly
This commit is contained in:
parent
650d1c7e98
commit
65bbf61579
2 changed files with 80 additions and 41 deletions
|
@ -313,7 +313,7 @@ class Parser {
|
|||
}
|
||||
// sort and clean rel microformats
|
||||
foreach ($out['rels'] as $k => $v) {
|
||||
$out['rels'][$k] = array_unique($v);
|
||||
$out['rels'][$k] = array_values(array_unique($v));
|
||||
}
|
||||
foreach ($out['rel-urls'] as $k => $v) {
|
||||
$out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
|
||||
|
@ -1040,7 +1040,19 @@ class Parser {
|
|||
if ($this->options['basicTrim']) {
|
||||
return $this->getCleanTextBasic($node, $prefix);
|
||||
} else {
|
||||
return $this->getCleanTextThorough($node, $prefix);
|
||||
// https://microformats.org/wiki/textcontent-parsing
|
||||
# Plain text of element
|
||||
# To get the plain text for an Element input:
|
||||
# Let output be the result of running [Element to string] on input
|
||||
$output = $this->getCleanTextThorough($node, $prefix);
|
||||
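# Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
// NOTE(review): the pattern below uses \s with the /m modifier, which is broader than this step: \s also matches LF itself, so runs of consecutive line feeds (e.g. "a\n\nb") are removed entirely rather than only the adjacent spaces; a pattern such as '/ *\n */' replaced with "\n" would match the step as written -- confirm intent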
|
||||
$output = preg_replace('/^\s+|\s+$/m', "", $output);
|
||||
# Strip leading and trailing ASCII whitespace from output
|
||||
$output = trim($output);
|
||||
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
|
||||
$output = preg_replace('/ {2,}/', " ", $output);
|
||||
# Return output
|
||||
return $output;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1060,7 +1072,7 @@ class Parser {
|
|||
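// NOTE: This ought to include FORM FEED characters
// NOTE(review): strtr() ignores the extra characters of the longer of its two character lists, so with a one-character replacement string only TAB is translated by the call below; LF, CR and FF pass through unchanged unless the replacement is four spaces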
|
||||
$value = strtr($value, "\t\n\r\f", " ");
|
||||
# Append value to output
|
||||
$output .= $value;
|
||||
$output[] = $value;
|
||||
} elseif ($n instanceof \DOMElement) {
|
||||
# If child is an Element, switch on its tagName:
|
||||
// NOTE: we switch on localName instead to avoid silly case folding
|
||||
|
@ -1068,37 +1080,59 @@ class Parser {
|
|||
case "script":
|
||||
case "style":
|
||||
case "template":
|
||||
# SCRIPT
|
||||
# STYLE
|
||||
// TEMPLATE as well
|
||||
# SCRIPT
|
||||
# STYLE
|
||||
// TEMPLATE as well
|
||||
# Continue
|
||||
continue 2;
|
||||
|
||||
# IMG
|
||||
# If child has an alt attribute, then:
|
||||
# Let value be the contents of the alt attribute
|
||||
# Strip leading and trailing ASCII whitespace from value
|
||||
# Else if child has a src attribute, then:
|
||||
# Let value be the contents of the src attribute
|
||||
# Strip leading and trailing ASCII whitespace from value
|
||||
# Set value to the absolute URL created by resolving value following the containing document’s language’s rules
|
||||
# Else continue
|
||||
# Append and prepend a single U+0020 SPACE code point to value
|
||||
# Append value to output
|
||||
# BR
|
||||
# Append a string containing a single U+000A LF code point to output
|
||||
# P
|
||||
# Let value be the result of running this algorithm on child
|
||||
# Prepend a single U+000A LF code point to value
|
||||
# Append value to output
|
||||
# Any other value
|
||||
# Let value be the result of running this algorithm on child
|
||||
# Append value to output
|
||||
case "img":
|
||||
# IMG
|
||||
if ($n->hasAttribute("alt")) {
|
||||
# If child has an alt attribute, then:
|
||||
# Let value be the contents of the alt attribute
|
||||
# Strip leading and trailing ASCII whitespace from value
|
||||
$value = trim($n->getAttribute("alt"));
|
||||
} elseif ($n->hasAttribute("src")) {
|
||||
# Else if child has a src attribute, then:
|
||||
# Let value be the contents of the src attribute
|
||||
# Strip leading and trailing ASCII whitespace from value
|
||||
$value = trim($n->getAttribute("src"));
|
||||
# Set value to the absolute URL created by resolving value following the containing document’s language’s rules
|
||||
$value = $this->normalizeUrl($value);
|
||||
} else {
|
||||
# Else continue
|
||||
continue 2;
|
||||
}
|
||||
# Append and prepend a single U+0020 SPACE code point to value
|
||||
# Append value to output
|
||||
$output[] = " ".$value." ";
|
||||
break;
|
||||
case "br":
|
||||
# BR
|
||||
# Append a string containing a single U+000A LF code point to output
|
||||
$output[] = "\n";
|
||||
break;
|
||||
case "p":
|
||||
# P
|
||||
# Let value be the result of running this algorithm on child
|
||||
# Prepend a single U+000A LF code point to value
|
||||
# Append value to output
|
||||
$output[] = "\n".$this->getCleanTextThorough($n, $prefix);
|
||||
break;
|
||||
default:
|
||||
# Any other value
|
||||
# Let value be the result of running this algorithm on child
|
||||
# Append value to output
|
||||
$output[] = $this->getCleanTextThorough($n, $prefix);
|
||||
break;
|
||||
}
|
||||
# Else continue
|
||||
} else {
|
||||
# Else continue
|
||||
continue;
|
||||
}
|
||||
# Return the concatenation of output
|
||||
}
|
||||
# Return the concatenation of output
|
||||
return implode("", $output);
|
||||
}
|
||||
|
||||
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
|
||||
|
|
|
@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
];
|
||||
|
||||
/** @dataProvider provideStandardTests */
|
||||
public function testStandardTests(string $name, string $path): void {
|
||||
public function testStandardTests(string $name, string $path, $options): void {
|
||||
if (isset(self::SUPPRESSED[$name])) {
|
||||
$this->markTestIncomplete(self::SUPPRESSED[$name]);
|
||||
}
|
||||
|
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
// fix up expectation where necessary
|
||||
array_walk_recursive($exp, function(&$v) {
|
||||
// URLs differ trivially from output of our normalization library
|
||||
if (preg_match('#^https?://[^/]+$#', $v)) {
|
||||
$v .= "/";
|
||||
}
|
||||
$v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
|
||||
});
|
||||
// URLs also need fixing as keys in rel-urls
|
||||
foreach ($exp['rel-urls'] as $k => $v) {
|
||||
$fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
|
||||
$exp['rel-urls'][$fixed] = $v;
|
||||
if ($fixed !== $k) {
|
||||
unset($exp['rel-urls'][$k]);
|
||||
}
|
||||
}
|
||||
// perform some further monkey-patching on specific tests
|
||||
$exp = $this->fixTests($exp, $name);
|
||||
// parse input
|
||||
$dom = new DOMParser;
|
||||
$parser = new Parser;
|
||||
$doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
|
||||
$act = $parser->parseElement($doc->documentElement, "http://example.com");
|
||||
$act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
|
||||
// sort both arrays
|
||||
$this->ksort($exp);
|
||||
$this->ksort($act);
|
||||
|
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
}
|
||||
|
||||
public function provideStandardTests(): \Generator {
|
||||
return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
|
||||
// the standard tests
|
||||
yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
|
||||
// tests from php-mf2
|
||||
yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
|
||||
}
|
||||
|
||||
protected function provideTestList(): \Generator {
|
||||
$tests = [
|
||||
\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
|
||||
\MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
|
||||
];
|
||||
protected function provideTestList(array $tests, ?array $options = null): \Generator {
|
||||
foreach ($tests as $base) {
|
||||
$base = strtr($base, "\\", "/");
|
||||
foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
|
||||
|
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
|
|||
$path = preg_replace('/\.json$/', '', $path);
|
||||
$name = strtr($path, "\\", "/");
|
||||
$name = str_replace(strtr($base, "\\", "/"), "", $name);
|
||||
yield $name => [$name, $path];
|
||||
yield $name => [$name, $path, $options];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue