diff --git a/lib/Parser.php b/lib/Parser.php
index f16c795..6d52db0 100644
--- a/lib/Parser.php
+++ b/lib/Parser.php
@@ -313,7 +313,7 @@ class Parser {
         }
         // sort and clean rel microformats
         foreach ($out['rels'] as $k => $v) {
-            $out['rels'][$k] = array_unique($v);
+            $out['rels'][$k] = array_values(array_unique($v));
         }
         foreach ($out['rel-urls'] as $k => $v) {
             $out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
@@ -1040,7 +1040,19 @@ class Parser {
         if ($this->options['basicTrim']) {
             return $this->getCleanTextBasic($node, $prefix);
         } else {
-            return $this->getCleanTextThorough($node, $prefix);
+            // https://microformats.org/wiki/textcontent-parsing
+            # Plain text of element
+            # To get the plain text for an Element input:
+            # Let output be the result of running [Element to string] on input
+            $output = $this->getCleanTextThorough($node, $prefix);
+            # Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
+            $output = preg_replace('/ *\n */', "\n", $output); // only spaces adjacent to an LF are dropped; the LF itself (and any run of LFs) is kept
+            # Strip leading and trailing ASCII whitespace from output
+            $output = trim($output);
+            # Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
+            $output = preg_replace('/ {2,}/', " ", $output);
+            # Return output
+            return $output;
         }
     }
 
@@ -1060,7 +1072,7 @@ class Parser {
                 // NOTE: This ought to include FORM FEED characters
-                $value = strtr($value, "\t\n\r\f", " ");
+                $value = str_replace(["\t", "\n", "\r", "\f"], " ", $value); // strtr() with a one-character replacement would translate only "\t"
                 # Append value to output
-                $output .= $value;
+                $output[] = $value;
             } elseif ($n instanceof \DOMElement) {
                 # If child is an Element, switch on its tagName:
                 // NOTE: we switch on localName instead to avoid silly case folding
@@ -1068,37 +1080,59 @@ class Parser {
                     case "script":
                     case "style":
                     case "template":
-                    # SCRIPT
-                    # STYLE
-                    // TEMPLATE as well
+                        # SCRIPT
+                        # STYLE
+                        // TEMPLATE as well
                         # Continue
                         continue 2;
-
-                    # IMG
-                        # If child has an alt attribute, then:
-                        # Let value be the contents of the alt attribute
-                        # Strip leading and trailing ASCII whitespace from value
-                        # Else if child has a src attribute, then:
-                        # Let value be the contents of the src attribute
-                        # Strip leading and trailing ASCII whitespace from value
-                        # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
-                        # Else continue
-                        # Append and prepend a single U+0020 SPACE code point to value
-                        # Append value to output
-                    # BR
-                        # Append a string containing a single U+000A LF code point to output
-                    # P
-                        # Let value be the result of running this algorithm on child
-                        # Prepend a single U+000A LF code point to value
-                        # Append value to output
-                    # Any other value
-                        # Let value be the result of running this algorithm on child
-                        # Append value to output
+                    case "img":
+                        # IMG
+                        if ($n->hasAttribute("alt")) {
+                            # If child has an alt attribute, then:
+                            # Let value be the contents of the alt attribute
+                            # Strip leading and trailing ASCII whitespace from value
+                            $value = trim($n->getAttribute("alt"));
+                        } elseif ($n->hasAttribute("src")) {
+                            # Else if child has a src attribute, then:
+                            # Let value be the contents of the src attribute
+                            # Strip leading and trailing ASCII whitespace from value
+                            $value = trim($n->getAttribute("src"));
+                            # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
+                            $value = $this->normalizeUrl($value);
+                        } else {
+                            # Else continue
+                            continue 2;
+                        }
+                        # Append and prepend a single U+0020 SPACE code point to value
+                        # Append value to output
+                        $output[] = " ".$value." ";
+                        break;
+                    case "br":
+                        # BR
+                        # Append a string containing a single U+000A LF code point to output
+                        $output[] = "\n";
+                        break;
+                    case "p":
+                        # P
+                        # Let value be the result of running this algorithm on child
+                        # Prepend a single U+000A LF code point to value
+                        # Append value to output
+                        $output[] = "\n".$this->getCleanTextThorough($n, $prefix);
+                        break;
+                    default:
+                        # Any other value
+                        # Let value be the result of running this algorithm on child
+                        # Append value to output
+                        $output[] = $this->getCleanTextThorough($n, $prefix);
+                        break;
                 }
-            # Else continue
+            } else {
+                # Else continue
+                continue;
             }
-        # Return the concatenation of output
         }
+        # Return the concatenation of output
+        return implode("", $output);
     }
 
     protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
diff --git a/tests/cases/StandardTest.php b/tests/cases/StandardTest.php
index f5f55ee..7495b29 100644
--- a/tests/cases/StandardTest.php
+++ b/tests/cases/StandardTest.php
@@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
     ];
 
     /** @dataProvider provideStandardTests */
-    public function testStandardTests(string $name, string $path): void {
+    public function testStandardTests(string $name, string $path, ?array $options = null): void {
         if (isset(self::SUPPRESSED[$name])) {
             $this->markTestIncomplete(self::SUPPRESSED[$name]);
         }
@@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
         // fix up expectation where necessary
         array_walk_recursive($exp, function(&$v) {
             // URLs differ trivially from output of our normalization library
-            if (preg_match('#^https?://[^/]+$#', $v)) {
-                $v .= "/";
-            }
+            $v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
         });
+        // URLs also need fixing as keys in rel-urls
+        foreach ($exp['rel-urls'] as $k => $v) {
+            $fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
+            $exp['rel-urls'][$fixed] = $v;
+            if ($fixed !== $k) {
+                unset($exp['rel-urls'][$k]);
+            }
+        }
         // perform some further monkey-patching on specific tests
         $exp = $this->fixTests($exp, $name);
         // parse input
         $dom = new DOMParser;
         $parser = new Parser;
         $doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
-        $act = $parser->parseElement($doc->documentElement, "http://example.com");
+        $act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
         // sort both arrays
         $this->ksort($exp);
         $this->ksort($act);
@@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
     }
 
     public function provideStandardTests(): \Generator {
-        return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
+        // the standard tests
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
+        // tests from php-mf2
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
     }
 
-    protected function provideTestList(): \Generator {
-        $tests = [
-            \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
-            \MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
-        ];
+    protected function provideTestList(array $tests, ?array $options = null): \Generator {
         foreach ($tests as $base) {
             $base = strtr($base, "\\", "/");
             foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
@@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
                 $path = preg_replace('/\.json$/', '', $path);
                 $name = strtr($path, "\\", "/");
                 $name = str_replace(strtr($base, "\\", "/"), "", $name);
-                yield $name => [$name, $path];
+                yield $name => [$name, $path, $options];
             }
         }
     }