Implement textContent parsing properly

11 months ago · 65bbf61579
2 changed files with 80 additions and 41 deletions
--- a/lib/Parser.php
+++ b/lib/Parser.php
@ -313,7 +313,7 @@ class Parser {
        }
        // sort and clean rel microformats
        foreach ($out['rels'] as $k => $v) {
-            $out['rels'][$k] = array_unique($v);
+            $out['rels'][$k] = array_values(array_unique($v));
        }
        foreach ($out['rel-urls'] as $k => $v) {
            $out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
@ -1040,7 +1040,19 @@ class Parser {
        if ($this->options['basicTrim']) {
            return $this->getCleanTextBasic($node, $prefix);
        } else {
-            return $this->getCleanTextThorough($node, $prefix);
+            // https://microformats.org/wiki/textcontent-parsing
+            # Plain text of element
+            # To get the plain text for an Element input:
+            # Let output be the result of running [Element to string] on input
+            $output = $this->getCleanTextThorough($node, $prefix);
+            # Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
+            $output = preg_replace('/^\s+|\s+$/m', "", $output);
+            # Strip leading and trailing ASCII whitespace from output
+            $output = trim($output);
+            # Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
+            $output = preg_replace('/ {2,}/', " ", $output);
+            # Return output
+            return $output;
        }
    }

@ -1060,7 +1072,7 @@ class Parser {
                // NOTE: This ought to include FORM FEED characters
                $value = strtr($value, "\t\n\r\f", "    ");
                # Append value to output
-                $output .= $value;
+                $output[] = $value;
            } elseif ($n instanceof \DOMElement) {
                # If child is an Element, switch on its tagName:
                // NOTE: we switch on localName instead to avoid silly case folding
@ -1068,37 +1080,59 @@ class Parser {
                    case "script":
                    case "style":
                    case "template":
-                    # SCRIPT
-                    # STYLE
-                    // TEMPLATE as well
+                        # SCRIPT
+                        # STYLE
+                        // TEMPLATE as well
                        # Continue
                        continue 2;
-                    
-                # IMG
-                # If child has an alt attribute, then:
-                # Let value be the contents of the alt attribute
-                # Strip leading and trailing ASCII whitespace from value
-                # Else if child has a src attribute, then:
-                # Let value be the contents of the src attribute
-                # Strip leading and trailing ASCII whitespace from value
-                # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
-                # Else continue
-                # Append and prepend a single U+0020 SPACE code point to value
-                # Append value to output
-                # BR
-                # Append a string containing a single U+000A LF code point to output
-                # P
-                # Let value be the result of running this algorithm on child
-                # Prepend a single U+000A LF code point to value
-                # Append value to output
-                # Any other value
-                # Let value be the result of running this algorithm on child
-                # Append value to output
+                    case "img":
+                        # IMG
+                        if ($n->hasAttribute("alt")) {
+                            # If child has an alt attribute, then:
+                            # Let value be the contents of the alt attribute
+                            # Strip leading and trailing ASCII whitespace from value
+                            $value = trim($n->getAttribute("alt"));
+                        } elseif ($n->hasAttribute("src")) {
+                            # Else if child has a src attribute, then:
+                            # Let value be the contents of the src attribute
+                            # Strip leading and trailing ASCII whitespace from value
+                            $value = trim($n->getAttribute("src"));
+                            # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
+                            $value = $this->normalizeUrl($value);
+                        } else {
+                            # Else continue
+                            continue 2;
+                        }
+                        # Append and prepend a single U+0020 SPACE code point to value
+                        # Append value to output
+                        $output[] = " ".$value." ";
+                        break;
+                    case "br":
+                        # BR
+                        # Append a string containing a single U+000A LF code point to output
+                        $output[] = "\n";
+                        break;
+                    case "p":
+                        # P
+                        # Let value be the result of running this algorithm on child
+                        # Prepend a single U+000A LF code point to value
+                        # Append value to output
+                        $output[] = "\n".$this->getCleanTextThorough($n, $prefix);
+                        break;
+                    default:
+                        # Any other value
+                        # Let value be the result of running this algorithm on child
+                        # Append value to output
+                        $output[] = $this->getCleanTextThorough($n, $prefix);
+                        break;
                }
-        # Else continue
+            } else {
+                # Else continue
+                continue;
            }
-        # Return the concatenation of output
        }
+        # Return the concatenation of output
+        return implode("", $output);
    }

    protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
--- a/tests/cases/StandardTest.php
+++ b/tests/cases/StandardTest.php
@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
    ];

    /** @dataProvider provideStandardTests */
-    public function testStandardTests(string $name, string $path): void {
+    public function testStandardTests(string $name, string $path, $options): void {
        if (isset(self::SUPPRESSED[$name])) {
            $this->markTestIncomplete(self::SUPPRESSED[$name]);
        }
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
        // fix up expectation where necessary
        array_walk_recursive($exp, function(&$v) {
            // URLs differ trivially from output of our normalization library
-            if (preg_match('#^https?://[^/]+$#', $v)) {
-                $v .= "/";
-            }
+            $v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
        });
+        // URLs also need fixing as keys in rel-urls
+        foreach ($exp['rel-urls'] as $k => $v) {
+            $fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
+            $exp['rel-urls'][$fixed] = $v;
+            if ($fixed !== $k) {
+                unset($exp['rel-urls'][$k]);
+            }
+        }
        // perform some further monkey-patching on specific tests
        $exp = $this->fixTests($exp, $name);
        // parse input
        $dom = new DOMParser;
        $parser = new Parser;
        $doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
-        $act = $parser->parseElement($doc->documentElement, "http://example.com");
+        $act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
        // sort both arrays
        $this->ksort($exp);
        $this->ksort($act);
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
    }

    public function provideStandardTests(): \Generator {
-        return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
+        // the standard tests
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
+        // tests from php-mf2
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
    }

-    protected function provideTestList(): \Generator {
-        $tests = [
-            \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
-            \MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
-        ];
+    protected function provideTestList(array $tests, ?array $options = null): \Generator {
        foreach ($tests as $base) {
            $base = strtr($base, "\\", "/");
            foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
                $path =  preg_replace('/\.json$/', '', $path);
                $name = strtr($path, "\\", "/");
                $name = str_replace(strtr($base, "\\", "/"), "", $name);
-                yield $name => [$name, $path];
+                yield $name => [$name, $path, $options];
            }
        }
    }