Implement textContent parsing properly

2023-06-20 21:20:26 -04:00 · 2023-06-20 21:20:26 -04:00 · 65bbf61579
commit 65bbf61579
parent 650d1c7e98
2 changed files with 80 additions and 41 deletions
--- a/lib/Parser.php
+++ b/lib/Parser.php
@ -313,7 +313,7 @@ class Parser {
        }
        // sort and clean rel microformats
        foreach ($out['rels'] as $k => $v) {
-            $out['rels'][$k] = array_unique($v);
+            $out['rels'][$k] = array_values(array_unique($v));
        }
        foreach ($out['rel-urls'] as $k => $v) {
            $out['rel-urls'][$k]['rels'] = array_unique($v['rels']);
@ -1040,7 +1040,19 @@ class Parser {
        if ($this->options['basicTrim']) {
            return $this->getCleanTextBasic($node, $prefix);
        } else {
-            return $this->getCleanTextThorough($node, $prefix);
+            // https://microformats.org/wiki/textcontent-parsing
+            # Plain text of element
+            # To get the plain text for an Element input:
+            # Let output be the result of running [Element to string] on input
+            $output = $this->getCleanTextThorough($node, $prefix);
+            # Remove any sequence of one or more consecutive U+0020 SPACE code points directly before and after an U+000A LF code point from output
+            $output = preg_replace('/^\s+|\s+$/m', "", $output);
+            # Strip leading and trailing ASCII whitespace from output
+            $output = trim($output);
+            # Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
+            $output = preg_replace('/ {2,}/', " ", $output);
+            # Return output
+            return $output;
        }
    }
@ -1060,7 +1072,7 @@ class Parser {
                // NOTE: This ought to include FORM FEED characters
                $value = strtr($value, "\t\n\r\f", "    ");
                # Append value to output
-                $output .= $value;
+                $output[] = $value;
            } elseif ($n instanceof \DOMElement) {
                # If child is an Element, switch on its tagName:
                // NOTE: we switch on localName instead to avoid silly case folding
@ -1068,37 +1080,59 @@ class Parser {
                    case "script":
                    case "style":
                    case "template":
-                    # SCRIPT
+                        # SCRIPT
-                    # STYLE
+                        # STYLE
-                    // TEMPLATE as well
+                        // TEMPLATE as well
                        # Continue
                        continue 2;
-                    
+                    case "img":
-                # IMG
+                        # IMG
-                # If child has an alt attribute, then:
+                        if ($n->hasAttribute("alt")) {
-                # Let value be the contents of the alt attribute
+                            # If child has an alt attribute, then:
-                # Strip leading and trailing ASCII whitespace from value
+                            # Let value be the contents of the alt attribute
-                # Else if child has a src attribute, then:
+                            # Strip leading and trailing ASCII whitespace from value
-                # Let value be the contents of the src attribute
+                            $value = trim($n->getAttribute("alt"));
-                # Strip leading and trailing ASCII whitespace from value
+                        } elseif ($n->hasAttribute("src")) {
-                # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
+                            # Else if child has a src attribute, then:
-                # Else continue
+                            # Let value be the contents of the src attribute
-                # Append and prepend a single U+0020 SPACE code point to value
+                            # Strip leading and trailing ASCII whitespace from value
-                # Append value to output
+                            $value = trim($n->getAttribute("src"));
-                # BR
+                            # Set value to the absolute URL created by resolving value following the containing document’s language’s rules
-                # Append a string containing a single U+000A LF code point to output
+                            $value = $this->normalizeUrl($value);
-                # P
+                        } else {
-                # Let value be the result of running this algorithm on child
+                            # Else continue
-                # Prepend a single U+000A LF code point to value
+                            continue 2;
-                # Append value to output
+                        }
-                # Any other value
+                        # Append and prepend a single U+0020 SPACE code point to value
-                # Let value be the result of running this algorithm on child
+                        # Append value to output
-                # Append value to output
+                        $output[] = " ".$value." ";
+                        break;
+                    case "br":
+                        # BR
+                        # Append a string containing a single U+000A LF code point to output
+                        $output[] = "\n";
+                        break;
+                    case "p":
+                        # P
+                        # Let value be the result of running this algorithm on child
+                        # Prepend a single U+000A LF code point to value
+                        # Append value to output
+                        $output[] = "\n".$this->getCleanTextThorough($n, $prefix);
+                        break;
+                    default:
+                        # Any other value
+                        # Let value be the result of running this algorithm on child
+                        # Append value to output
+                        $output[] = $this->getCleanTextThorough($n, $prefix);
+                        break;
+                }
-        # Else continue
+            } else {
+                # Else continue
+                continue;
            }
        # Return the concatenation of output
        }
        # Return the concatenation of output
        return implode("", $output);
    }
    protected function getCleanTextBasic(\DOMElement $node, string $prefix): string {
--- a/tests/cases/StandardTest.php
+++ b/tests/cases/StandardTest.php
@ -22,7 +22,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
    ];
    /** @dataProvider provideStandardTests */
-    public function testStandardTests(string $name, string $path): void {
+    public function testStandardTests(string $name, string $path, $options): void {
        if (isset(self::SUPPRESSED[$name])) {
            $this->markTestIncomplete(self::SUPPRESSED[$name]);
        }
@ -32,17 +32,23 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
        // fix up expectation where necessary
        array_walk_recursive($exp, function(&$v) {
            // URLs differ trivially from output of our normalization library
-            if (preg_match('#^https?://[^/]+$#', $v)) {
+            $v = preg_replace('#^https?://[^/]+$#', "$0/", $v);
-                $v .= "/";
-            }
        });
+        // URLs also need fixing as keys in rel-urls
+        foreach ($exp['rel-urls'] as $k => $v) {
+            $fixed = preg_replace('#^https?://[^/]+$#', "$0/", $k);
+            $exp['rel-urls'][$fixed] = $v;
+            if ($fixed !== $k) {
+                unset($exp['rel-urls'][$k]);
+            }
+        }
        // perform some further monkey-patching on specific tests
        $exp = $this->fixTests($exp, $name);
        // parse input
        $dom = new DOMParser;
        $parser = new Parser;
        $doc = $dom->parseFromString($html, "text/html; charset=UTF-8");
-        $act = $parser->parseElement($doc->documentElement, "http://example.com");
+        $act = $parser->parseElement($doc->documentElement, "http://example.com", $options);
        // sort both arrays
        $this->ksort($exp);
        $this->ksort($act);
@ -55,14 +61,13 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
    }
    public function provideStandardTests(): \Generator {
-        return $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/");
+        // the standard tests
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/"], ['basicTrim' => true]);
+        // tests from php-mf2
+        yield from $this->provideTestList([\MensBeam\Microformats\BASE."tests/cases/json/"], null);
    }
-    protected function provideTestList(): \Generator {
+    protected function provideTestList(array $tests, ?array $options = null): \Generator {
-        $tests = [
-            \MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", // standard tests
-            \MensBeam\Microformats\BASE."tests/cases/json/", // additional tests
-        ];
        foreach ($tests as $base) {
            $base = strtr($base, "\\", "/");
            foreach (new \RegexIterator(new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($base )), '/\.json$/') as $file) {
@ -70,7 +75,7 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
                $path =  preg_replace('/\.json$/', '', $path);
                $name = strtr($path, "\\", "/");
                $name = str_replace(strtr($base, "\\", "/"), "", $name);
-                yield $name => [$name, $path];
+                yield $name => [$name, $path, $options];
            }
        }
    }