Align parser better with test suite

1. Null-character tree construction errors have been added 2. Comments outside the root element are now tested for correctly
3 years ago · 905ea234bc
4 changed files with 25 additions and 45 deletions
--- a/README.md
+++ b/README.md
@ -19,11 +19,10 @@ The API is still in flux, but should be finalized soon.
 The primary aim of this library is accuracy. If the document object differs from what the specification mandates, this is probably a bug. However, we are also constrained by PHP, which imposes various limtations. These are as follows:

 - Due to PHP's DOM being designed for XML, element and attribute names which are illegal in XML are mangled as recommended by the specification
- PHP's DOM does not allow comments to be inserted outside the root element. The parser will perform the insertions, but the comment nodes are then silently dropped
 - PHP's DOM has no special understanding of the HTML `<template>` element. Consequently template contents is treated no differently from the children of other elements
 - PHP's DOM treats `xmlns` attributes specially. Attributes which would change the namespace URI of an element or prefix to inconsistent values are thus dropped
- Due to a PHP bug which severely degrades performance with large documents and in consideration of existing PHP software, HTML elements are placed in the null namespace rather than in the HTML namespace
- PHP's DOM does not allow DOCTYPEs with no name (i.e. `<!DOCTYPE>` rather than `<!DOCTYPE html>`); in such cases the parser will create a DOCTYPE using a single `U+0020 SPACE` character as its name
+- Due to a PHP bug which severely degrades performance with large documents and in consideration of existing PHP software, HTML elements are placed in the null namespace by default rather than in the HTML namespace
+- PHP's DOM does not allow DOCTYPEs with no name (i.e. `<!DOCTYPE >` rather than `<!DOCTYPE html>`); in such cases the parser will create a DOCTYPE using a single `U+0020 SPACE` character as its name

 ## Comparison with `masterminds/html5`

@ -45,7 +44,7 @@ This library and [masterminds/html5](https://packagist.org/packages/masterminds/
 | Handling of omitted start tags                      | Elements are not inserted             | Elements are not inserted                                | Per specification                      |
 | Handling of processing instructions                 | Retained                              | Retained                                                 | Per specification, configurable        |
 | Handling of bogus XLink namespace\*                 | Foreign content not supported         | XLink attributes are lost if preceded by bogus namespace | Bogus namespace is ignored             |
-| Namespace for HTML elements                         | Null                                  | Per specification, configurable                          | Null                                   |
+| Namespace for HTML elements                         | Null                                  | Per specification, configurable                          | Null, configurable                                   |
 | Time needed to parse single-page HTML specification | 0.5 seconds                           | 2.7 seconds†                                             | 6.0 seconds‡                           |
 | Peak memory needed for same                         | 11.6 MB                               | 38 MB                                                    | 13.9 MB                                |

@ -53,4 +52,4 @@ This library and [masterminds/html5](https://packagist.org/packages/masterminds/

 † With HTML namespace disabled. With HTML namespace enabled it does not finish in a reasonable time due to a PHP bug.

-‡ With parse errors suppressed. Reporting parse errors adds approximately 10% overhead.
+‡ With parse errors suppressed. Reporting parse errors adds approximately 10% overhead.
--- a/lib/Parser/ParseError.php
+++ b/lib/Parser/ParseError.php
@ -76,6 +76,8 @@ class ParseError {
    public const FOSTERED_START_TAG                                                = 215;
    public const FOSTERED_END_TAG                                                  = 216;
    public const FOSTERED_CHAR                                                     = 217;
+    public const UNEXPECTED_NULL_CHARACTER_OMIT                                    = 218;
+    public const UNEXPECTED_NULL_CHARACTER_REPLACE                                 = 219;

    public const MESSAGES = [
        self::EXPECTED_DOCTYPE_BUT_GOT_START_TAG                                => 'Expected DOCTYPE but got start tag <%s>',
@ -96,6 +98,8 @@ class ParseError {
        self::FOSTERED_START_TAG                                                => 'Start tag <%s> moved to before table',
        self::FOSTERED_END_TAG                                                  => 'End tag </%s> moved to before table',
        self::FOSTERED_CHAR                                                     => 'Character moved to before table',
+        self::UNEXPECTED_NULL_CHARACTER_OMIT                                    => 'Unexpected null character; omitted from document',
+        self::UNEXPECTED_NULL_CHARACTER_REPLACE                                 => 'Unexpected null character; replacement character substituted',

        self::ENCODING_ERROR                                                    => 'Corrupt encoding near byte position %s',
        self::UNEXPECTED_NULL_CHARACTER                                         => 'Unexpected null character',
--- a/lib/Parser/TreeBuilder.php
+++ b/lib/Parser/TreeBuilder.php
@ -1255,9 +1255,7 @@ class TreeBuilder {
                    # A character token that is U+0000 NULL
                    elseif ($token instanceof NullCharacterToken) {
                        # Parse error. Ignore the token
-                        // DEVIATION: the parse error is already reported by the tokenizer;
-                        // this is probably an oversight in the specification, so we don't
-                        // report it a second time
+                        $this->error(ParseError::UNEXPECTED_NULL_CHARACTER_OMIT);
                    }
                    # Any other character token
                    elseif ($token instanceof CharacterToken) {
@ -2258,7 +2256,7 @@ class TreeBuilder {
                    # A character token that is U+0000 NULL
                    if ($token instanceof NullCharacterToken) {
                        # Parse error. Ignore the token.
-                        $this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
+                        $this->error(ParseError::UNEXPECTED_NULL_CHARACTER_OMIT);
                    }
                    # Any other character token
                    elseif ($token instanceof CharacterToken) {
@ -3450,7 +3448,7 @@ class TreeBuilder {
                # A character token that is U+0000 NULL
                if ($token instanceof NullCharacterToken) {
                    # Parse error. Insert a U+FFFD REPLACEMENT CHARACTER character.
-                    // DEVIATION: Parse errors for null characters are already emitted by the tokenizer
+                    $this->error(ParseError::UNEXPECTED_NULL_CHARACTER_REPLACE);
                    $this->insertCharacterToken(new CharacterToken("\u{FFFD}"));
                }
                # A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A),
--- a/tests/cases/TestTreeConstructor.php
+++ b/tests/cases/TestTreeConstructor.php
@ -115,7 +115,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
        if ($errors !== false) {
            $actualErrors = $this->formatErrors($errorHandler->errors);
            // If $errors is false, the test does not include errors when there are in fact errors
-            $this->assertCount(sizeof($errors), $actualErrors, var_export($errors, true).var_export($actualErrors, true));
+            $this->assertCount(sizeof($errors), $actualErrors, $treeBuilder->debugLog."\n".var_export($errors, true).var_export($actualErrors, true));
        }
    }

@ -135,15 +135,6 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
    protected function patchTest(string $data, $fragment, array $errors, array $exp): array {
        $patched = false;
        $skip = "";
-        // comments outside the root element are silently dropped by the PHP DOM
-        if (!$fragment) {
-            for ($a = 0; $a < sizeof($exp); $a++) {
-                if (strpos($exp[$a], "| <!--") === 0) {
-                    array_splice($exp, $a--, 1);
-                    $patched = true;
-                }
-            }
-        }
        // some tests don't document errors when they should
        if (!$errors && in_array($data, [
            // math.dat
@ -306,20 +297,9 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
                }
            }
            $errors = array_values($errors);
-            // other errors are spurious, or are for runs of character tokens
+            // other errors are spurious
            for ($a = 0, $stop = sizeof($errors); $a < $stop; $a++) {
-                if (preg_match("/^\((\d+),(\d+)\):? (foster-parenting-character(?:-in-table)?|unexpected-character-in-colgroup|unexpected-char-after-frameset|unexpected-char-in-frameset|expected-eof-but-got-char)$/", $errors[$a], $m1) && preg_match("/^\((\d+),(\d+)\):? $m1[3]$/", $errors[$a + 1] ?? "", $m2)) {
-                    // if the next error is also a character error at the next or same character position, this implies a run of characters where we only have one token
-                    // technically we should be reporting each one, so this is properly a FIXME
-                    if ($m1[1] == $m2[1] && ($m1[2] + 1 == $m2[2] || $m1[2] == $m2[2])) {
-                        unset($errors[$a]);
-                        $patched = true;
-                    }
-                } elseif (preg_match("/^foster-parenting text /", $errors[$a]) && preg_match("/^foster-parenting text /", $errors[$a + 1] ?? "")) {
-                    // template tests have a different format of error message
-                    unset($errors[$a]);
-                    $patched = true;
-                } elseif (preg_match("/^\((\d+,\d+)\):? unexpected-end-tag$/", $errors[$a], $m) && preg_match("/^\($m[1]\):? (unexpected-end-tag|end-tag-too-early|expected-one-end-tag-but-got-another|adoption-agency-1.3)$/", $errors[$a + 1] ?? "")) {
+                if (preg_match("/^\((\d+,\d+)\):? unexpected-end-tag$/", $errors[$a], $m) && preg_match("/^\($m[1]\):? (unexpected-end-tag|end-tag-too-early|expected-one-end-tag-but-got-another|adoption-agency-1.3)$/", $errors[$a + 1] ?? "")) {
                    // unexpected-end-tag errors should only be reported once for a given tag
                    unset($errors[$a]);
                }
@ -353,18 +333,8 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
                $this->serializeNode($n);
            }
        } else {
-            if ($d->doctype) {
-                $dt = "<!DOCTYPE ";
-                $dt .= ($d->doctype->name !== " ") ? $d->doctype->name : "";
-                if (strlen($d->doctype->publicId) || strlen($d->doctype->systemId)) {
-                    $dt .= ' "'.$d->doctype->publicId.'"';
-                    $dt .= ' "'.$d->doctype->systemId.'"';
-                }
-                $dt .= ">";
-                $this->push($dt);
-            }
-            if ($d->documentElement) {
-                $this->serializeElement($d->documentElement);
+            foreach ($d->childNodes as $n) {
+                $this->serializeNode($n);
            }
        }
        return $this->out;
@ -427,6 +397,15 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
            $this->push("<!-- ".$n->data." -->");
        } elseif ($n instanceof \DOMCharacterData) {
            $this->push('"'.$n->data.'"');
+        } elseif ($n instanceof \DOMDocumentType) {
+            $dt = "<!DOCTYPE ";
+            $dt .= ($n->name !== " ") ? $n->name : "";
+            if (strlen($n->publicId) || strlen($n->systemId)) {
+                $dt .= ' "'.$n->publicId.'"';
+                $dt .= ' "'.$n->systemId.'"';
+            }
+            $dt .= ">";
+            $this->push($dt);
        } else {
            throw new \Exception("Node type ".get_class($n)." not handled");
        }