Baseline pass over tokenizer

- Implemented missing states (except entity and char ref states) - Re-copied and reformatted most text from the specification - Emitted parse errors per spec (except invalid characters) - Properly handled null characters - Passed through invalid characters (these do not yet emit errors) - Added assertions before manipulation of tokens and temporary buffers - Removed problematic optimizations - Removed explicit continue statements - Allowed end tags to have attributes - Simplified duplicate attribute detection - Corrected DOCTYPE properties not being "missing" - Skipped BOM in encoding-neutral way. I may have introduced regressions, and the assertions are mostly serving to mask undefined-variable errors rather than helping to fix them, but at least warnings and notices are not being spammed this way. Work still needs to be done in emitting errors for invalid characters (and invalid character sequences), as well as in consuming character references and entities correctly, not to mention general debugging.
5 years ago · d08438052a
4 changed files with 1747 additions and 1325 deletions
--- a/lib/Data.php
+++ b/lib/Data.php
@ -42,29 +42,18 @@ class Data {
        // encoding. At this moment this implementation won't determine a character
        // encoding and will just assume UTF-8.

-        # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present
-        # in the input stream.
-
-        # Note: The handling of U+0000 NULL characters varies based on where the
-        # characters are found. In general, they are ignored except where doing so could
-        # plausibly introduce an attack vector. This handling is, by necessity, spread
-        # across both the tokenization stage and the tree construction stage.
-
-        // DEVIATION: Just going to remove NULL characters. There is no scripting involved
-        // in this implementation and therefore no attack vector possible due to it.
-        $data = preg_replace(['/^\xEF\xBB\xBF/','/\x00/'], '', $data);
-
-        // Won't provide line or column counts for this as it's done before that
-        // information is available. It will be rare that this is triggered.
-        $data = preg_replace_callback('/(?:[\x01-\x08\x0B\x0E-\x1F\x7F]|\xC2[\x80-\x9F]|\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF])|\xEF\xB7[\x90-\xAF]|\xEF\xBF[\xBE\xBF]|[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF])/u', function($matches) {
-            $this->error(ParseError::INVALID_CONTROL_OR_NONCHARACTERS);
-            return '';
-        }, $data);
-

        // Normalize line breaks. Convert CRLF and CR to LF.
        // Break the string up into a traversable object.
        $this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data));
+
+        # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present
+        # in the input stream.
+
+        if ($this->data->nextChar() !== '\xEF\xBB\xBF') {
+            // rewind to the start of the string if the first character was not a BOM
+            $this->data->rewind();
+        }
    }

    public function consume(int $length = 1): string {
--- a/lib/ParseError.php
+++ b/lib/ParseError.php
@ -5,38 +5,85 @@ namespace dW\HTML5;
 class ParseError {
    protected $data;

-    const TAG_NAME_EXPECTED = 0;
    const UNEXPECTED_EOF = 1;
-    const UNEXPECTED_CHARACTER = 2;
-    const ATTRIBUTE_EXISTS = 3;
-    const UNEXPECTED_END_OF_TAG = 4;
-    const UNEXPECTED_START_TAG = 5;
-    const UNEXPECTED_END_TAG = 6;
-    const UNEXPECTED_DOCTYPE = 7;
-    const INVALID_DOCTYPE = 8;
    const INVALID_CONTROL_OR_NONCHARACTERS = 9;
-    const UNEXPECTED_XMLNS_ATTRIBUTE_VALUE = 10;
    const ENTITY_UNEXPECTED_CHARACTER = 11;
    const INVALID_NUMERIC_ENTITY = 12;
    const INVALID_NAMED_ENTITY = 13;
    const INVALID_CODEPOINT = 14;

+    const UNEXPECTED_NULL_CHARACTER                                         = 101;
+    const UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME                      = 102;
+    const EOF_BEFORE_TAG_NAME                                               = 103;
+    const INVALID_FIRST_CHARACTER_OF_TAG_NAME                               = 104;
+    const MISSING_END_TAG_NAME                                              = 105;
+    const EOF_IN_TAG                                                        = 106;
+    const EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT                              = 107;
+    const UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME                      = 108;
+    const DUPLICATE_ATTRIBUTE                                               = 109;
+    const UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME                            = 110;
+    const MISSING_ATTRIBUTE_VALUE                                           = 111;
+    const UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE                  = 112;
+    const MISSING_WHITESPACE_BETWEEN_ATTRIBUTES                             = 113;
+    const UNEXPECTED_SOLIDUS_IN_TAG                                         = 114;
+    const CDATA_IN_HTML_CONTENT                                             = 115;
+    const INCORRECTLY_OPENED_COMMENT                                        = 116;
+    const ABRUPT_CLOSING_OF_EMPTY_COMMENT                                   = 117;
+    const EOF_IN_COMMENT                                                    = 118;
+    const NESTED_COMMENT                                                    = 119;
+    const INCORRECTLY_CLOSED_COMMENT                                        = 120;
+    const EOF_IN_DOCTYPE                                                    = 121;
+    const MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME                            = 122;
+    const MISSING_DOCTYPE_NAME                                              = 123;
+    const INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME                     = 124;
+    const MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD                   = 125;
+    const MISSING_DOCTYPE_PUBLIC_IDENTIFIER                                 = 126;
+    const MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER                    = 127;
+    const ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER                                  = 128;
+    const MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS  = 129;
+    const MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD                   = 130;
+    const MISSING_DOCTYPE_SYSTEM_IDENTIFIER                                 = 131;
+    const MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER                    = 132;
+    const ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER                                  = 133;
+    const UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER              = 134;
+    const EOF_IN_CDATA                                                      = 135;
+
    protected static $messages = [
-        self::TAG_NAME_EXPECTED                => 'Tag name expected',
-        self::UNEXPECTED_EOF                   => 'Unexpected end-of-file',
-        self::UNEXPECTED_CHARACTER             => 'Unexpected "%s" character',
-        self::ATTRIBUTE_EXISTS                 => '%s attribute already exists; discarding',
-        self::UNEXPECTED_END_OF_TAG            => 'Unexpected end-of-tag',
-        self::UNEXPECTED_START_TAG             => 'Unexpected %s start tag',
-        self::UNEXPECTED_END_TAG               => 'Unexpected %s end tag',
-        self::UNEXPECTED_DOCTYPE               => 'Unexpected DOCTYPE',
-        self::INVALID_DOCTYPE                  => 'Invalid DOCTYPE',
-        self::INVALID_CONTROL_OR_NONCHARACTERS => 'Invalid Control or Non-character; removing',
-        self::UNEXPECTED_XMLNS_ATTRIBUTE_VALUE => 'Unexpected xmlns attribute value',
-        self::ENTITY_UNEXPECTED_CHARACTER      => 'Unexpected "%s" character in entity; %s expected',
-        self::INVALID_NUMERIC_ENTITY           => '"%s" is an invalid numeric entity',
-        self::INVALID_NAMED_ENTITY             => '"%s" is an invalid name for an entity',
-        self::INVALID_CODEPOINT                => '"%s" is an invalid character codepoint'
+        self::UNEXPECTED_NULL_CHARACTER                                         => 'Unexpected null character',
+        self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME                      => 'Unexpected "?" character instead of tag name',
+        self::EOF_BEFORE_TAG_NAME                                               => 'End-of-file before tag name',
+        self::INVALID_FIRST_CHARACTER_OF_TAG_NAME                               => 'Invalid first character "%s" of tag name',
+        self::MISSING_END_TAG_NAME                                              => 'Missing end-tag name',
+        self::EOF_IN_TAG                                                        => 'End-of-file in tag',
+        self::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT                              => 'End-of-file in script (HTML comment-like) text',
+        self::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME                      => 'Unexpected equals sign before attribute name',
+        self::DUPLICATE_ATTRIBUTE                                               => 'Duplicate attribute "%s" in start tag',
+        self::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME                            => 'Unexpected character "%s" in attribute name',
+        self::MISSING_ATTRIBUTE_VALUE                                           => 'Missing attribute value',
+        self::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE                  => 'Unexpected character "%s" in unquoted attribute value',
+        self::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES                             => 'Missing whitespace between attributes',
+        self::UNEXPECTED_SOLIDUS_IN_TAG                                         => 'Unexpected solidus in tag',
+        self::CDATA_IN_HTML_CONTENT                                             => 'CDATA in HTML content',
+        self::INCORRECTLY_OPENED_COMMENT                                        => 'Incorrectly opened comment',
+        self::ABRUPT_CLOSING_OF_EMPTY_COMMENT                                   => 'Abrupt closing of empty comment',
+        self::EOF_IN_COMMENT                                                    => 'End-of-file in comment',
+        self::NESTED_COMMENT                                                    => 'Nested comment',
+        self::INCORRECTLY_CLOSED_COMMENT                                        => 'Incorrectly closed comment',
+        self::EOF_IN_DOCTYPE                                                    => 'End-of-file in DOCTYPE',
+        self::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME                            => 'Missing whitespace before DOCTYPE name',
+        self::MISSING_DOCTYPE_NAME                                              => 'Missing DOCTYPE name',
+        self::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME                     => 'Invalid character sequence after DOCTYPE name',
+        self::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD                   => 'Missing whitespace after DOCTYPE "PUBLIC" keyword',
+        self::MISSING_DOCTYPE_PUBLIC_IDENTIFIER                                 => 'Missing DOCTYPE "PUBLIC" identifier',
+        self::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER                    => 'Missing quote before DOCTYPE "PUBLIC" identifier',
+        self::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER                                  => 'Abrupt DOCTYPE "PUBLIC" identifier',
+        self::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS  => 'Missing whitespace between DOCTYPE "PUBLIC" and "SYSTEM" identifiers',
+        self::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD                   => 'Missing whitespace after DOCTYPE "SYSTEM" keyword',
+        self::MISSING_DOCTYPE_SYSTEM_IDENTIFIER                                 => 'Missing DOCTYPE "SYSTEM" identifier',
+        self::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER                    => 'Missing quote before DOCTYPE "SYSTEM" identifier',
+        self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER                                  => 'Abrupt DOCTYPE "SYSTEM" identifier',
+        self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER              => 'Unexpected character "%s" after DOCTYPE "SYSTEM" identifier',
+        self::EOF_IN_CDATA                                                      => 'End-of-file in CDATA section',
    ];

    public function setHandler() {
--- a/lib/Token.php
+++ b/lib/Token.php
@ -12,24 +12,22 @@ abstract class DataToken extends Token {
    }
 }

-abstract class TagToken extends Token {
-    public $name;
-
-    public function __construct(string $name) {
-        $this->name = $name;
-    }
-}
-
-class EOFToken extends Token {}
-
 class DOCTYPEToken extends Token {
+    # DOCTYPE tokens have a name, a public identifier,
+    #   a system identifier, and a force-quirks flag.
+    # When a DOCTYPE token is created, its name,
+    #   public identifier, and system identifier must
+    #   be marked as missing (which is a distinct state
+    #   from the empty string), and the force-quirks flag
+    #   must be set to off (its other state is on).
    public $forceQuirks = false;
+    public $name;
    public $public;
    public $system;

-    public function __construct(string $name = null, string $public = '', string $system = '') {
+    public function __construct(string $name = null, string $public = null, string $system = null) {
+        // null stands in for the distinct "missing" state
        $this->name = $name;
-
        $this->public = $public;
        $this->system = $system;
    }
@ -43,7 +41,14 @@ class CommentToken extends DataToken {
    }
 }

-class StartTagToken extends TagToken {
+abstract class TagToken extends Token {
+    # Start and end tag tokens have a tag name,
+    #   a self-closing flag, and a list of attributes,
+    #   each of which has a name and a value.
+    # When a start or end tag token is created, its
+    #   self-closing flag must be unset (its other state
+    #   is that it be set), and its attributes list must be empty.
+    public $name;
    public $namespace;
    public $selfClosing;
    public $attributes = [];
@ -51,7 +56,7 @@ class StartTagToken extends TagToken {
    public function __construct(string $name, bool $selfClosing = false, string $namespace = Parser::HTML_NAMESPACE) {
        $this->selfClosing = $selfClosing;
        $this->namespace = $namespace;
-        parent::__construct($name);
+        $this->name = $name;
    }

     public function getAttribute(string $name) {
@ -92,8 +97,12 @@ class StartTagToken extends TagToken {
     }
 }

+class StartTagToken extends TagToken {}
+
 class EndTagToken extends TagToken {}

+class EOFToken extends Token {}
+
 class TokenAttr {
    public $name;
    public $value;
@ -104,4 +113,4 @@ class TokenAttr {
        $this->value = $value;
        $this->namespace = $namespace;
    }
-}
+}
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php