From 43f380c1f9cd7fa76ed46e38199cc1cc0c94e787 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sun, 15 Dec 2019 19:45:59 -0500 Subject: [PATCH] Fix EOF and end tags - End tags now emit errors if they have attributes - End tags now emit errors if they are self-closing - The last character before EOF is now correctly reconsumed Also changed the tokenizer debug log to be zero-cost --- lib/Data.php | 8 +++++- lib/ParseError.php | 6 ++++- lib/Tokenizer.php | 47 ++++++++++++++++++++++++++++++----- tests/cases/TestTokenizer.php | 2 +- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/lib/Data.php b/lib/Data.php index 7ca9441..0c95644 100644 --- a/lib/Data.php +++ b/lib/Data.php @@ -17,6 +17,8 @@ class Data { // Used for error reporting when unconsuming to calculate column number from // last newline. protected $newlines = []; + // Whether the EOF imaginary character has been consumed + protected $eof = false; // Used for debugging to print out information as data is consumed. @@ -73,6 +75,10 @@ class Data { $string .= $char; } + if ($char === '') { + $this->eof = true; + } + if (self::$debug) { echo "\nConsume\n==========\n"; echo "Length: $length\n"; @@ -89,7 +95,7 @@ class Data { public function unconsume(int $length = 1) { assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length)); - if (!$this->data->eof()) { + if (!$this->eof) { $this->data->seek(0 - $length); $string = $this->data->peekChar($length); diff --git a/lib/ParseError.php b/lib/ParseError.php index 6769834..7dcc7f7 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -47,6 +47,8 @@ class ParseError { const ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER = 133; const UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 134; const EOF_IN_CDATA = 135; + const END_TAG_WITH_ATTRIBUTES = 136; + const END_TAG_WITH_TRAILING_SOLIDUS = 137; protected static $messages = [ self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character', @@ -84,10 +86,12 @@ class ParseError { 
self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => 'Abrupt DOCTYPE "SYSTEM" identifier', self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 'Unexpected character "%s" after DOCTYPE "SYSTEM" identifier', self::EOF_IN_CDATA => 'End-of-file in CDATA section', + self::END_TAG_WITH_ATTRIBUTES => 'End-tag with attributes', + self::END_TAG_WITH_TRAILING_SOLIDUS => 'End-tag with trailing solidus', ]; public function setHandler() { - // Set the error handler and honor already-set error reporting rules. + // Set the error handler and honor already-set error reporting rules. set_error_handler([$this, 'errorHandler'], error_reporting()); } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 084b841..9d4a40c 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -6,6 +6,8 @@ class Tokenizer { use ParseErrorEmitter; public $state; + public $debugLog = ''; + public $debugCount = 0; protected $data; protected $stack; @@ -133,6 +135,7 @@ class Tokenizer { self::COMMENT_LESS_THAN_SIGN_STATE => "Comment less-than sign", self::COMMENT_LESS_THAN_SIGN_BANG_STATE => "Comment less-than sign bang", self::COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE => "Comment less-than sign bang dash", + self::COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE => "Comment less-than sign bang dash dash", self::COMMENT_END_DASH_STATE => "Comment end dash", self::COMMENT_END_STATE => "Comment end", self::COMMENT_END_BANG_STATE => "Comment end bang", @@ -166,6 +169,24 @@ class Tokenizer { $this->errorHandler = $errorHandler; } + protected function sanitizeTag(TagToken $token): void { + if ($token instanceof EndTagToken) { + # When an end tag token is emitted with attributes, + # that is an end-tag-with-attributes parse error. + if ($token->attributes) { + $this->error(ParseError::END_TAG_WITH_ATTRIBUTES); + $token->attributes = []; + } + # When an end tag token is emitted with its self-closing + # flag set, that is an end-tag-with-trailing-solidus parse error. 
+ if ($token->selfClosing) { + $this->error(ParseError::END_TAG_WITH_TRAILING_SOLIDUS); + $token->selfClosing = false; + } + } + + } + protected function keepOrDiscardAttribute(TagToken $token, TokenAttr $attribute): void { // See 12.2.5.33 Attribute name state @@ -190,13 +211,17 @@ class Tokenizer { } public function createToken(): Token { + assert((function() { + $this->debugLog .= "TOKEN ".++$this->debugCount."\n"; + return true; + })()); + while (true) { - if (self::$debug) { - $state = self::STATE_NAMES[$this->state] ?? ""; - assert(strlen($state) > 0); - echo "State: $state\n"; - unset($state); - } + assert((function() { + $state = self::STATE_NAMES[$this->state] ?? $this->state; + $this->debugLog .= " State: $state\n"; + return true; + })()); # 12.2.5.1 Data state if ($this->state === self::DATA_STATE) { @@ -513,6 +538,7 @@ class Tokenizer { # Switch to the data state. Emit the current tag token. $this->state = self::DATA_STATE; assert(isset($token) && $token instanceof TagToken); + $this->sanitizeTag($token); return $token; } # Uppercase ASCII letter @@ -639,6 +665,7 @@ class Tokenizer { assert(isset($token) && $token instanceof Token); if ($token->name === $this->stack->currentNodeName) { $this->state = self::DATA_STATE; + $this->sanitizeTag($token); return $token; } else { goto RCDATA_end_tag_name_state_anything_else; @@ -765,6 +792,7 @@ class Tokenizer { assert(isset($token) && $token instanceof Token); if ($token->name === $this->stack->currentNodeName) { $this->state = self::DATA_STATE; + $this->sanitizeTag($token); return $token; } else { goto RAWTEXT_end_tag_name_state_anything_else; @@ -898,6 +926,7 @@ class Tokenizer { assert(isset($token) && $token instanceof Token); if ($token->name === $this->stack->currentNodeName) { $this->state = self::DATA_STATE; + $this->sanitizeTag($token); return $token; } else { goto script_data_end_tag_name_state_anything_else; @@ -1211,6 +1240,7 @@ class Tokenizer { assert(isset($token) && $token instanceof Token); if 
($token->name === $this->stack->currentNodeName) { $this->state = self::DATA_STATE; + $this->sanitizeTag($token); return $token; } else { goto script_data_escaped_end_tag_name_state_anything_else; @@ -1639,6 +1669,7 @@ class Tokenizer { # Emit the current tag token. $this->state = self::DATA_STATE; assert(isset($token) && $token instanceof Token); + $this->sanitizeTag($token); return $token; } # EOF @@ -1689,6 +1720,7 @@ class Tokenizer { $this->error(ParseError::MISSING_ATTRIBUTE_VALUE); $this->state = self::DATA_STATE; assert(isset($token) && $token instanceof Token); + $this->sanitizeTag($token); return $token; } # Anything else @@ -1818,6 +1850,7 @@ class Tokenizer { # Switch to the data state. Emit the current tag token. $this->state = self::DATA_STATE; assert(isset($token) && $token instanceof Token); + $this->sanitizeTag($token); return $token; } # U+0000 NULL @@ -1880,6 +1913,7 @@ class Tokenizer { # Emit the current tag token. $this->state = self::DATA_STATE; assert(isset($token) && $token instanceof Token); + $this->sanitizeTag($token); return $token; } # EOF @@ -1912,6 +1946,7 @@ class Tokenizer { assert(isset($token) && $token instanceof Token); $token->selfClosing = true; $this->state = self::DATA_STATE; + $this->sanitizeTag($token); return $token; } # EOF diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php index ee3c082..6286d60 100644 --- a/tests/cases/TestTokenizer.php +++ b/tests/cases/TestTokenizer.php @@ -39,7 +39,7 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest { } while (!($t instanceof EOFToken)); array_pop($actual); $actual = $this->combineCharacterTokens($actual); - $this->assertEquals($expected, $actual); + $this->assertEquals($expected, $actual, $tokenizer->debugLog); } public function provideStandardTokenizerTests() {