Browse Source

Fix EOF and end tags

- End tags now emit errors if they have attributes
- End tags now emit errors if they are self-closing
- The last character before EOF is now correctly reconsumed

Also changed the tokenizer debug log to be zero-cost
ns
J. King 5 years ago
parent
commit
43f380c1f9
  1. 8
      lib/Data.php
  2. 6
      lib/ParseError.php
  3. 47
      lib/Tokenizer.php
  4. 2
      tests/cases/TestTokenizer.php

8
lib/Data.php

@ -17,6 +17,8 @@ class Data {
// Used for error reporting when unconsuming to calculate column number from
// last newline.
protected $newlines = [];
// Whether the EOF imaginary character has been consumed
protected $eof = false;
// Used for debugging to print out information as data is consumed.
@ -73,6 +75,10 @@ class Data {
$string .= $char;
}
if ($char === '') {
$this->eof = true;
}
if (self::$debug) {
echo "\nConsume\n==========\n";
echo "Length: $length\n";
@ -89,7 +95,7 @@ class Data {
public function unconsume(int $length = 1) {
assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));
if (!$this->data->eof()) {
if (!$this->eof) {
$this->data->seek(0 - $length);
$string = $this->data->peekChar($length);

6
lib/ParseError.php

@ -47,6 +47,8 @@ class ParseError {
const ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER = 133;
const UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 134;
const EOF_IN_CDATA = 135;
const END_TAG_WITH_ATTRIBUTES = 136;
const END_TAG_WITH_TRAILING_SOLIDUS = 137;
protected static $messages = [
self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character',
@ -84,10 +86,12 @@ class ParseError {
self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => 'Abrupt DOCTYPE "SYSTEM" identifier',
self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 'Unexpected character "%s" after DOCTYPE "SYSTEM" identifier',
self::EOF_IN_CDATA => 'End-of-file in CDATA section',
self::END_TAG_WITH_ATTRIBUTES => 'End-tag with attributes',
self::END_TAG_WITH_TRAILING_SOLIDUS => 'End-tag with trailing solidus',
];
/**
 * Registers this object's errorHandler method as PHP's active error handler.
 */
public function setHandler() {
// Set the error handler and honor already-set error reporting rules:
// the current error_reporting() level is passed as the handler's level mask.
set_error_handler([$this, 'errorHandler'], error_reporting());
}

47
lib/Tokenizer.php

@ -6,6 +6,8 @@ class Tokenizer {
use ParseErrorEmitter;
public $state;
public $debugLog = '';
public $debugCount = 0;
protected $data;
protected $stack;
@ -133,6 +135,7 @@ class Tokenizer {
self::COMMENT_LESS_THAN_SIGN_STATE => "Comment less-than sign",
self::COMMENT_LESS_THAN_SIGN_BANG_STATE => "Comment less-than sign bang",
self::COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE => "Comment less-than sign bang dash",
self::COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE => "Comment less-than sign bang dash dash",
self::COMMENT_END_DASH_STATE => "Comment end dash",
self::COMMENT_END_STATE => "Comment end",
self::COMMENT_END_BANG_STATE => "Comment end bang",
@ -166,6 +169,24 @@ class Tokenizer {
$this->errorHandler = $errorHandler;
}
/**
 * Reports and strips features that are not permitted on end tags.
 *
 * Only EndTagToken instances are affected; any other TagToken is left
 * untouched. For an end tag, a non-empty attribute list and a set
 * self-closing flag each raise a parse error via $this->error(), after
 * which the offending value is reset on the token.
 *
 * @param TagToken $token The tag token to check; modified in place.
 */
protected function sanitizeTag(TagToken $token): void {
    // Nothing to enforce unless this is an end tag.
    if (!($token instanceof EndTagToken)) {
        return;
    }

    # When an end tag token is emitted with attributes,
    # that is an end-tag-with-attributes parse error.
    if ($token->attributes) {
        $this->error(ParseError::END_TAG_WITH_ATTRIBUTES);
        // Drop the attributes after reporting them.
        $token->attributes = [];
    }

    # When an end tag token is emitted with its self-closing
    # flag set, that is an end-tag-with-trailing-solidus parse error.
    if ($token->selfClosing) {
        $this->error(ParseError::END_TAG_WITH_TRAILING_SOLIDUS);
        // Clear the flag after reporting it.
        $token->selfClosing = false;
    }
}
protected function keepOrDiscardAttribute(TagToken $token, TokenAttr $attribute): void {
// See 12.2.5.33 Attribute name state
@ -190,13 +211,17 @@ class Tokenizer {
}
public function createToken(): Token {
assert((function() {
$this->debugLog .= "TOKEN ".++$this->debugCount."\n";
return true;
})());
while (true) {
if (self::$debug) {
$state = self::STATE_NAMES[$this->state] ?? "";
assert(strlen($state) > 0);
echo "State: $state\n";
unset($state);
}
assert((function() {
$state = self::STATE_NAMES[$this->state] ?? $this->state;
$this->debugLog .= " State: $state\n";
return true;
})());
# 12.2.5.1 Data state
if ($this->state === self::DATA_STATE) {
@ -513,6 +538,7 @@ class Tokenizer {
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
assert(isset($token) && $token instanceof TagToken);
$this->sanitizeTag($token);
return $token;
}
# Uppercase ASCII letter
@ -639,6 +665,7 @@ class Tokenizer {
assert(isset($token) && $token instanceof Token);
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
return $token;
} else {
goto RCDATA_end_tag_name_state_anything_else;
@ -765,6 +792,7 @@ class Tokenizer {
assert(isset($token) && $token instanceof Token);
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
return $token;
} else {
goto RAWTEXT_end_tag_name_state_anything_else;
@ -898,6 +926,7 @@ class Tokenizer {
assert(isset($token) && $token instanceof Token);
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
return $token;
} else {
goto script_data_end_tag_name_state_anything_else;
@ -1211,6 +1240,7 @@ class Tokenizer {
assert(isset($token) && $token instanceof Token);
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
return $token;
} else {
goto script_data_escaped_end_tag_name_state_anything_else;
@ -1639,6 +1669,7 @@ class Tokenizer {
# Emit the current tag token.
$this->state = self::DATA_STATE;
assert(isset($token) && $token instanceof Token);
$this->sanitizeTag($token);
return $token;
}
# EOF
@ -1689,6 +1720,7 @@ class Tokenizer {
$this->error(ParseError::MISSING_ATTRIBUTE_VALUE);
$this->state = self::DATA_STATE;
assert(isset($token) && $token instanceof Token);
$this->sanitizeTag($token);
return $token;
}
# Anything else
@ -1818,6 +1850,7 @@ class Tokenizer {
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
assert(isset($token) && $token instanceof Token);
$this->sanitizeTag($token);
return $token;
}
# U+0000 NULL
@ -1880,6 +1913,7 @@ class Tokenizer {
# Emit the current tag token.
$this->state = self::DATA_STATE;
assert(isset($token) && $token instanceof Token);
$this->sanitizeTag($token);
return $token;
}
# EOF
@ -1912,6 +1946,7 @@ class Tokenizer {
assert(isset($token) && $token instanceof Token);
$token->selfClosing = true;
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
return $token;
}
# EOF

2
tests/cases/TestTokenizer.php

@ -39,7 +39,7 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest {
} while (!($t instanceof EOFToken));
array_pop($actual);
$actual = $this->combineCharacterTokens($actual);
$this->assertEquals($expected, $actual);
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
}
public function provideStandardTokenizerTests() {

Loading…
Cancel
Save