Browse Source

Rewrite how parse errors are handled

Everything which can emit a parse error should have the error handler
and data stream as properties and use the ParseErrorEmitter trait to
avoid complicating the task of actually producing an error.

Normally the Parser would be expected to set the error handler before it
begins (this commit does not do this) and unset it after it's done.
Alternatively, the entire means of reporting errors can now be easily
replaced.
ns
J. King 5 years ago
parent
commit
bb2a7b5a95
  1. 32
      lib/Data.php
  2. 50
      lib/ParseError.php
  3. 21
      lib/ParseErrorEmitter.php
  4. 167
      lib/Tokenizer.php
  5. 20
      tests/cases/TestTokenizer.php

32
lib/Data.php

@ -2,8 +2,9 @@
declare(strict_types=1); declare(strict_types=1);
namespace dW\HTML5; namespace dW\HTML5;
class Data class Data {
{ use ParseErrorEmitter;
// Used to get the file path for error reporting. // Used to get the file path for error reporting.
public $filePath; public $filePath;
@ -28,7 +29,8 @@ class Data
const WHITESPACE = "\t\n\x0c\x0d "; const WHITESPACE = "\t\n\x0c\x0d ";
public function __construct(string $data, string $filePath = 'STDIN') { public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null) {
$this->errorHandler = $errorHandler ?? new ParseError;
if ($filePath !== 'STDIN') { if ($filePath !== 'STDIN') {
$this->filePath = realpath($filePath); $this->filePath = realpath($filePath);
$data = file_get_contents($this->filePath); $data = file_get_contents($this->filePath);
@ -55,7 +57,7 @@ class Data
// Won't provide line or column counts for this as it's done before that // Won't provide line or column counts for this as it's done before that
// information is available. It will be rare that this is triggered. // information is available. It will be rare that this is triggered.
$data = preg_replace_callback('/(?:[\x01-\x08\x0B\x0E-\x1F\x7F]|\xC2[\x80-\x9F]|\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF])|\xEF\xB7[\x90-\xAF]|\xEF\xBF[\xBE\xBF]|[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF])/u', function($matches) { $data = preg_replace_callback('/(?:[\x01-\x08\x0B\x0E-\x1F\x7F]|\xC2[\x80-\x9F]|\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF])|\xEF\xB7[\x90-\xAF]|\xEF\xBF[\xBE\xBF]|[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF])/u', function($matches) {
ParseError::trigger(ParseError::INVALID_CONTROL_OR_NONCHARACTERS); $this->error(ParseError::INVALID_CONTROL_OR_NONCHARACTERS);
return ''; return '';
}, $data); }, $data);
@ -197,7 +199,7 @@ class Data
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned. # character). This is a parse error; nothing is returned.
if (!$number) { if (!$number) {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit'); $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
$this->unconsume(2); $this->unconsume(2);
return '&'; return '&';
} }
@ -211,9 +213,9 @@ class Data
if (!$number) { if (!$number) {
$peek = $this->peek(); $peek = $this->peek();
if ($peek !== '') { if ($peek !== '') {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit'); $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit');
} else { } else {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} }
$this->unconsume(); $this->unconsume();
@ -227,9 +229,9 @@ class Data
if ($char === ';') { if ($char === ';') {
$this->consume(); $this->consume();
} elseif ($char === '') { } elseif ($char === '') {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} else { } else {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator'); $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
} }
# If one or more characters match the range, then take them all and interpret the # If one or more characters match the range, then take them all and interpret the
@ -328,7 +330,7 @@ class Data
} }
if ($returnValue) { if ($returnValue) {
ParseError::trigger(ParseError::INVALID_NUMERIC_ENTITY, $number); $this->error(ParseError::INVALID_NUMERIC_ENTITY, $number);
// Consume the ampersand but return the value instead. // Consume the ampersand but return the value instead.
$this->consume(); $this->consume();
return $returnValue; return $returnValue;
@ -338,7 +340,7 @@ class Data
# 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER # 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
# character token. # character token.
if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) { if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) {
ParseError::trigger(ParseError::INVALID_CODEPOINT, $number); $this->error(ParseError::INVALID_CODEPOINT, $number);
return '�'; return '�';
} }
@ -359,7 +361,7 @@ class Data
$number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE || $number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE ||
$number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE || $number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE ||
$number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) { $number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) {
ParseError::trigger(ParseError::INVALID_CODEPOINT, $number); $this->error(ParseError::INVALID_CODEPOINT, $number);
// Consume the ampersand. // Consume the ampersand.
$this->consume(); $this->consume();
return '&'; return '&';
@ -403,7 +405,7 @@ class Data
$next = $this->peek(); $next = $this->peek();
if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) { if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) {
if ($next === '=') { if ($next === '=') {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator'); $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
} }
// Consume the ampersand. // Consume the ampersand.
@ -419,7 +421,7 @@ class Data
// Used for PHP's entity decoder. Described below. // Used for PHP's entity decoder. Described below.
$sequence.=';'; $sequence.=';';
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator'); $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
} }
# Return one or two character tokens for the character(s) corresponding to the # Return one or two character tokens for the character(s) corresponding to the
@ -437,7 +439,7 @@ class Data
# (&) consist of a sequence of one or more alphanumeric ASCII characters followed # (&) consist of a sequence of one or more alphanumeric ASCII characters followed
# by a U+003B SEMICOLON character (;), then this is a parse error. # by a U+003B SEMICOLON character (;), then this is a parse error.
if (preg_match('/^[A-Za-z0-9]+;/', $char)) { if (preg_match('/^[A-Za-z0-9]+;/', $char)) {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $char); $this->error(ParseError::INVALID_NAMED_ENTITY, $char);
} }
// Consume the ampersand. // Consume the ampersand.

50
lib/ParseError.php

@ -39,49 +39,32 @@ class ParseError {
self::INVALID_CODEPOINT => '"%s" is an invalid character codepoint' self::INVALID_CODEPOINT => '"%s" is an invalid character codepoint'
]; ];
public function __construct(Data $data) { public function setHandler() {
$this->data = $data;
// Set the error handler and honor already-set error reporting rules. // Set the error handler and honor already-set error reporting rules.
set_error_handler([$this, 'errorHandler'], error_reporting()); set_error_handler([$this, 'errorHandler'], error_reporting());
} }
public function __destruct() { public function clearHandler() {
restore_error_handler(); restore_error_handler();
} }
public function errorHandler(int $code, string $message, string $file, int $line) { protected function prepareMessage(string $file, int $line, int $column, int $code, ...$arg): string {
if ($code === E_USER_WARNING) {
$errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, $this->data->filePath);
if ($this->data->length !== 0) {
$errMsg .= sprintf(" on line %s, column %s\n", $this->data->line, $this->data->column);
} else {
$errMsg .= "\n";
}
echo $errMsg;
}
}
public static function trigger(int $code, ...$args): bool {
if (!isset(static::$messages[$code])) { if (!isset(static::$messages[$code])) {
throw new Exception(Exception::INVALID_CODE); throw new Exception(Exception::INVALID_CODE);
} }
$message = static::$messages[$code]; $message = static::$messages[$code];
// Count the number of replacements needed in the message. // Count the number of replacements needed in the message.
$count = substr_count($message, '%s'); $count = substr_count($message, '%s');
// If the number of replacements don't match the arguments then oops. // If the number of replacements don't match the arguments then oops.
if (count($args) !== $count) { if (count($arg) !== $count) {
throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count); throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
} }
if ($count > 0) { if ($count > 0) {
// Convert newlines and tabs in the arguments to words to better express what they // Convert newlines and tabs in the arguments to words to better express what they
// are. // are.
$args = array_map(function($value) { $arg = array_map(function($value) {
if ($value === "\n") { if ($value === "\n") {
return 'Newline'; return 'Newline';
} elseif ($value === "\t") { } elseif ($value === "\t") {
@ -91,12 +74,27 @@ class ParseError {
} else { } else {
return $value; return $value;
} }
}, $args); }, $arg);
// Go through each of the arguments and run sprintf on the strings. // Go through each of the arguments and run sprintf on the strings.
$message = call_user_func_array('sprintf', array_merge([$message], $args)); $message = sprintf($message, ...$arg);
}
// Wrap with preamble and location
// TODO: the file path should be middle-elided when necessary so that the message does not exceed 1024 bytes
$message = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, $file);
if ($line) {
$message .= sprintf(" on line %s, column %s", $line, $column);
}
return $message;
}
public function emit(string $file, int $line, int $column, int $code, ...$arg): bool {
return trigger_error($this->prepareMessage($file, $line, $column, $code, ...$arg), \E_USER_WARNING);
}
public function errorHandler(int $code, string $message, string $file, int $line) {
if ($code === E_USER_WARNING) {
echo "$message\n";
} }
$output = trigger_error($message, E_USER_WARNING);
return $output;
} }
} }

21
lib/ParseErrorEmitter.php

@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
trait ParseErrorEmitter {
    /**
     * Handler which receives every parse error reported through error()
     *
     * @var ParseError $errorHandler
     */
    private $errorHandler;

    /**
     * Reports a parse error to the configured error handler
     *
     * The location of the error (file path, line and column) is read from
     * the data stream: the object itself when it is a Data instance,
     * otherwise whatever its "data" property holds.
     *
     * @param int $code The code identifying the parse error; passed through to the handler
     * @param mixed ...$arg Extra arguments forwarded unchanged to the handler
     * @return bool The value returned by the handler's emit() method
     * @throws \Exception If no error handler is set, or if no data stream is available
     */
    private function error(int $code, ...$arg): bool {
        // Work out where the position information comes from before anything else
        if ($this instanceof Data) {
            $stream = $this;
        } else {
            $stream = $this->data ?? null;
        }
        // A missing handler takes precedence over a missing data stream
        if (!$this->errorHandler) {
            throw new \Exception("Emitted error without error handler");
        }
        if (!$stream) {
            throw new \Exception("Emitted parse error without data stream");
        }
        return $this->errorHandler->emit($stream->filePath, $stream->line, $stream->column, $code, ...$arg);
    }
}

167
lib/Tokenizer.php

@ -3,6 +3,8 @@ declare(strict_types=1);
namespace dW\HTML5; namespace dW\HTML5;
class Tokenizer { class Tokenizer {
use ParseErrorEmitter;
public $state; public $state;
protected $data; protected $data;
@ -148,10 +150,11 @@ class Tokenizer {
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
public function __construct(Data $data, OpenElementsStack $stack) { public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data = $data; $this->data = $data;
$this->stack = $stack; $this->stack = $stack;
$this->errorHandler = $errorHandler;
} }
public function createToken(): Token { public function createToken(): Token {
@ -369,9 +372,9 @@ class Tokenizer {
// Making errors more expressive. // Making errors more expressive.
if ($char !== '') { if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED); $this->error(ParseError::TAG_NAME_EXPECTED);
} else { } else {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} }
$this->state = self::BOGUS_COMMENT_STATE; $this->state = self::BOGUS_COMMENT_STATE;
@ -383,9 +386,9 @@ class Tokenizer {
// Making errors more expressive. // Making errors more expressive.
if ($char !== '') { if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED); $this->error(ParseError::TAG_NAME_EXPECTED);
} else { } else {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} }
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
@ -423,7 +426,7 @@ class Tokenizer {
# ">" (U+003E) # ">" (U+003E)
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Switch to the data state. # Parse error. Switch to the data state.
ParseError::trigger(ParseError::TAG_NAME_EXPECTED); $this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
} }
# EOF # EOF
@ -431,7 +434,7 @@ class Tokenizer {
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character # Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
# token and a U+002F SOLIDUS character token. Reconsume the EOF character. # token and a U+002F SOLIDUS character token. Reconsume the EOF character.
// Making errors more expressive. // Making errors more expressive.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return new CharacterToken('</'); return new CharacterToken('</');
@ -439,7 +442,7 @@ class Tokenizer {
# Anything else # Anything else
else { else {
# Parse error. Switch to the bogus comment state. # Parse error. Switch to the bogus comment state.
ParseError::trigger(ParseError::TAG_NAME_EXPECTED); $this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::BOGUS_COMMENT_STATE; $this->state = self::BOGUS_COMMENT_STATE;
} }
@ -485,9 +488,9 @@ class Tokenizer {
// Making errors more expressive. // Making errors more expressive.
if ($char !== '') { if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $char); $this->error(ParseError::TAG_NAME_EXPECTED, $char);
} else { } else {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} }
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
@ -1012,7 +1015,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character. # Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume(); $this->data->unconsume();
} }
# Anything else # Anything else
@ -1047,7 +1050,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character. # Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume(); $this->data->unconsume();
} }
# Anything else # Anything else
@ -1087,7 +1090,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character. # Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume(); $this->data->unconsume();
} }
# Anything else # Anything else
@ -1296,7 +1299,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1429,7 +1432,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1447,7 +1450,7 @@ class Tokenizer {
# attribute name state. # attribute name state.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') { if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} }
// Need to add the current attribute to the token, if necessary. // Need to add the current attribute to the token, if necessary.
@ -1477,7 +1480,7 @@ class Tokenizer {
# EOF # EOF
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') { if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') {
if ($token->hasAttribute($attribute->name)) { if ($token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name); $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
} }
# Reconsume in the after attribute name state. # Reconsume in the after attribute name state.
@ -1487,7 +1490,7 @@ class Tokenizer {
# "=" (U+003D) # "=" (U+003D)
elseif ($char === '=') { elseif ($char === '=') {
if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) { if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name); $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
} }
# Switch to the before attribute value state. # Switch to the before attribute value state.
@ -1513,7 +1516,7 @@ class Tokenizer {
# Append the current input character to the current attribute's name. # Append the current input character to the current attribute's name.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') { if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} }
// OPTIMIZATION: Will just check for alpha characters and strtolower the // OPTIMIZATION: Will just check for alpha characters and strtolower the
@ -1588,7 +1591,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1606,7 +1609,7 @@ class Tokenizer {
# attribute name state. # attribute name state.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') { if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} }
// Need to add the current attribute to the token, if necessary. // Need to add the current attribute to the token, if necessary.
@ -1654,7 +1657,7 @@ class Tokenizer {
# ">" (U+003E) # ">" (U+003E)
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the current tag token. # Parse error. Switch to the data state. Emit the current tag token.
ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG); $this->error(ParseError::UNEXPECTED_END_OF_TAG);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
// Need to add the current attribute to the token, if necessary. // Need to add the current attribute to the token, if necessary.
@ -1668,7 +1671,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1684,7 +1687,7 @@ class Tokenizer {
# the attribute value (unquoted) state. # the attribute value (unquoted) state.
if ($char === '<' || $char === '=' || $char === '`') { if ($char === '<' || $char === '=' || $char === '`') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} }
$attribute->value .= $char; $attribute->value .= $char;
@ -1723,7 +1726,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1768,7 +1771,7 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1832,7 +1835,7 @@ class Tokenizer {
} }
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
elseif ($char === '') { elseif ($char === '') {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1849,7 +1852,7 @@ class Tokenizer {
# Append the current input character to the current attribute's value. # Append the current input character to the current attribute's value.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') { if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} }
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
@ -1894,14 +1897,14 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
# Anything else # Anything else
else { else {
# Parse error. Switch to the before attribute name state. Reconsume the character. # Parse error. Switch to the before attribute name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE; $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -1932,14 +1935,14 @@ class Tokenizer {
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character. # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
# Anything else # Anything else
else { else {
# Parse error. Switch to the before attribute name state. Reconsume the character. # Parse error. Switch to the before attribute name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE; $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -2007,9 +2010,9 @@ class Tokenizer {
else { else {
$char = $this->data->consume(); $char = $this->data->consume();
if ($char !== '') { if ($char !== '') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} else { } else {
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
} }
$this->state = self::BOGUS_COMMENT_STATE; $this->state = self::BOGUS_COMMENT_STATE;
@ -2032,7 +2035,7 @@ class Tokenizer {
# ">" (U+003E) # ">" (U+003E)
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the comment token. # Parse error. Switch to the data state. Emit the comment token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
} }
@ -2040,7 +2043,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2069,7 +2072,7 @@ class Tokenizer {
# ">" (U+003E) # ">" (U+003E)
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the comment token. # Parse error. Switch to the data state. Emit the comment token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
} }
@ -2077,7 +2080,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2107,7 +2110,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2138,7 +2141,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2168,7 +2171,7 @@ class Tokenizer {
# "!" (U+0021) # "!" (U+0021)
elseif ($char === '!') { elseif ($char === '!') {
# Parse error. Switch to the comment end bang state. # Parse error. Switch to the comment end bang state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '!'); $this->error(ParseError::UNEXPECTED_CHARACTER, '!');
$this->state = self::COMMENT_END_BANG_STATE; $this->state = self::COMMENT_END_BANG_STATE;
} }
# "-" (U+002D) # "-" (U+002D)
@ -2179,7 +2182,7 @@ class Tokenizer {
// here every single time. // here every single time.
$char .= $this->data->consumeWhile('-'); $char .= $this->data->consumeWhile('-');
for ($i = 0; $i < strlen($char); $i++) { for ($i = 0; $i < strlen($char); $i++) {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '-'); $this->error(ParseError::UNEXPECTED_CHARACTER, '-');
} }
$token->data .= $char; $token->data .= $char;
@ -2188,7 +2191,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2197,7 +2200,7 @@ class Tokenizer {
else { else {
# Parse error. Append two "-" (U+002D) characters and the current input character # Parse error. Append two "-" (U+002D) characters and the current input character
# to the comment token's data. Switch to the comment state. # to the comment token's data. Switch to the comment state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->data .= '--'.$char; $token->data .= '--'.$char;
$this->state = self::COMMENT_STATE; $this->state = self::COMMENT_STATE;
} }
@ -2227,7 +2230,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character. # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data->unconsume(); $this->data->unconsume();
return $token; return $token;
@ -2264,7 +2267,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Create a new DOCTYPE token. Set its # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
# force-quirks flag to on. Emit the token. Reconsume the EOF character. # force-quirks flag to on. Emit the token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token = new DOCTYPEToken(); $token = new DOCTYPEToken();
$token->forceQuirks = true; $token->forceQuirks = true;
@ -2274,7 +2277,7 @@ class Tokenizer {
# Anything else # Anything else
else { else {
# Parse error. Switch to the before DOCTYPE name state. Reconsume the character. # Parse error. Switch to the before DOCTYPE name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::DOCTYPE_NAME_STATE; $this->state = self::DOCTYPE_NAME_STATE;
$this->data->unconsume(); $this->data->unconsume();
} }
@ -2306,7 +2309,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch # Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch
# to the data state. Emit the token. # to the data state. Emit the token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token = new DOCTYPEToken(); $token = new DOCTYPEToken();
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
@ -2316,7 +2319,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Create a new DOCTYPE token. Set its # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
# force-quirks flag to on. Emit the token. Reconsume the EOF character. # force-quirks flag to on. Emit the token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token = new DOCTYPEToken(); $token = new DOCTYPEToken();
$token->forceQuirks = true; $token->forceQuirks = true;
@ -2368,7 +2371,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2409,7 +2412,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2436,7 +2439,7 @@ class Tokenizer {
else { else {
// Need to unconsume what was consumed earlier. // Need to unconsume what was consumed earlier.
$this->data->unconsume(5); $this->data->unconsume(5);
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char[0]); $this->error(ParseError::UNEXPECTED_CHARACTER, $char[0]);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2462,7 +2465,7 @@ class Tokenizer {
elseif ($char === '"') { elseif ($char === '"') {
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not # Parse error. Set the DOCTYPE token's public identifier to the empty string (not
# missing), then switch to the DOCTYPE public identifier (double-quoted) state. # missing), then switch to the DOCTYPE public identifier (double-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '"'); $this->error(ParseError::UNEXPECTED_CHARACTER, '"');
$token->public = ''; $token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; $this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
} }
@ -2470,7 +2473,7 @@ class Tokenizer {
elseif ($char === "'") { elseif ($char === "'") {
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not # Parse error. Set the DOCTYPE token's public identifier to the empty string (not
# missing), then switch to the DOCTYPE public identifier (single-quoted) state. # missing), then switch to the DOCTYPE public identifier (single-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, "'"); $this->error(ParseError::UNEXPECTED_CHARACTER, "'");
$token->public = ''; $token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; $this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
} }
@ -2478,7 +2481,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
@ -2487,7 +2490,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2497,7 +2500,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2535,7 +2538,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
@ -2544,7 +2547,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2554,7 +2557,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2576,7 +2579,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
@ -2585,7 +2588,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2617,7 +2620,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
} }
@ -2625,7 +2628,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2680,7 +2683,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2690,7 +2693,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2734,7 +2737,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, 'DOCTYPE public identifier'); $this->error(ParseError::UNEXPECTED_EOF, 'DOCTYPE public identifier');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2744,7 +2747,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'DOCTYPE public identifier'); $this->error(ParseError::UNEXPECTED_CHARACTER, $char, 'DOCTYPE public identifier');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2769,7 +2772,7 @@ class Tokenizer {
elseif ($char === '"') { elseif ($char === '"') {
# Parse error. Set the DOCTYPE token's system identifier to the empty string (not # Parse error. Set the DOCTYPE token's system identifier to the empty string (not
# missing), then switch to the DOCTYPE system identifier (double-quoted) state. # missing), then switch to the DOCTYPE system identifier (double-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '"'); $this->error(ParseError::UNEXPECTED_CHARACTER, '"');
$token->system = ''; $token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
} }
@ -2777,7 +2780,7 @@ class Tokenizer {
elseif ($char === "'") { elseif ($char === "'") {
# Parse error. Set the DOCTYPE token's system identifier to the empty string (not # Parse error. Set the DOCTYPE token's system identifier to the empty string (not
# missing), then switch to the DOCTYPE system identifier (single-quoted) state. # missing), then switch to the DOCTYPE system identifier (single-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, "'"); $this->error(ParseError::UNEXPECTED_CHARACTER, "'");
$token->system = ''; $token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
} }
@ -2785,7 +2788,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
@ -2794,7 +2797,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2804,7 +2807,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2842,7 +2845,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
@ -2851,7 +2854,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2861,7 +2864,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }
@ -2883,7 +2886,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
} }
@ -2891,7 +2894,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2923,7 +2926,7 @@ class Tokenizer {
elseif ($char === '>') { elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token. # state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>'); $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
return $token; return $token;
} }
@ -2931,7 +2934,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2986,7 +2989,7 @@ class Tokenizer {
elseif ($char === '') { elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character. # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF); $this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$token->forceQuirks = true; $token->forceQuirks = true;
$this->data->unconsume(); $this->data->unconsume();
@ -2996,7 +2999,7 @@ class Tokenizer {
else { else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state. # bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char); $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true; $token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE; $this->state = self::BOGUS_DOCTYPE_STATE;
} }

20
tests/cases/TestTokenizer.php

@ -5,6 +5,7 @@ namespace dW\HTML5\TestCase;
use dW\HTML5\Data; use dW\HTML5\Data;
use dW\HTML5\EOFToken; use dW\HTML5\EOFToken;
use dW\HTML5\OpenElementsStack; use dW\HTML5\OpenElementsStack;
use dW\HTML5\ParseError;
use dW\HTML5\Tokenizer; use dW\HTML5\Tokenizer;
class TestTokenizer extends \dW\HTML5\Test\StandardTest { class TestTokenizer extends \dW\HTML5\Test\StandardTest {
@ -18,18 +19,25 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest {
} }
/** @dataProvider provideStandardTokenizerTests */ /** @dataProvider provideStandardTokenizerTests */
public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $errors) { public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $errors) {
$data = new Data($input);
$stack = new OpenElementsStack(); $stack = new OpenElementsStack();
$errorHandler = new ParseError;
if ($open) { if ($open) {
$stack[] = (new \DOMDocument)->createElement($open); $stack[] = (new \DOMDocument)->createElement($open);
} }
$tokenizer = new Tokenizer($data, $stack); $errorHandler = new ParseError;
$errorHandler->setHandler();
$data = new Data($input, "STDIN", $errorHandler);
$tokenizer = new Tokenizer($data, $stack, $errorHandler);
$tokenizer->state = $state; $tokenizer->state = $state;
$actual = []; $actual = [];
do { try {
$t = $tokenizer->createToken(); do {
$actual[] = $t; $t = $tokenizer->createToken();
} while (!($t instanceof EOFToken)); $actual[] = $t;
} while (!($t instanceof EOFToken));
} finally {
$errorHandler->clearHandler();
}
array_pop($actual); array_pop($actual);
$actual = $this->combineCharacterTokens($actual); $actual = $this->combineCharacterTokens($actual);
$this->assertEquals($expected, $actual); $this->assertEquals($expected, $actual);

Loading…
Cancel
Save