Browse Source

Rewrite how parse errors are handled

Everything which can emit a parse error should have the error handler
and data stream as properties and use the ParseErrorEmitter trait to
avoid complicating the task of actually producing an error.

Normally the Parser would be expected to set the error handler before it
begins (this commit does not do this) and unset it after it's done.
Alternatively, the entire means of reporting errors can now be easily
replaced.
ns
J. King 5 years ago
parent
commit
bb2a7b5a95
  1. 32
      lib/Data.php
  2. 50
      lib/ParseError.php
  3. 21
      lib/ParseErrorEmitter.php
  4. 167
      lib/Tokenizer.php
  5. 20
      tests/cases/TestTokenizer.php

32
lib/Data.php

@ -2,8 +2,9 @@
declare(strict_types=1);
namespace dW\HTML5;
class Data
{
class Data {
use ParseErrorEmitter;
// Used to get the file path for error reporting.
public $filePath;
@ -28,7 +29,8 @@ class Data
const WHITESPACE = "\t\n\x0c\x0d ";
public function __construct(string $data, string $filePath = 'STDIN') {
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null) {
$this->errorHandler = $errorHandler ?? new ParseError;
if ($filePath !== 'STDIN') {
$this->filePath = realpath($filePath);
$data = file_get_contents($this->filePath);
@ -55,7 +57,7 @@ class Data
// Won't provide line or column counts for this as it's done before that
// information is available. It will be rare that this is triggered.
$data = preg_replace_callback('/(?:[\x01-\x08\x0B\x0E-\x1F\x7F]|\xC2[\x80-\x9F]|\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF])|\xEF\xB7[\x90-\xAF]|\xEF\xBF[\xBE\xBF]|[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF])/u', function($matches) {
ParseError::trigger(ParseError::INVALID_CONTROL_OR_NONCHARACTERS);
$this->error(ParseError::INVALID_CONTROL_OR_NONCHARACTERS);
return '';
}, $data);
@ -197,7 +199,7 @@ class Data
# unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
# character). This is a parse error; nothing is returned.
if (!$number) {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
$this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
$this->unconsume(2);
return '&';
}
@ -211,9 +213,9 @@ class Data
if (!$number) {
$peek = $this->peek();
if ($peek !== '') {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit');
$this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'decimal digit');
} else {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->unconsume();
@ -227,9 +229,9 @@ class Data
if ($char === ';') {
$this->consume();
} elseif ($char === '') {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
} else {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
$this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
}
# If one or more characters match the range, then take them all and interpret the
@ -328,7 +330,7 @@ class Data
}
if ($returnValue) {
ParseError::trigger(ParseError::INVALID_NUMERIC_ENTITY, $number);
$this->error(ParseError::INVALID_NUMERIC_ENTITY, $number);
// Consume the ampersand but return the value instead.
$this->consume();
return $returnValue;
@ -338,7 +340,7 @@ class Data
# 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
# character token.
if (($number >= 0xD800 && $number <= 0xDFFF) || $number > 0x10FFFF) {
ParseError::trigger(ParseError::INVALID_CODEPOINT, $number);
$this->error(ParseError::INVALID_CODEPOINT, $number);
return '�';
}
@ -359,7 +361,7 @@ class Data
$number === 0xBFFFF || $number === 0xCFFFE || $number === 0xCFFFF || $number === 0xDFFFE ||
$number === 0xDFFFF || $number === 0xEFFFE || $number === 0xEFFFF || $number === 0xFFFFE ||
$number === 0xFFFFF || $number === 0x10FFFE || $number === 0x10FFFF) {
ParseError::trigger(ParseError::INVALID_CODEPOINT, $number);
$this->error(ParseError::INVALID_CODEPOINT, $number);
// Consume the ampersand.
$this->consume();
return '&';
@ -403,7 +405,7 @@ class Data
$next = $this->peek();
if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) {
if ($next === '=') {
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
$this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
}
// Consume the ampersand.
@ -419,7 +421,7 @@ class Data
// Used for PHP's entity decoder. Described below.
$sequence.=';';
ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
$this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
}
# Return one or two character tokens for the character(s) corresponding to the
@ -437,7 +439,7 @@ class Data
# (&) consist of a sequence of one or more alphanumeric ASCII characters followed
# by a U+003B SEMICOLON character (;), then this is a parse error.
if (preg_match('/^[A-Za-z0-9]+;/', $char)) {
ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $char);
$this->error(ParseError::INVALID_NAMED_ENTITY, $char);
}
// Consume the ampersand.

50
lib/ParseError.php

@ -39,49 +39,32 @@ class ParseError {
self::INVALID_CODEPOINT => '"%s" is an invalid character codepoint'
];
public function __construct(Data $data) {
$this->data = $data;
public function setHandler() {
// Set the error handler and honor already-set error reporting rules.
set_error_handler([$this, 'errorHandler'], error_reporting());
}
public function __destruct() {
public function clearHandler() {
restore_error_handler();
}
public function errorHandler(int $code, string $message, string $file, int $line) {
if ($code === E_USER_WARNING) {
$errMsg = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, $this->data->filePath);
if ($this->data->length !== 0) {
$errMsg .= sprintf(" on line %s, column %s\n", $this->data->line, $this->data->column);
} else {
$errMsg .= "\n";
}
echo $errMsg;
}
}
public static function trigger(int $code, ...$args): bool {
protected function prepareMessage(string $file, int $line, int $column, int $code, ...$arg): string {
if (!isset(static::$messages[$code])) {
throw new Exception(Exception::INVALID_CODE);
}
$message = static::$messages[$code];
// Count the number of replacements needed in the message.
$count = substr_count($message, '%s');
// If the number of replacements don't match the arguments then oops.
if (count($args) !== $count) {
if (count($arg) !== $count) {
throw new Exception(Exception::INCORRECT_PARAMETERS_FOR_MESSAGE, $count);
}
if ($count > 0) {
// Convert newlines and tabs in the arguments to words to better express what they
// are.
$args = array_map(function($value) {
$arg = array_map(function($value) {
if ($value === "\n") {
return 'Newline';
} elseif ($value === "\t") {
@ -91,12 +74,27 @@ class ParseError {
} else {
return $value;
}
}, $args);
}, $arg);
// Go through each of the arguments and run sprintf on the strings.
$message = call_user_func_array('sprintf', array_merge([$message], $args));
$message = sprintf($message, ...$arg);
}
// Wrap with preamble and location
// TODO: the file path should be middle-elided when necessary so that the message does not exceed 1024 bytes
$message = sprintf("HTML5 Parse Error: \"%s\" in %s", $message, $file);
if ($line) {
$message .= sprintf(" on line %s, column %s", $line, $column);
}
return $message;
}
public function emit(string $file, int $line, int $column, int $code, ...$arg): bool {
return trigger_error($this->prepareMessage($file, $line, $column, $code, ...$arg), \E_USER_WARNING);
}
public function errorHandler(int $code, string $message, string $file, int $line) {
if ($code === E_USER_WARNING) {
echo "$message\n";
}
$output = trigger_error($message, E_USER_WARNING);
return $output;
}
}

21
lib/ParseErrorEmitter.php

@ -0,0 +1,21 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
trait ParseErrorEmitter {
/** @var ParseError $errorHandler */
private $errorHandler;
private function error(int $code, ...$arg): bool {
$data = ($this instanceof Data) ? $this : ($this->data ?? null);
if ($this->errorHandler) {
if ($data) {
return $this->errorHandler->emit($data->filePath, $data->line, $data->column, $code, ...$arg);
} else {
throw new \Exception("Emitted parse error without data stream");
}
} else {
throw new \Exception("Emitted error without error handler");
}
}
}

167
lib/Tokenizer.php

@ -3,6 +3,8 @@ declare(strict_types=1);
namespace dW\HTML5;
class Tokenizer {
use ParseErrorEmitter;
public $state;
protected $data;
@ -148,10 +150,11 @@ class Tokenizer {
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
public function __construct(Data $data, OpenElementsStack $stack) {
public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
$this->state = self::DATA_STATE;
$this->data = $data;
$this->stack = $stack;
$this->errorHandler = $errorHandler;
}
public function createToken(): Token {
@ -369,9 +372,9 @@ class Tokenizer {
// Making errors more expressive.
if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED);
$this->error(ParseError::TAG_NAME_EXPECTED);
} else {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::BOGUS_COMMENT_STATE;
@ -383,9 +386,9 @@ class Tokenizer {
// Making errors more expressive.
if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED);
$this->error(ParseError::TAG_NAME_EXPECTED);
} else {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::DATA_STATE;
@ -423,7 +426,7 @@ class Tokenizer {
# ">" (U+003E)
elseif ($char === '>') {
# Parse error. Switch to the data state.
ParseError::trigger(ParseError::TAG_NAME_EXPECTED);
$this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::DATA_STATE;
}
# EOF
@ -431,7 +434,7 @@ class Tokenizer {
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
# token and a U+002F SOLIDUS character token. Reconsume the EOF character.
// Making errors more expressive.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return new CharacterToken('</');
@ -439,7 +442,7 @@ class Tokenizer {
# Anything else
else {
# Parse error. Switch to the bogus comment state.
ParseError::trigger(ParseError::TAG_NAME_EXPECTED);
$this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::BOGUS_COMMENT_STATE;
}
@ -485,9 +488,9 @@ class Tokenizer {
// Making errors more expressive.
if ($char !== '') {
ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $char);
$this->error(ParseError::TAG_NAME_EXPECTED, $char);
} else {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::DATA_STATE;
@ -1012,7 +1015,7 @@ class Tokenizer {
elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume();
}
# Anything else
@ -1047,7 +1050,7 @@ class Tokenizer {
elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume();
}
# Anything else
@ -1087,7 +1090,7 @@ class Tokenizer {
elseif ($char === '') {
# Switch to the data state. Parse error. Reconsume the EOF character.
$this->state = self::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->data->unconsume();
}
# Anything else
@ -1296,7 +1299,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1429,7 +1432,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1447,7 +1450,7 @@ class Tokenizer {
# attribute name state.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
}
// Need to add the current attribute to the token, if necessary.
@ -1477,7 +1480,7 @@ class Tokenizer {
# EOF
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') {
if ($token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
$this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
}
# Reconsume in the after attribute name state.
@ -1487,7 +1490,7 @@ class Tokenizer {
# "=" (U+003D)
elseif ($char === '=') {
if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
$this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
}
# Switch to the before attribute value state.
@ -1513,7 +1516,7 @@ class Tokenizer {
# Append the current input character to the current attribute's name.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
}
// OPTIMIZATION: Will just check for alpha characters and strtolower the
@ -1588,7 +1591,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1606,7 +1609,7 @@ class Tokenizer {
# attribute name state.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
}
// Need to add the current attribute to the token, if necessary.
@ -1654,7 +1657,7 @@ class Tokenizer {
# ">" (U+003E)
elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the current tag token.
ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG);
$this->error(ParseError::UNEXPECTED_END_OF_TAG);
$this->state = self::DATA_STATE;
// Need to add the current attribute to the token, if necessary.
@ -1668,7 +1671,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1684,7 +1687,7 @@ class Tokenizer {
# the attribute value (unquoted) state.
if ($char === '<' || $char === '=' || $char === '`') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
}
$attribute->value .= $char;
@ -1723,7 +1726,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1768,7 +1771,7 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1832,7 +1835,7 @@ class Tokenizer {
}
# Parse error. Switch to the data state. Reconsume the EOF character.
elseif ($char === '') {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
@ -1849,7 +1852,7 @@ class Tokenizer {
# Append the current input character to the current attribute's value.
if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
}
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
@ -1894,14 +1897,14 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
# Anything else
else {
# Parse error. Switch to the before attribute name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume();
}
@ -1932,14 +1935,14 @@ class Tokenizer {
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
# Anything else
else {
# Parse error. Switch to the before attribute name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume();
}
@ -2007,9 +2010,9 @@ class Tokenizer {
else {
$char = $this->data->consume();
if ($char !== '') {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
} else {
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::BOGUS_COMMENT_STATE;
@ -2032,7 +2035,7 @@ class Tokenizer {
# ">" (U+003E)
elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the comment token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE;
return $token;
}
@ -2040,7 +2043,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2069,7 +2072,7 @@ class Tokenizer {
# ">" (U+003E)
elseif ($char === '>') {
# Parse error. Switch to the data state. Emit the comment token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE;
return $token;
}
@ -2077,7 +2080,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2107,7 +2110,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2138,7 +2141,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2168,7 +2171,7 @@ class Tokenizer {
# "!" (U+0021)
elseif ($char === '!') {
# Parse error. Switch to the comment end bang state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '!');
$this->error(ParseError::UNEXPECTED_CHARACTER, '!');
$this->state = self::COMMENT_END_BANG_STATE;
}
# "-" (U+002D)
@ -2179,7 +2182,7 @@ class Tokenizer {
// here every single time.
$char .= $this->data->consumeWhile('-');
for ($i = 0; $i < strlen($char); $i++) {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '-');
$this->error(ParseError::UNEXPECTED_CHARACTER, '-');
}
$token->data .= $char;
@ -2188,7 +2191,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2197,7 +2200,7 @@ class Tokenizer {
else {
# Parse error. Append two "-" (U+002D) characters and the current input character
# to the comment token's data. Switch to the comment state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->data .= '--'.$char;
$this->state = self::COMMENT_STATE;
}
@ -2227,7 +2230,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
# character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return $token;
@ -2264,7 +2267,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
# force-quirks flag to on. Emit the token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token = new DOCTYPEToken();
$token->forceQuirks = true;
@ -2274,7 +2277,7 @@ class Tokenizer {
# Anything else
else {
# Parse error. Switch to the before DOCTYPE name state. Reconsume the character.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$this->state = self::DOCTYPE_NAME_STATE;
$this->data->unconsume();
}
@ -2306,7 +2309,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch
# to the data state. Emit the token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token = new DOCTYPEToken();
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
@ -2316,7 +2319,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
# force-quirks flag to on. Emit the token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token = new DOCTYPEToken();
$token->forceQuirks = true;
@ -2368,7 +2371,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2409,7 +2412,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2436,7 +2439,7 @@ class Tokenizer {
else {
// Need to unconsume what was consumed earlier.
$this->data->unconsume(5);
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char[0]);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char[0]);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2462,7 +2465,7 @@ class Tokenizer {
elseif ($char === '"') {
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not
# missing), then switch to the DOCTYPE public identifier (double-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '"');
$this->error(ParseError::UNEXPECTED_CHARACTER, '"');
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
@ -2470,7 +2473,7 @@ class Tokenizer {
elseif ($char === "'") {
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not
# missing), then switch to the DOCTYPE public identifier (single-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, "'");
$this->error(ParseError::UNEXPECTED_CHARACTER, "'");
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
}
@ -2478,7 +2481,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
return $token;
@ -2487,7 +2490,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2497,7 +2500,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2535,7 +2538,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
return $token;
@ -2544,7 +2547,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2554,7 +2557,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2576,7 +2579,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
return $token;
@ -2585,7 +2588,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2617,7 +2620,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE;
return $token;
}
@ -2625,7 +2628,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2680,7 +2683,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2690,7 +2693,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2734,7 +2737,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, 'DOCTYPE public identifier');
$this->error(ParseError::UNEXPECTED_EOF, 'DOCTYPE public identifier');
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2744,7 +2747,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'DOCTYPE public identifier');
$this->error(ParseError::UNEXPECTED_CHARACTER, $char, 'DOCTYPE public identifier');
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2769,7 +2772,7 @@ class Tokenizer {
elseif ($char === '"') {
# Parse error. Set the DOCTYPE token's system identifier to the empty string (not
# missing), then switch to the DOCTYPE system identifier (double-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '"');
$this->error(ParseError::UNEXPECTED_CHARACTER, '"');
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
@ -2777,7 +2780,7 @@ class Tokenizer {
elseif ($char === "'") {
# Parse error. Set the DOCTYPE token's system identifier to the empty string (not
# missing), then switch to the DOCTYPE system identifier (single-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, "'");
$this->error(ParseError::UNEXPECTED_CHARACTER, "'");
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
}
@ -2785,7 +2788,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
return $token;
@ -2794,7 +2797,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2804,7 +2807,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2842,7 +2845,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
return $token;
@ -2851,7 +2854,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2861,7 +2864,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}
@ -2883,7 +2886,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE;
return $token;
}
@ -2891,7 +2894,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2923,7 +2926,7 @@ class Tokenizer {
elseif ($char === '>') {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
# state. Emit that DOCTYPE token.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, '>');
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
$this->state = self::DATA_STATE;
return $token;
}
@ -2931,7 +2934,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2986,7 +2989,7 @@ class Tokenizer {
elseif ($char === '') {
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF);
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$token->forceQuirks = true;
$this->data->unconsume();
@ -2996,7 +2999,7 @@ class Tokenizer {
else {
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
# bogus DOCTYPE state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char);
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
}

20
tests/cases/TestTokenizer.php

@ -5,6 +5,7 @@ namespace dW\HTML5\TestCase;
use dW\HTML5\Data;
use dW\HTML5\EOFToken;
use dW\HTML5\OpenElementsStack;
use dW\HTML5\ParseError;
use dW\HTML5\Tokenizer;
class TestTokenizer extends \dW\HTML5\Test\StandardTest {
@ -18,18 +19,25 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest {
}
/** @dataProvider provideStandardTokenizerTests */
public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $errors) {
$data = new Data($input);
$stack = new OpenElementsStack();
$errorHandler = new ParseError;
if ($open) {
$stack[] = (new \DOMDocument)->createElement($open);
}
$tokenizer = new Tokenizer($data, $stack);
$errorHandler = new ParseError;
$errorHandler->setHandler();
$data = new Data($input, "STDIN", $errorHandler);
$tokenizer = new Tokenizer($data, $stack, $errorHandler);
$tokenizer->state = $state;
$actual = [];
do {
$t = $tokenizer->createToken();
$actual[] = $t;
} while (!($t instanceof EOFToken));
try {
do {
$t = $tokenizer->createToken();
$actual[] = $t;
} while (!($t instanceof EOFToken));
} finally {
$errorHandler->clearHandler();
}
array_pop($actual);
$actual = $this->combineCharacterTokens($actual);
$this->assertEquals($expected, $actual);

Loading…
Cancel
Save