Prep for character references
- Add missing state constants - Break all existing deviations for character refs - Add assertions before use of $attribute - Also fix DOCTYPE state
This commit is contained in:
parent
d4a7280405
commit
67c7f382e2
2 changed files with 59 additions and 34 deletions
|
@ -85,6 +85,15 @@ class Tokenizer {
|
|||
const CDATA_SECTION_STATE = 69;
|
||||
const CDATA_SECTION_BRACKET_STATE = 70;
|
||||
const CDATA_SECTION_END_STATE = 71;
|
||||
const CHARACTER_REFERENCE_STATE = 72;
|
||||
const NAMED_CHARACTER_REFERENCE_STATE = 73;
|
||||
const AMBIGUOUS_AMPERSAND_STATE = 74;
|
||||
const NUMERIC_CHARACTER_REFERENCE_STATE = 75;
|
||||
const HEXADECIMAL_CHARACTER_REFERENCE_START_STATE = 76;
|
||||
const DECIMAL_CHARACTER_REFERENCE_START_STATE = 77;
|
||||
const HEXADECIMAL_CHARACTER_REFERENCE_STATE = 78;
|
||||
const DECIMAL_CHARACTER_REFERENCE_STATE = 79;
|
||||
const NUMERIC_CHARACTER_REFERENCE_END_STATE = 80;
|
||||
|
||||
const STATE_NAMES = [
|
||||
self::DATA_STATE => "Data",
|
||||
|
@ -156,6 +165,15 @@ class Tokenizer {
|
|||
self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "After DOCTYPE system identifier",
|
||||
self::BOGUS_DOCTYPE_STATE => "Bogus comment",
|
||||
self::CDATA_SECTION_STATE => "CDATA section",
|
||||
self::CHARACTER_REFERENCE_STATE => "Character reference",
|
||||
self::NAMED_CHARACTER_REFERENCE_STATE => "Named character reference",
|
||||
self::AMBIGUOUS_AMPERSAND_STATE => "Ambiguous ampersand",
|
||||
self::NUMERIC_CHARACTER_REFERENCE_STATE => "Numeric character reference",
|
||||
self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE => "Hexadecimal character reference start",
|
||||
self::DECIMAL_CHARACTER_REFERENCE_START_STATE => "Decimal character reference start",
|
||||
self::HEXADECIMAL_CHARACTER_REFERENCE_STATE => "Hexadecimal character reference",
|
||||
self::DECIMAL_CHARACTER_REFERENCE_STATE => "Decimal character reference",
|
||||
self::NUMERIC_CHARACTER_REFERENCE_END_STATE => "Numeric character reference",
|
||||
];
|
||||
|
||||
// Ctype constants
|
||||
|
@ -233,11 +251,8 @@ class Tokenizer {
|
|||
if ($char === '&') {
|
||||
# Set the return state to the data state.
|
||||
# Switch to the character reference state.
|
||||
|
||||
// DEVIATION:
|
||||
// This implementation does the character reference consuming in a
|
||||
// function for which it is more suited for.
|
||||
return new CharacterToken($this->data->consumeCharacterReference());
|
||||
$returnState = self::DATA_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# U+003C LESS-THAN SIGN (<)
|
||||
elseif ($char === '<') {
|
||||
|
@ -277,11 +292,8 @@ class Tokenizer {
|
|||
if ($char === '&') {
|
||||
# Set the return state to the RCDATA state.
|
||||
# Switch to the character reference state.
|
||||
|
||||
// DEVIATION:
|
||||
// This implementation does the character reference consuming in a
|
||||
// function for which it is more suited for.
|
||||
return new CharacterToken($this->data->consumeCharacterReference());
|
||||
$returnState = self::RCDATA_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# U+003C LESS-THAN SIGN (<)
|
||||
elseif ($char === '<') {
|
||||
|
@ -1596,6 +1608,7 @@ class Tokenizer {
|
|||
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') {
|
||||
# Reconsume in the after attribute name state.
|
||||
assert(isset($token) && $token instanceof Token);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$this->keepOrDiscardAttribute($token, $attribute);
|
||||
$this->data->unconsume();
|
||||
$this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
|
||||
|
@ -1604,6 +1617,7 @@ class Tokenizer {
|
|||
elseif ($char === '=') {
|
||||
# Switch to the before attribute value state.
|
||||
assert(isset($token) && $token instanceof Token);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$this->keepOrDiscardAttribute($token, $attribute);
|
||||
$this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
|
||||
}
|
||||
|
@ -1616,6 +1630,7 @@ class Tokenizer {
|
|||
// OPTIMIZATION:
|
||||
// Consume all characters that are uppercase ASCII letters to prevent
|
||||
// having to loop back through here every single time.
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
|
||||
}
|
||||
# U+0000 NULL
|
||||
|
@ -1623,6 +1638,7 @@ class Tokenizer {
|
|||
# This is an unexpected-null-character parse error.
|
||||
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's name.
|
||||
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->name .= "\u{FFFD}";
|
||||
}
|
||||
# U+0022 QUOTATION MARK (")
|
||||
|
@ -1638,6 +1654,7 @@ class Tokenizer {
|
|||
else {
|
||||
attribute_name_state_anything_else:
|
||||
# Append the current input character to the current attribute's name.
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\0\"'<".self::CTYPE_UPPER);
|
||||
}
|
||||
}
|
||||
|
@ -1746,17 +1763,15 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (double-quoted) state.
|
||||
# Switch to the character reference state.
|
||||
|
||||
// DEVIATION:
|
||||
// This implementation does the character reference consuming in a
|
||||
// function for which it is more suited for.
|
||||
$attribute->value .= $this->data->consumeCharacterReference('"', true);
|
||||
$returnState = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# U+0000 NULL
|
||||
elseif ($char === "\0") {
|
||||
# This is an unexpected-null-character parse error.
|
||||
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
|
||||
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= "\u{FFFD}";
|
||||
}
|
||||
# EOF
|
||||
|
@ -1773,6 +1788,7 @@ class Tokenizer {
|
|||
// OPTIMIZATION:
|
||||
// Consume all characters that aren't listed above to prevent having
|
||||
// to loop back through here every single time.
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $char.$this->data->consumeUntil("\"&\0");
|
||||
}
|
||||
}
|
||||
|
@ -1791,17 +1807,15 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (single-quoted) state.
|
||||
# Switch to the character reference state.
|
||||
|
||||
// DEVIATION:
|
||||
// This implementation does the character reference consuming in a
|
||||
// function for which it is more suited for.
|
||||
$attribute->value .= $this->data->consumeCharacterReference("'", true);
|
||||
$returnState = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# U+0000 NULL
|
||||
elseif ($char === "\0") {
|
||||
# This is an unexpected-null-character parse error.
|
||||
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
|
||||
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= "\u{FFFD}";
|
||||
}
|
||||
# EOF
|
||||
|
@ -1818,6 +1832,7 @@ class Tokenizer {
|
|||
// OPTIMIZATION:
|
||||
// Consume all characters that aren't listed above to prevent having
|
||||
// to loop back through here every single time.
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $char.$this->data->consumeUntil("'&\0");
|
||||
}
|
||||
}
|
||||
|
@ -1840,11 +1855,8 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (unquoted) state.
|
||||
# Switch to the character reference state.
|
||||
|
||||
// DEVIATION:
|
||||
// This implementation does the character reference consuming in a
|
||||
// function for which it is more suited for.
|
||||
$attribute->value .= $this->data->consumeCharacterReference('>', true);
|
||||
$returnState = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# ">" (U+003E)
|
||||
elseif ($char === '>') {
|
||||
|
@ -1859,6 +1871,7 @@ class Tokenizer {
|
|||
# This is an unexpected-null-character parse error.
|
||||
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
|
||||
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= "\u{FFFD}";
|
||||
}
|
||||
# U+0022 QUOTATION MARK (")
|
||||
|
@ -1886,6 +1899,7 @@ class Tokenizer {
|
|||
|
||||
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
||||
// to loop back through here every single time.
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\0\"'<=`");
|
||||
}
|
||||
}
|
||||
|
@ -2426,7 +2440,7 @@ class Tokenizer {
|
|||
# U+0020 SPACE
|
||||
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
||||
# Switch to the before DOCTYPE name state.
|
||||
$this->state = self::DOCTYPE_NAME_STATE;
|
||||
$this->state = self::BEFORE_DOCTYPE_NAME_STATE;
|
||||
}
|
||||
# EOF
|
||||
elseif ($char === '') {
|
||||
|
@ -3470,9 +3484,16 @@ class Tokenizer {
|
|||
}
|
||||
}
|
||||
|
||||
#12.2.5.72 Character reference state
|
||||
elseif ($this->state === self::CHARACTER_REFERENCE_STATE) {
|
||||
// Not implemented
|
||||
$this->state = $returnState;
|
||||
return new CharacterToken('&');
|
||||
}
|
||||
|
||||
# Not a valid state
|
||||
else {
|
||||
throw new \Exception("Tokenizer state: ".$this->state);
|
||||
throw new \Exception("Unimplemented state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,13 +33,17 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest {
|
|||
$tokenizer->state = $state;
|
||||
// perform the test
|
||||
$actual = [];
|
||||
do {
|
||||
$t = $tokenizer->createToken();
|
||||
$actual[] = $t;
|
||||
} while (!($t instanceof EOFToken));
|
||||
array_pop($actual);
|
||||
$actual = $this->combineCharacterTokens($actual);
|
||||
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
|
||||
try {
|
||||
do {
|
||||
$t = $tokenizer->createToken();
|
||||
if (!($t instanceof EOFToken)) {
|
||||
$actual[] = $t;
|
||||
}
|
||||
} while (!($t instanceof EOFToken));
|
||||
} finally {
|
||||
$actual = $this->combineCharacterTokens($actual);
|
||||
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
|
||||
}
|
||||
}
|
||||
|
||||
public function provideStandardTokenizerTests() {
|
||||
|
|
Loading…
Reference in a new issue