J. King
6b42f08fbc
This has only been done in some parts of the code that are internal to the parser at large.
3073 lines
146 KiB
PHP
3073 lines
146 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
namespace dW\HTML5;
|
|
|
|
class Tokenizer {
|
|
use ParseErrorEmitter;
|
|
|
|
/**
 * When true, createToken() echoes the human-readable name of the current
 * state (looked up in STATE_NAMES) on every pass through its loop.
 *
 * @var bool
 */
public static $debug = false;

/**
 * The tokenizer's current state; compared against the *_STATE constants.
 *
 * @var int
 */
public $state;

/**
 * Input source; createToken() calls consume(), unconsume(), consumeUntil(),
 * consumeWhile() and consumeCharacterReference() on it.
 *
 * @var Data
 */
protected $data;

/**
 * Stack of open elements; its currentNodeName is read to decide whether an
 * end tag token is the "appropriate" one.
 *
 * @var OpenElementsStack
 */
protected $stack;
|
|
|
|
// Tokenizer states. The integer values double as keys of STATE_NAMES.

// Text content states
const DATA_STATE        = 0;
const RCDATA_STATE      = 1;
const RAWTEXT_STATE     = 2;
const SCRIPT_DATA_STATE = 3;
const PLAINTEXT_STATE   = 4;

// Tag states
const TAG_OPEN_STATE     = 5;
const END_TAG_OPEN_STATE = 6;
const TAG_NAME_STATE     = 7;

// RCDATA end tag detection
const RCDATA_LESS_THAN_SIGN_STATE = 8;
const RCDATA_END_TAG_OPEN_STATE   = 9;
const RCDATA_END_TAG_NAME_STATE   = 10;

// RAWTEXT end tag detection
const RAWTEXT_LESS_THAN_SIGN_STATE = 11;
const RAWTEXT_END_TAG_OPEN_STATE   = 12;
const RAWTEXT_END_TAG_NAME_STATE   = 13;

// Script data end tag detection
const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 14;
const SCRIPT_DATA_END_TAG_OPEN_STATE   = 15;
const SCRIPT_DATA_END_TAG_NAME_STATE   = 16;

// Script data escaping
const SCRIPT_DATA_ESCAPE_START_STATE           = 17;
const SCRIPT_DATA_ESCAPE_START_DASH_STATE      = 18;
const SCRIPT_DATA_ESCAPED_STATE                = 19;
const SCRIPT_DATA_ESCAPED_DASH_STATE           = 20;
const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE      = 21;
const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 22;
const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE   = 23;
const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE   = 24;

// Script data double escaping
const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE           = 25;
const SCRIPT_DATA_DOUBLE_ESCAPED_STATE                = 26;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE           = 27;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE      = 28;
const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 29;
const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE             = 30;

// Attribute states
const BEFORE_ATTRIBUTE_NAME_STATE         = 31;
const ATTRIBUTE_NAME_STATE                = 32;
const AFTER_ATTRIBUTE_NAME_STATE          = 33;
const BEFORE_ATTRIBUTE_VALUE_STATE        = 34;
const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 35;
const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 36;
const ATTRIBUTE_VALUE_UNQUOTED_STATE      = 37;
const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE  = 38;
const SELF_CLOSING_START_TAG_STATE        = 39;

// Comment states
const BOGUS_COMMENT_STATE           = 40;
const MARKUP_DECLARATION_OPEN_STATE = 41;
const COMMENT_START_STATE           = 42;
const COMMENT_START_DASH_STATE      = 43;
const COMMENT_STATE                 = 44;
const COMMENT_END_DASH_STATE        = 45;
const COMMENT_END_STATE             = 46;
const COMMENT_END_BANG_STATE        = 47;

// DOCTYPE states
const DOCTYPE_STATE                                       = 48;
const BEFORE_DOCTYPE_NAME_STATE                           = 49;
const DOCTYPE_NAME_STATE                                  = 50;
const AFTER_DOCTYPE_NAME_STATE                            = 51;
const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE                  = 52;
const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE              = 53;
const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE       = 54;
const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE       = 55;
const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE               = 56;
const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 57;
const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE                  = 58;
const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE              = 59;
const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE       = 60;
const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE       = 61;
const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE               = 62;
const BOGUS_DOCTYPE_STATE                                 = 63;

// CDATA
const CDATA_SECTION_STATE = 64;
|
|
|
|
/**
 * Human-readable label for every tokenizer state, keyed by the *_STATE
 * constant. createToken() echoes these when self::$debug is enabled, so each
 * label must identify exactly one state.
 */
const STATE_NAMES = [
    self::DATA_STATE                                          => "Data",
    self::RCDATA_STATE                                        => "RCDATA",
    self::RAWTEXT_STATE                                       => "RAWTEXT",
    self::SCRIPT_DATA_STATE                                   => "Script data",
    self::PLAINTEXT_STATE                                     => "PLAINTEXT",
    self::TAG_OPEN_STATE                                      => "Tag open",
    self::END_TAG_OPEN_STATE                                  => "End tag open",
    self::TAG_NAME_STATE                                      => "Tag name",
    self::RCDATA_LESS_THAN_SIGN_STATE                         => "RCDATA less-than sign",
    self::RCDATA_END_TAG_OPEN_STATE                           => "RCDATA end tag open",
    self::RCDATA_END_TAG_NAME_STATE                           => "RCDATA end tag name",
    self::RAWTEXT_LESS_THAN_SIGN_STATE                        => "RAWTEXT less than sign",
    self::RAWTEXT_END_TAG_OPEN_STATE                          => "RAWTEXT end tag open",
    self::RAWTEXT_END_TAG_NAME_STATE                          => "RAWTEXT end tag name",
    self::SCRIPT_DATA_LESS_THAN_SIGN_STATE                    => "Script data less-than sign",
    self::SCRIPT_DATA_END_TAG_OPEN_STATE                      => "Script data end tag open",
    self::SCRIPT_DATA_END_TAG_NAME_STATE                      => "Script data end tag name",
    self::SCRIPT_DATA_ESCAPE_START_STATE                      => "Script data escape start",
    self::SCRIPT_DATA_ESCAPE_START_DASH_STATE                 => "Script data escape start dash",
    self::SCRIPT_DATA_ESCAPED_STATE                           => "Script data escaped",
    self::SCRIPT_DATA_ESCAPED_DASH_STATE                      => "Script data escaped dash",
    self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE                 => "Script data escaped dash dash",
    self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE            => "Script data escaped less-than sign",
    self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE              => "Script data escaped end tag open",
    self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE              => "Script data escaped end tag name",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE               => "Script data double escape start",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE                    => "Script data double escaped",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE               => "Script data double escaped dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE          => "Script data double escaped dash dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE     => "Script data double escaped less-than sign",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE                 => "Script data double escape end",
    // Was "Before attribute": the truncated label dropped the word "name".
    self::BEFORE_ATTRIBUTE_NAME_STATE                         => "Before attribute name",
    self::ATTRIBUTE_NAME_STATE                                => "Attribute name",
    self::AFTER_ATTRIBUTE_NAME_STATE                          => "After attribute name",
    self::BEFORE_ATTRIBUTE_VALUE_STATE                        => "Before attribute value",
    self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE                 => "Attribute value (double quoted)",
    self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE                 => "Attribute value (single quoted)",
    self::ATTRIBUTE_VALUE_UNQUOTED_STATE                      => "Attribute value (unquoted)",
    self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE                  => "After attribute value (quoted)",
    self::SELF_CLOSING_START_TAG_STATE                        => "Self-closing start tag",
    self::BOGUS_COMMENT_STATE                                 => "Bogus comment",
    self::MARKUP_DECLARATION_OPEN_STATE                       => "Markup declaration open",
    self::COMMENT_START_STATE                                 => "Comment start",
    self::COMMENT_START_DASH_STATE                            => "Comment start dash",
    self::COMMENT_STATE                                       => "Comment",
    self::COMMENT_END_DASH_STATE                              => "Comment end dash",
    self::COMMENT_END_STATE                                   => "Comment end",
    self::COMMENT_END_BANG_STATE                              => "Comment end bang",
    self::DOCTYPE_STATE                                       => "DOCTYPE",
    self::BEFORE_DOCTYPE_NAME_STATE                           => "Before DOCTYPE name",
    self::DOCTYPE_NAME_STATE                                  => "DOCTYPE name",
    self::AFTER_DOCTYPE_NAME_STATE                            => "After DOCTYPE name",
    self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE                  => "After DOCTYPE public keyword",
    self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE              => "Before DOCTYPE public identifier",
    self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE       => "DOCTYPE public identifier (double quoted)",
    self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE       => "DOCTYPE public identifier (single quoted)",
    self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE               => "After DOCTYPE public identifier",
    self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE => "Between DOCTYPE public and system identifiers",
    self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE                  => "After DOCTYPE system keyword",
    self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE              => "Before DOCTYPE system identifier",
    self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE       => "DOCTYPE system identifier (double-quoted)",
    self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE       => "DOCTYPE system identifier (single-quoted)",
    self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE               => "After DOCTYPE system identifier",
    // Was "Bogus comment", which made this state indistinguishable from
    // BOGUS_COMMENT_STATE in debug output.
    self::BOGUS_DOCTYPE_STATE                                 => "Bogus DOCTYPE",
    self::CDATA_SECTION_STATE                                 => "CDATA section",
];
|
|
|
|
// Character-class strings handed to Data::consumeWhile()/consumeUntil().

// The uppercase ASCII letters.
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

// All ASCII letters: the uppercase set followed by the lowercase set.
const CTYPE_ALPHA = self::CTYPE_UPPER.'abcdefghijklmnopqrstuvwxyz';
|
|
|
|
/**
 * Wires the tokenizer to its collaborators and starts it in the data state.
 *
 * @param Data              $data         Input that createToken() consumes characters from.
 * @param OpenElementsStack $stack        Open elements stack; its currentNodeName is
 *                                        compared against end tag names.
 * @param ParseError        $errorHandler Stored on $this->errorHandler. The property is not
 *                                        declared in this class; presumably it comes from the
 *                                        ParseErrorEmitter trait (TODO confirm).
 */
public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
    // Collaborators supplied by the caller.
    $this->data         = $data;
    $this->stack        = $stack;
    $this->errorHandler = $errorHandler;

    // Tokenization always begins in the data state.
    $this->state = self::DATA_STATE;
}
|
|
|
|
public function createToken(): Token {
|
|
while (true) {
|
|
if (self::$debug) {
|
|
$state = self::STATE_NAMES[$this->state] ?? "";
|
|
assert($state, new Exception(Exception::UNKNOWN_ERROR));
|
|
echo "State: $state\n";
|
|
unset($state);
|
|
}
|
|
|
|
# 12.2.4.1 Data state
|
|
if ($this->state === self::DATA_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+0026 AMPERSAND (&)
|
|
if ($char === '&') {
|
|
# Switch to the character reference in data state.
|
|
|
|
# 8.2.4.2 Character reference in data state:
|
|
# Switch to the data state.
|
|
# Attempt to consume a character reference, with no additional allowed character.
|
|
# If nothing is returned, emit a U+0026 AMPERSAND character (&) token.
|
|
# Otherwise, emit the character tokens that were returned.
|
|
|
|
// DEVIATION: This implementation does the character reference consuming in a
|
|
// function which is better suited to it.
|
|
return new CharacterToken($this->data->consumeCharacterReference());
|
|
}
|
|
# U+003C LESS-THAN SIGN (<)
|
|
elseif ($char === '<') {
|
|
# Switch to the tag open state.
|
|
$this->state = self::TAG_OPEN_STATE;
|
|
continue;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Emit an end-of-file token.
|
|
return new EOFToken;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Emit the current input character as a character token.
|
|
// OPTIMIZATION: Consume all characters that don't match what is above and emit
|
|
// that as a character token instead to prevent having to loop back through here
|
|
// every single time.
|
|
|
|
return new CharacterToken($char.$this->data->consumeUntil('&<'));
|
|
}
|
|
}
|
|
|
|
# 12.2.4.2 Character reference in data state
|
|
// OPTIMIZATION: This is instead done in the block above.
|
|
|
|
# 12.2.4.3 RCDATA state
|
|
elseif ($this->state === self::RCDATA_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+0026 AMPERSAND (&)
|
|
if ($char === '&') {
|
|
# Switch to the character reference in RCDATA state.
|
|
|
|
# 8.2.4.4 Character reference in RCDATA state:
|
|
# Switch to the RCDATA state.
|
|
# Attempt to consume a character reference, with no additional allowed character.
|
|
# If nothing is returned, emit a U+0026 AMPERSAND character (&) token.
|
|
# Otherwise, emit the character tokens that were returned.
|
|
|
|
// DEVIATION: This implementation does the character reference consuming in a
|
|
// function which is better suited to it.
|
|
return new CharacterToken($this->data->consumeCharacterReference());
|
|
}
|
|
# U+003C LESS-THAN SIGN (<)
|
|
elseif ($char === '<') {
|
|
# Switch to the RCDATA less-than sign state.
|
|
$this->state = self::RCDATA_LESS_THAN_SIGN_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Emit an end-of-file token.
|
|
return new EOFToken;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Emit the current input character as a character token.
|
|
// OPTIMIZATION: Consume all characters that don't match what is above and emit
|
|
// that as a character token instead to prevent having to loop back through here
|
|
// every single time.
|
|
return new CharacterToken($char.$this->data->consumeUntil('&<'));
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 12.2.4.4 Character reference in RCDATA state
|
|
// OPTIMIZATION: This is instead done in the block above.
|
|
|
|
# 12.2.4.5 RAWTEXT state
|
|
elseif ($this->state === self::RAWTEXT_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+003C LESS-THAN SIGN (<)
|
|
if ($char === '<') {
|
|
# Switch to the RAWTEXT less-than sign state.
|
|
$this->state = self::RAWTEXT_LESS_THAN_SIGN_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Emit an end-of-file token.
|
|
return new EOFToken;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Emit the current input character as a character token.
|
|
// OPTIMIZATION: Consume all characters that don't match what is above and emit
|
|
// that as a character token instead to prevent having to loop back through here
|
|
// every single time.
|
|
return new CharacterToken($char.$this->data->consumeUntil('<'));
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 12.2.4.6 Script data state
|
|
elseif ($this->state === self::SCRIPT_DATA_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+003C LESS-THAN SIGN (<)
|
|
if ($char === '<') {
|
|
# Switch to the script data less-than sign state.
|
|
$this->state = self::SCRIPT_DATA_LESS_THAN_SIGN_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Emit an end-of-file token.
|
|
return new EOFToken;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Emit the current input character as a character token.
|
|
// OPTIMIZATION: Consume all characters that don't match what is above and emit
|
|
// that as a character token instead to prevent having to loop back through here
|
|
// every single time.
|
|
return new CharacterToken($char.$this->data->consumeUntil('<'));
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 12.2.4.7 PLAINTEXT state
|
|
elseif ($this->state === self::PLAINTEXT_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# EOF
|
|
if ($char === '') {
|
|
# Emit an end-of-file token.
|
|
return new EOFToken;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Emit the current input character as a character token.
|
|
// OPTIMIZATION: Consume all characters that don't match what is above and emit
|
|
// that as a character token instead to prevent having to loop back through here
|
|
// every single time.
|
|
return new CharacterToken($char.$this->data->consumeUntil(''));
|
|
}
|
|
}
|
|
|
|
# 12.2.4.8 Tag open state
|
|
elseif ($this->state === self::TAG_OPEN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+0021 EXCLAMATION MARK (!)
|
|
if ($char === '!') {
|
|
# Switch to the markup declaration open state.
|
|
$this->state = self::MARKUP_DECLARATION_OPEN_STATE;
|
|
}
|
|
# U+002F SOLIDUS (/)
|
|
elseif ($char === '/') {
|
|
# Switch to the end tag open state.
|
|
$this->state = self::END_TAG_OPEN_STATE;
|
|
}
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
elseif (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Create a new start tag token, set its tag name to the lowercase version of the
|
|
# current input character (add 0x0020 to the character's code point), then switch
|
|
# to the tag name state. (Don't emit the token yet; further details will be filled
|
|
# in before it is emitted.)
|
|
# Lowercase:
|
|
# Create a new start tag token, set its tag name to the current input character,
|
|
# then switch to the tag name state. (Don't emit the token yet; further details
|
|
# will be filled in before it is emitted.)
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token = new StartTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
|
|
$this->state = self::TAG_NAME_STATE;
|
|
}
|
|
# U+003F QUESTION MARK (?)
|
|
elseif ($char === '?') {
|
|
# Parse error. Switch to the bogus comment state.
|
|
|
|
// Making errors more expressive.
|
|
if ($char !== '') {
|
|
$this->error(ParseError::TAG_NAME_EXPECTED);
|
|
} else {
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
}
|
|
|
|
$this->state = self::BOGUS_COMMENT_STATE;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
|
|
# token. Reconsume the current input character.
|
|
|
|
// Making errors more expressive.
|
|
if ($char !== '') {
|
|
$this->error(ParseError::TAG_NAME_EXPECTED);
|
|
} else {
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
}
|
|
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.9 End tag open state
|
|
elseif ($this->state === self::END_TAG_OPEN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
if (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Create a new end tag token, set its tag name to the lowercase version of the
|
|
# current input character (add 0x0020 to the character's code point), then switch
|
|
# to the tag name state. (Don't emit the token yet; further details will be filled
|
|
# in before it is emitted.)
|
|
# Lowercase:
|
|
# Create a new end tag token, set its tag name to the current input character,
|
|
# then switch to the tag name state. (Don't emit the token yet; further details
|
|
# will be filled in before it is emitted.)
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token = new EndTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
|
|
$this->state = self::TAG_NAME_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Switch to the data state.
|
|
$this->error(ParseError::TAG_NAME_EXPECTED);
|
|
$this->state = self::DATA_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
|
|
# token and a U+002F SOLIDUS character token. Reconsume the EOF character.
|
|
// Making errors more expressive.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</');
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Switch to the bogus comment state.
|
|
$this->error(ParseError::TAG_NAME_EXPECTED);
|
|
$this->state = self::BOGUS_COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.10 Tag name state
|
|
elseif ($this->state === self::TAG_NAME_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# Switch to the before attribute name state.
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
}
|
|
# "/" (U+002F)
|
|
elseif ($char === '/') {
|
|
# Switch to the self-closing start tag state.
|
|
$this->state = self::SELF_CLOSING_START_TAG_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Switch to the data state. Emit the current tag token.
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# Uppercase ASCII letter
|
|
elseif (ctype_upper($char)) {
|
|
# Append the lowercase version of the current input character (add 0x0020 to the
|
|
# character's code point) to the current tag token's tag name.
|
|
|
|
// OPTIMIZATION: Consume all characters that are Uppercase ASCII characters to
|
|
// prevent having to loop back through here every single time.
|
|
$token->name = $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Reconsume the EOF character.
|
|
|
|
// Making errors more expressive.
|
|
if ($char !== '') {
|
|
$this->error(ParseError::TAG_NAME_EXPECTED, $char);
|
|
} else {
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
}
|
|
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the current tag token's tag name.
|
|
|
|
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
|
// to loop back through here every single time.
|
|
$token->name = $token->name.$char.$this->data->consumeUntil("\t\n\x0c />".self::CTYPE_UPPER);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.11 RCDATA less-than sign state
|
|
elseif ($this->state === self::RCDATA_LESS_THAN_SIGN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "/" (U+002F)
|
|
if ($char === '/') {
|
|
# Set the temporary buffer to the empty string. Switch to the RCDATA end tag open
|
|
# state.
|
|
$temporaryBuffer = '';
|
|
$this->state = self::RCDATA_END_TAG_OPEN_STATE;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token.
|
|
# Reconsume the current input character.
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('<');
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.12 RCDATA end tag open state
|
|
elseif ($this->state === self::RCDATA_END_TAG_OPEN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
if (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Create a new end tag token, and set its tag name to the lowercase version of the
|
|
# current input character (add 0x0020 to the character's code point). Append the
|
|
# current input character to the temporary buffer. Finally, switch to the RCDATA
|
|
# end tag name state. (Don't emit the token yet; further details will be filled in
|
|
# before it is emitted.)
|
|
# Lowercase:
|
|
# Create a new end tag token, and set its tag name to the current input character.
|
|
# Append the current input character to the temporary buffer. Finally, switch to
|
|
# the RCDATA end tag name state. (Don't emit the token yet; further details will
|
|
# be filled in before it is emitted.)
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token = new EndTagToken(strtolower($char));
|
|
$temporaryBuffer .= $char;
|
|
$this->state = self::RCDATA_END_TAG_NAME_STATE;
|
|
continue;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token and a
|
|
# U+002F SOLIDUS character token. Reconsume the current input character.
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</');
|
|
}
|
|
}
|
|
|
|
# 8.2.4.13 RCDATA end tag name state
|
|
elseif ($this->state === self::RCDATA_END_TAG_NAME_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# before attribute name state. Otherwise, treat it as per the "anything else"
|
|
# entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
} else {
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
}
|
|
# "/" (U+002F)
|
|
elseif ($char === '/') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# self-closing start tag state. Otherwise, treat it as per the "anything else"
|
|
# entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::SELF_CLOSING_START_TAG_STATE;
|
|
} else {
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# data state and emit the current tag token. Otherwise, treat it as per the
|
|
# "anything else" entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
} else {
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
}
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
elseif (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Append the lowercase version of the current input character (add 0x0020 to the
|
|
# character's code point) to the current tag token's tag name. Append the current
|
|
# input character to the temporary buffer.
|
|
# Lowercase:
|
|
# Append the current input character to the current tag token's tag name. Append
|
|
# the current input character to the temporary buffer.
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token->name .= $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
|
|
$temporaryBuffer .= $char;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token, a
|
|
# U+002F SOLIDUS character token, and a character token for each of the characters
|
|
# in the temporary buffer (in the order they were added to the buffer). Reconsume
|
|
# the current input character.
|
|
$this->state = self::RCDATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.14 RAWTEXT less-than sign state
|
|
elseif ($this->state === self::RAWTEXT_LESS_THAN_SIGN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "/" (U+002F)
|
|
if ($char === '/') {
|
|
# Set the temporary buffer to the empty string. Switch to the RAWTEXT end tag open
|
|
# state.
|
|
$temporaryBuffer = '';
|
|
$this->state = self::RAWTEXT_END_TAG_OPEN_STATE;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token.
|
|
# Reconsume the current input character.
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('<');
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.15 RAWTEXT end tag open state
|
|
elseif ($this->state === self::RAWTEXT_END_TAG_OPEN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
if (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Create a new end tag token, and set its tag name to the lowercase version of the
|
|
# current input character (add 0x0020 to the character's code point). Append the
|
|
# current input character to the temporary buffer. Finally, switch to the RAWTEXT
|
|
# end tag name state. (Don't emit the token yet; further details will be filled in
|
|
# before it is emitted.)
|
|
# Lowercase:
|
|
# Create a new end tag token, and set its tag name to the current input character.
|
|
# Append the current input character to the temporary buffer. Finally, switch to
|
|
# the RAWTEXT end tag name state. (Don't emit the token yet; further details will
|
|
# be filled in before it is emitted.)
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
$token = new EndTagToken(strtolower($char));
|
|
$temporaryBuffer .= $char;
|
|
$this->state = self::RAWTEXT_END_TAG_NAME_STATE;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token and a
|
|
# U+002F SOLIDUS character token. Reconsume the current input character.
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</');
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.16 RAWTEXT end tag name state
|
|
elseif ($this->state === self::RAWTEXT_END_TAG_NAME_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# before attribute name state. Otherwise, treat it as per the "anything else"
|
|
# entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
} else {
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
# "/" (U+002F)
|
|
elseif ($char === '/') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# self-closing start tag state. Otherwise, treat it as per the "anything else"
|
|
# entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::SELF_CLOSING_START_TAG_STATE;
|
|
} else {
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# If the current end tag token is an appropriate end tag token, then switch to the
|
|
# data state and emit the current tag token. Otherwise, treat it as per the
|
|
# "anything else" entry below.
|
|
if ($token->name === $this->stack->currentNodeName) {
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
} else {
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
# Uppercase ASCII letter
|
|
# Lowercase ASCII letter
|
|
elseif (ctype_alpha($char)) {
|
|
# Uppercase:
|
|
# Append the lowercase version of the current input character (add 0x0020 to the
|
|
# character's code point) to the current tag token's tag name. Append the current
|
|
# input character to the temporary buffer.
|
|
# Lowercase:
|
|
# Append the current input character to the current tag token's tag name. Append
|
|
# the current input character to the temporary buffer.
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token->name .= $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
|
|
$temporaryBuffer .= $char;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token, a
|
|
# U+002F SOLIDUS character token, and a character token for each of the characters
|
|
# in the temporary buffer (in the order they were added to the buffer). Reconsume
|
|
# the current input character.
|
|
$this->state = self::RAWTEXT_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('</'.$temporaryBuffer);
|
|
|
|
continue;
|
|
}
|
|
}
|
|
|
|
# 8.2.4.17 Script data less-than sign state
|
|
elseif ($this->state === self::SCRIPT_DATA_LESS_THAN_SIGN_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "/" (U+002F)
|
|
if ($char === '/') {
|
|
# Set the temporary buffer to the empty string. Switch to the script data end tag
|
|
# open state.
|
|
$temporaryBuffer = '';
|
|
$this->state = self::SCRIPT_DATA_END_TAG_OPEN_STATE;
|
|
|
|
continue;
|
|
}
|
|
# "!" (U+0021)
|
|
elseif ($char === '!') {
|
|
# Switch to the script data escape start state. Emit a U+003C LESS-THAN SIGN
|
|
# character token and a U+0021 EXCLAMATION MARK character token.
|
|
$this->state = self::SCRIPT_DATA_ESCAPE_START_STATE;
|
|
return new CharacterToken('<!');
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token.
|
|
# Reconsume the current input character.
|
|
$this->state = self::SCRIPT_DATA_STATE;
|
|
$this->data->unconsume();
|
|
return new CharacterToken('<');
|
|
|
|
continue;
|
|
}
|
|
}
|
|
|
|
# 8.2.4.18 Script data end tag open state
|
|
elseif ($this->state === self::SCRIPT_DATA_END_TAG_OPEN_STATE) {
    // Reached after "</" in script data. A letter starts a candidate end tag;
    // anything else means "</" was plain text.

    # Consume the next input character
    $char = $this->data->consume();

    # Anything else
    if (!ctype_alpha($char)) {
        # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token
        # and a U+002F SOLIDUS character token. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();

        return new CharacterToken('</');
    }

    # Uppercase ASCII letter
    # Lowercase ASCII letter
    #   Create a new end tag token, and set its tag name to the (lowercased) current
    #   input character. Append the current input character to the temporary buffer.
    #   Finally, switch to the script data end tag name state. (Don't emit the token
    #   yet; further details will be filled in before it is emitted.)

    // OPTIMIZATION: Upper- and lowercase letters share one path; strtolower()
    // is a no-op for letters that are already lowercase.
    $token = new EndTagToken(strtolower($char));
    // The buffer keeps the character exactly as it appeared in the input.
    $temporaryBuffer .= $char;
    $this->state = self::SCRIPT_DATA_END_TAG_NAME_STATE;

    continue;
}
|
|
|
|
# 8.2.4.19 Script data end tag name state
|
|
elseif ($this->state === self::SCRIPT_DATA_END_TAG_NAME_STATE) {
    // Accumulates the name of a candidate end tag inside script data. The tag
    // only takes effect when its name equals the stack's current node name
    // (this implementation's test for an "appropriate end tag token");
    // otherwise everything consumed so far is re-emitted as text.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
        continue;
    }
    # "/" (U+002F)
    elseif ($char === '/' && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
        continue;
    }
    # ">" (U+003E)
    elseif ($char === '>' && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $letters = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
        // Append once only; the previous code appended the existing name to
        // itself before adding the new letters.
        $token->name .= strtolower($letters);
        // Every consumed letter must reach the buffer (in its original case),
        // otherwise the text re-emitted by the fallback below is truncated.
        $temporaryBuffer .= $letters;
        continue;
    }

    # Anything else
    # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token, a
    # U+002F SOLIDUS character token, and a character token for each of the characters
    # in the temporary buffer (in the order they were added to the buffer). Reconsume
    # the current input character.

    // Also reached for whitespace, "/" and ">" when the end tag is not an
    // appropriate one.
    $this->state = self::SCRIPT_DATA_STATE;
    $this->data->unconsume();
    return new CharacterToken('</'.$temporaryBuffer);
}
|
|
|
|
# 8.2.4.20 Script data escape start state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_STATE) {
    // Reached after "<!" in script data; a "-" continues towards "<!--".

    # Consume the next input character
    $char = $this->data->consume();

    # Anything else
    if ($char !== '-') {
        # Switch to the script data state. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();

        continue;
    }

    # "-" (U+002D)
    # Switch to the script data escape start dash state. Emit a U+002D HYPHEN-MINUS
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPE_START_DASH_STATE;

    return new CharacterToken('-');
}
|
|
|
|
# 8.2.4.21 Script data escape start dash state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_DASH_STATE) {
    // Reached after "<!-" in script data; a second "-" completes "<!--".

    # Consume the next input character
    $char = $this->data->consume();

    # Anything else
    if ($char !== '-') {
        # Switch to the script data state. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();

        continue;
    }

    # "-" (U+002D)
    # Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;

    return new CharacterToken('-');
}
|
|
|
|
# 8.2.4.22 Script data escaped state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_STATE) {
    // Ordinary text inside an escaped ("<!--") section of script data.

    # Consume the next input character
    $char = $this->data->consume();

    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

        continue;
    }

    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();

        continue;
    }

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->state = self::SCRIPT_DATA_ESCAPED_DASH_STATE;

        return new CharacterToken('-');
    }

    # Anything else
    # Emit the current input character as a character token.

    // OPTIMIZATION: Everything up to the next "-" or "<" is consumed in one go
    // and emitted as a single token instead of looping once per character.
    $text = $char.$this->data->consumeUntil('-<');

    return new CharacterToken($text);
}
|
|
|
|
# 8.2.4.23 Script data escaped dash state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_STATE) {
    // One "-" has been seen inside escaped script data.

    # Consume the next input character
    $char = $this->data->consume();

    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

        continue;
    }

    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();

        continue;
    }

    # "-" (U+002D)
    #   Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
    #   character token.
    # Anything else
    #   Switch to the script data escaped state. Emit the current input character as a
    #   character token.

    // Both remaining cases emit the consumed character; only the next state differs.
    $this->state = ($char === '-')
        ? self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE
        : self::SCRIPT_DATA_ESCAPED_STATE;

    return new CharacterToken($char);
}
|
|
|
|
# 8.2.4.24 Script data escaped dash dash state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE) {
    // Two or more "-" have been seen inside escaped script data; a ">" now
    // ends the escaped section.

    # Consume the next input character
    $char = $this->data->consume();

    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;

        continue;
    }

    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();

        continue;
    }

    # "-" (U+002D)
    if ($char === '-') {
        # Emit a U+002D HYPHEN-MINUS character token.

        // The state is deliberately left unchanged.
        return new CharacterToken('-');
    }

    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the script data state. Emit a U+003E GREATER-THAN SIGN character
        # token.
        $this->state = self::SCRIPT_DATA_STATE;

        return new CharacterToken('>');
    }

    # Anything else
    # Switch to the script data escaped state. Emit the current input character as a
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPED_STATE;

    return new CharacterToken($char);
}
|
|
|
|
# 8.2.4.25 Script data escaped less-than sign state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE) {
    // Handles the character that follows a "<" inside escaped script data:
    // "/" may start an end tag, a letter may start a nested "<script", and
    // anything else means the "<" was literal text.

    # Consume the next input character
    $char = $this->data->consume();

    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data escaped
        # end tag open state.

        // Must be an assignment: appending '' would keep whatever an earlier
        // state left in the buffer.
        $temporaryBuffer = '';
        $this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Set the temporary buffer to the empty string. Append the lowercase version of
        # the current input character (add 0x0020 to the character's code point) to the
        # temporary buffer. Switch to the script data double escape start state. Emit a
        # U+003C LESS-THAN SIGN character token and the current input character as a
        # character token.
        # Lowercase:
        # Set the temporary buffer to the empty string. Append the current input character
        # to the temporary buffer. Switch to the script data double escape start state.
        # Emit a U+003C LESS-THAN SIGN character token and the current input character as
        # a character token.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        $temporaryBuffer = strtolower($char);
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
        return new CharacterToken('<'.$char);
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        $this->data->unconsume();

        // Emit the pending "<", not the character just read: that character
        // has been pushed back and will be emitted by the escaped state.
        return new CharacterToken('<');
    }

    continue;
}
|
|
|
|
# 8.2.4.26 Script data escaped end tag open state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) {
    // Reached after "</" in escaped script data. A letter starts a candidate
    // end tag; anything else means "</" was plain text.

    # Consume the next input character
    $char = $this->data->consume();

    # Anything else
    if (!ctype_alpha($char)) {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token and a U+002F SOLIDUS character token. Reconsume the current input
        # character.
        $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        $this->data->unconsume();

        return new CharacterToken('</');
    }

    # Uppercase ASCII letter
    # Lowercase ASCII letter
    #   Create a new end tag token, and set its tag name to the (lowercased) current
    #   input character. Append the current input character to the temporary buffer.
    #   Finally, switch to the script data escaped end tag name state. (Don't emit the
    #   token yet; further details will be filled in before it is emitted.)

    // OPTIMIZATION: Upper- and lowercase letters share one path; strtolower()
    // is a no-op for letters that are already lowercase.
    $token = new EndTagToken(strtolower($char));
    // The buffer keeps the character exactly as it appeared in the input.
    $temporaryBuffer .= $char;
    $this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;

    continue;
}
|
|
|
|
# 8.2.4.27 Script data escaped end tag name state
|
|
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE) {
    // Accumulates the name of a candidate end tag inside escaped script data.
    // The tag only takes effect when its name equals the stack's current node
    // name (this implementation's test for an "appropriate end tag token");
    // otherwise everything consumed so far is re-emitted as text.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
        continue;
    }
    # "/" (U+002F)
    elseif ($char === '/' && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
        continue;
    }
    # ">" (U+003E)
    elseif ($char === '>' && $token->name === $this->stack->currentNodeName) {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $letters = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
        // Append once only; the previous code appended the existing name to
        // itself before adding the new letters.
        $token->name .= strtolower($letters);
        // Every consumed letter must reach the buffer (in its original case),
        // otherwise the text re-emitted by the fallback below is truncated.
        $temporaryBuffer .= $letters;
        continue;
    }

    # Anything else
    # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
    # token, a U+002F SOLIDUS character token, and a character token for each of the
    # characters in the temporary buffer (in the order they were added to the buffer).
    # Reconsume the current input character.

    // Also reached for whitespace, "/" and ">" when the end tag is not an
    // appropriate one.
    $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
    $this->data->unconsume();
    return new CharacterToken('</'.$temporaryBuffer);
}
|
|
|
|
# 8.2.4.29 Script data double escaped state
|
|
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE) {
    // Ordinary text inside a double escaped section of script data (a nested
    // "<script" inside "<!--").

    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data double escaped dash state. Emit a U+002D
        # HYPHEN-MINUS character token.

        // A single "-" leads to the dash state; the previous code jumped
        // straight to the dash dash state.
        // NOTE(review): no handler for the double escaped dash / dash dash
        // states is visible in this part of the file - confirm they exist.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
        return new CharacterToken('-');
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data double escaped less-than sign state. Emit a U+003C
        # LESS-THAN SIGN character token.

        // The constant needs its SCRIPT_DATA_ prefix; the unprefixed name used
        // previously is not declared on this class.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
        return new CharacterToken('<');
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Emit the current input character as a character token.

        // ">" is ordinary text in this state; only the dash dash state may
        // leave double escaping on ">".
        return new CharacterToken($char);
    }

    continue;
}
|
|
|
|
# 8.2.4.32 Script data double escaped less-than sign state
|
|
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE) {
    // Handles the character that follows a "<" inside double escaped script
    // data; only "/" is significant.

    # Consume the next input character
    $char = $this->data->consume();

    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data double
        # escape end state. Emit a U+002F SOLIDUS character token.
        $temporaryBuffer = '';
        // Assignment, not comparison: "===" here left the state unchanged.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
        return new CharacterToken('/');
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.

        // Assignment, not comparison: with "===" the state never changed, so
        // the pushed-back character was re-read by this branch forever.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        $this->data->unconsume();
    }

    continue;
}
|
|
|
|
# 8.2.4.33 Script data double escape end state
|
|
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
    // Reached after "</" inside double escaped script data. Letters are
    // collected (lowercased) in the temporary buffer; when a delimiter
    // arrives, the buffer decides whether double escaping ends.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    # "/" (U+002F)
    # ">" (U+003E)
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>') {
        # If the temporary buffer is the string "script", then switch to the script data
        # escaped state. Otherwise, switch to the script data double escaped state. Emit
        # the current input character as a character token.
        if ($temporaryBuffer === 'script') {
            $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        }

        // The character is emitted in both cases; previously it was dropped
        // whenever the buffer matched "script".
        return new CharacterToken($char);
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the temporary buffer. Emit the current input
        # character as a character token.
        # Lowercase:
        # Append the current input character to the temporary buffer. Emit the current
        # input character as a character token.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $char = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
        // The buffer is lowercased for comparison; the emitted token keeps the
        // input's original case.
        $temporaryBuffer .= strtolower($char);
        return new CharacterToken($char);
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.

        // Must return to the *double* escaped state; the previous code dropped
        // to the escaped state here.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        $this->data->unconsume();
    }

    continue;
}
|
|
|
|
# 8.2.4.34 Before attribute name state
|
|
elseif ($this->state === self::BEFORE_ATTRIBUTE_NAME_STATE) {
    // Skips whitespace inside a tag and starts a new attribute on the first
    // name character. An attribute is built in $attribute and only appended
    // to the token once the next attribute starts or the tag is emitted.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        // Need to add the current attribute to the token, if necessary. The
        // other branches of this state flush a pending attribute, so one can
        // still be pending here and would otherwise be lost on emit.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # lowercase version of the current input character (add 0x0020 to the character's
        # code point), and its value to the empty string. Switch to the attribute name
        # state.

        // Need to add the current attribute to the token, if necessary.
        // "?? null" because $attribute may not have been assigned yet.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        $attribute = new TokenAttr(strtolower($char), '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # Anything else
    else {
        # Quotes, less than sign, equals:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # current input character, and its value to the empty string. Switch to the
        # attribute name state.

        if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }

        // Need to add the current attribute to the token, if necessary.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        $attribute = new TokenAttr($char, '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.35 Attribute name state
|
|
elseif ($this->state === self::ATTRIBUTE_NAME_STATE) {
    // Accumulates the name of the attribute under construction and reports a
    // parse error when a start tag already carries an attribute of that name.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    # "/" (U+002F)
    # U+003E GREATER-THAN SIGN (>)
    # EOF
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') {
        // Same guard as the "=" branch below, so hasAttribute() is only ever
        // called on start tag tokens.
        if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) {
            $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
        }

        # Reconsume in the after attribute name state.
        $this->data->unconsume();
        $this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
    }
    # "=" (U+003D)
    elseif ($char === '=') {
        if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) {
            $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
        }

        # Switch to the before attribute value state.
        $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current attribute's name.

        // OPTIMIZATION: Consume all characters that are uppercase ASCII letters to prevent
        // having to loop back through here every single time.
        $attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # Anything else
    else {
        # Quotes, less than sign:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's name.

        // "=" cannot reach this branch (it is handled above), so only the
        // quotes and "<" need checking.
        if ($char === '"' || $char === "'" || $char === '<') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time. Quotes and "<" are in
        // the stop list so that each one comes back through this branch and is
        // reported individually.
        $attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER);
    }

    # When the user agent leaves the attribute name state (and before emitting the tag
    # token, if appropriate), the complete attribute's name must be compared to the
    # other attributes on the same token; if there is already an attribute on the
    # token with the exact same name, then this is a parse error and the new attribute
    # must be removed from the token.

    // DEVIATION: Because this implementation uses a buffer to hold the attribute name
    // it is only added if it is valid. The result is the same, though.
    // NOTE(review): this state only reports the duplicate; nothing visible
    // here discards the buffered attribute - confirm where that happens.

    continue;
}
|
|
|
|
# 8.2.4.36 After attribute name state
|
|
elseif ($this->state === self::AFTER_ATTRIBUTE_NAME_STATE) {
    // Decides what follows a completed attribute name: a value ("="), the end
    // of the tag, or the start of another attribute.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # U+002F SOLIDUS (/)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
    }
    # U+003D EQUALS SIGN (=)
    elseif ($char === '=') {
        # Switch to the before attribute value state.
        $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
    }
    # U+003E GREATER-THAN SIGN (>)
    elseif ($char === '>') {
        // Need to add the current attribute to the token, if necessary.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # lowercase version of the current input character (add 0x0020 to the character's
        # code point), and its value to the empty string. Switch to the attribute name
        # state.

        // Need to add the current attribute to the token, if necessary.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        $attribute = new TokenAttr(strtolower($char), '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # Anything else
    else {
        # Quotes, less than sign:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # current input character, and its value to the empty string. Switch to the
        # attribute name state.

        // "=" is handled by its own branch above and can never reach this
        // point, so it is not part of the parse-error check.
        if ($char === '"' || $char === "'" || $char === '<') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }

        // Need to add the current attribute to the token, if necessary.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        $attribute = new TokenAttr($char, '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.37 Before attribute value state
|
|
elseif ($this->state === self::BEFORE_ATTRIBUTE_VALUE_STATE) {
    // Reached after "=" in a tag: skips whitespace and picks the quoting style
    // of the attribute value.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Ignore the character.
        continue;
    }

    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the attribute value (double-quoted) state.
        $this->state = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;

        continue;
    }

    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the attribute value (single-quoted) state.
        $this->state = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;

        continue;
    }

    # U+0026 AMPERSAND (&)
    if ($char === '&') {
        # Switch to the attribute value (unquoted) state. Reconsume the current input
        # character.
        $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
        $this->data->unconsume();

        continue;
    }

    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();

        continue;
    }

    # ">" (U+003E)
    if ($char === '>') {
        # Parse error. Switch to the data state. Emit the current tag token.
        $this->error(ParseError::UNEXPECTED_END_OF_TAG);
        $this->state = self::DATA_STATE;

        // The attribute being built has to be attached before the tag goes out.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        return $token;
    }

    # "<" (U+003C)
    # "=" (U+003D)
    # "`" (U+0060)
    #   Parse error. Treat it as per the "anything else" entry below.
    # Anything else
    #   Append the current input character to the current attribute's value. Switch to
    #   the attribute value (unquoted) state.
    if (in_array($char, ['<', '=', '`'], true)) {
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    }

    $attribute->value .= $char;
    $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;

    continue;
}
|
|
|
|
# 8.2.4.38 Attribute value (double-quoted) state
|
|
elseif ($this->state === self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
    // Collects the text of a double-quoted attribute value up to the closing
    // quotation mark.

    # Consume the next input character
    $char = $this->data->consume();

    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the after attribute value (quoted) state.
        $this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;

        continue;
    }

    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();

        continue;
    }

    # U+0026 AMPERSAND (&)
    if ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being U+0022 QUOTATION MARK (").

        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.

        // DEVIATION: There is no separate tokenizer state for this; the data
        // stream's consumeCharacterReference() does the work and whatever it
        // returns is appended to the value.
        $attribute->value .= $this->data->consumeCharacterReference('"', true);

        continue;
    }

    # Anything else
    # Append the current input character to the current attribute's value.

    // OPTIMIZATION: Everything up to the next quotation mark or ampersand is
    // consumed in one go instead of looping once per character.
    $attribute->value .= $char.$this->data->consumeUntil('"&');

    continue;
}
|
|
|
|
# 8.2.4.39 Attribute value (single-quoted) state
|
|
elseif ($this->state === self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "'" (U+0027)
|
|
if ($char === "'") {
|
|
# Switch to the after attribute value (quoted) state.
|
|
$this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
|
|
}
|
|
# U+0026 AMPERSAND (&)
|
|
elseif ($char === '&') {
|
|
# Switch to the character reference in attribute value state, with the additional
|
|
# allowed character being "'" (U+0027).
|
|
|
|
# 8.2.4.41 Character reference in attribute value state:
|
|
# Attempt to consume a character reference.
|
|
# If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
|
|
# attribute's value.
|
|
# Otherwise, append the returned character tokens to the current attribute's
|
|
# value.
|
|
# Finally, switch back to the attribute value state that switched into this state.
|
|
|
|
// DEVIATION: This implementation does the character reference consuming in a
|
|
// function which is better suited for it.
|
|
$attribute->value .= $this->data->consumeCharacterReference("'", true);
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the current attribute's value.
|
|
|
|
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
|
// to loop back through here every single time.
|
|
$attribute->value .= $char.$this->data->consumeUntil("'&");
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
|
|
# 8.2.4.40 Attribute value (unquoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_UNQUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before attribute name state.

        // Whitespace ends an unquoted value, so whatever follows must be read as
        // a new attribute name rather than as more of this attribute's value.
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
    }
    # U+0026 AMPERSAND (&)
    elseif ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being ">" (U+003E).

        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.

        // DEVIATION: This implementation does the character reference consuming in a
        // function which is better suited for it.
        $attribute->value .= $this->data->consumeCharacterReference('>', true);
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;

        // Need to add the current attribute to the token, if necessary.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }

        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # "`" (U+0060)
    # Anything else
    else {
        # Quotes, less than sign, equals, tick:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's value.

        if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time. The error-raising characters are
        // part of the stop set so that each of them still passes through the check above.
        $attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`");
    }

    continue;
}
|
|
|
|
# 8.2.4.42 After attribute value (quoted) state
|
|
elseif ($this->state === self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# Switch to the before attribute name state.
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
}
|
|
# "/" (U+002F)
|
|
elseif ($char === '/') {
|
|
# Switch to the self-closing start tag state.
|
|
$this->state = self::SELF_CLOSING_START_TAG_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Switch to the data state. Emit the current tag token.
|
|
$this->state = self::DATA_STATE;
|
|
|
|
// Need to add the current attribute to the token, if necessary.
|
|
if ($attribute) {
|
|
$token->attributes[] = $attribute;
|
|
$attribute = null;
|
|
}
|
|
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Switch to the before attribute name state. Reconsume the character.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.43 Self-closing start tag state
|
|
elseif ($this->state === self::SELF_CLOSING_START_TAG_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# ">" (U+003E)
|
|
if ($char === '>') {
|
|
# Set the self-closing flag of the current tag token. Switch to the data state.
|
|
# Emit the current tag token.
|
|
$token->selfClosing = true;
|
|
$this->state = self::DATA_STATE;
|
|
|
|
// Need to add the current attribute to the token, if necessary.
|
|
if ($attribute ?? null) {
|
|
$token->attributes[] = $attribute;
|
|
$attribute = null;
|
|
}
|
|
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Switch to the before attribute name state. Reconsume the character.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
|
|
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
|
|
$this->data->unconsume();
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.44 Bogus comment state
|
|
elseif ($this->state === self::BOGUS_COMMENT_STATE) {
|
|
# Consume every character up to and including the first ">" (U+003E) character or
|
|
# the end of the file (EOF), whichever comes first. Emit a comment token whose
|
|
# data is the concatenation of all the characters starting from and including the
|
|
# character that caused the state machine to switch into the bogus comment state,
|
|
# up to and including the character immediately before the last consumed character
|
|
# (i.e. up to the character just before the U+003E or EOF character), but with any
|
|
# U+0000 NULL characters replaced by U+FFFD REPLACEMENT CHARACTER characters. (If
|
|
# the comment was started by the end of the file (EOF), the token is empty.
|
|
# Similarly, the token is empty if it was generated by the string "<!>".)
|
|
|
|
$char = $char.$this->data->consumeUntil('>');
|
|
$nextChar = $this->data->consume();
|
|
|
|
# Switch to the data state.
|
|
$this->state = self::DATA_STATE;
|
|
|
|
# If the end of the file was reached, reconsume the EOF character.
|
|
if ($nextChar === '') {
|
|
$this->data->unconsume();
|
|
}
|
|
|
|
return new CommentToken($char);
|
|
}
|
|
|
|
# 8.2.4.45 Markup declaration open state
elseif ($this->state === self::MARKUP_DECLARATION_OPEN_STATE) {
    # If the next two characters are both "-" (U+002D) characters, consume those two
    # characters, create a comment token whose data is the empty string, and switch to
    # the comment start state.
    if ($this->data->peek(2) === '--') {
        $this->data->consume(2);
        $token = new CommentToken();
        $this->state = self::COMMENT_START_STATE;
    }
    # Otherwise, if the next seven characters are an ASCII case-insensitive match for
    # the word "DOCTYPE", then consume those characters and switch to the DOCTYPE
    # state.
    elseif (strtolower($this->data->peek(7)) === 'doctype') {
        $this->data->consume(7);
        $this->state = self::DOCTYPE_STATE;
    }
    # Otherwise, if there is an adjusted current node and it is not an element in the
    # HTML namespace and the next seven characters are a case-sensitive match for the
    # string "[CDATA[" (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE
    # BRACKET character before and after), then consume those characters and switch to
    # the CDATA section state.
    else {
        $adjustedCurrentNode = $this->stack->adjustedCurrentNode;
        if ($adjustedCurrentNode && $adjustedCurrentNode->namespace !== Parser::HTML_NAMESPACE && $this->data->peek(7) === '[CDATA[') {
            $this->data->consume(7);
            $this->state = self::CDATA_SECTION_STATE;
        }
        # Otherwise, this is a parse error. Switch to the bogus comment state. The next
        # character that is consumed, if any, is the first character that will be in the
        # comment.
        else {
            // The first character of the bogus comment is consumed here so that the
            // parse error can name it; the bogus comment state then prepends $char
            // to whatever it consumes up to the next ">".
            $char = $this->data->consume();
            if ($char !== '') {
                $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
            } else {
                $this->error(ParseError::UNEXPECTED_EOF);
            }

            // For the input "<!>" the character just consumed is itself the ">" which
            // terminates the comment. Handing it to the bogus comment state would make
            // it part of the comment data and swallow everything up to the following
            // ">", so emit the empty comment the specification calls for right away.
            if ($char === '>') {
                $this->state = self::DATA_STATE;
                return new CommentToken();
            }

            $this->state = self::BOGUS_COMMENT_STATE;
        }
    }

    continue;
}
|
|
|
|
# 8.2.4.46 Comment start state
|
|
elseif ($this->state === self::COMMENT_START_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "-" (U+002D)
|
|
if ($char === '-') {
|
|
# Switch to the comment start dash state.
|
|
$this->state = self::COMMENT_START_DASH_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Switch to the data state. Emit the comment token.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the comment token's data. Switch to the
|
|
# comment state.
|
|
$token->data .= $char;
|
|
$this->state = self::COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.47 Comment start dash state
|
|
elseif ($this->state === self::COMMENT_START_DASH_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "-" (U+002D)
|
|
if ($char === '-') {
|
|
# Switch to the comment end state.
|
|
$this->state = self::COMMENT_END_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Switch to the data state. Emit the comment token.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append a "-" (U+002D) character and the current input character to the comment
|
|
# token's data. Switch to the comment state.
|
|
$token->data .= '-'.$char;
|
|
$this->state = self::COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.48 Comment state
|
|
elseif ($this->state === self::COMMENT_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "-" (U+002D)
|
|
if ($char === '-') {
|
|
# Switch to the comment end dash state
|
|
$this->state = self::COMMENT_END_DASH_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the comment token's data.
|
|
|
|
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
|
// to loop back through here every single time.
|
|
$token->data .= $char.$this->data->consumeUntil('-');
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.49 Comment end dash state
|
|
elseif ($this->state === self::COMMENT_END_DASH_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "-" (U+002D)
|
|
if ($char === '-') {
|
|
# Switch to the comment end state
|
|
$this->state = self::COMMENT_END_STATE;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append a "-" (U+002D) character and the current input character to the comment
|
|
# token's data. Switch to the comment state.
|
|
$token->data .= '-'.$char;
|
|
$this->state = self::COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.50 Comment end state
|
|
elseif ($this->state === self::COMMENT_END_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# ">" (U+003E)
|
|
if ($char === '>') {
|
|
# Switch to the data state. Emit the comment token.
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# "!" (U+0021)
|
|
elseif ($char === '!') {
|
|
# Parse error. Switch to the comment end bang state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '!');
|
|
$this->state = self::COMMENT_END_BANG_STATE;
|
|
}
|
|
# "-" (U+002D)
|
|
elseif ($char === '-') {
|
|
# Parse error. Append a "-" (U+002D) character to the comment token's data.
|
|
|
|
// OPTIMIZATION: Consume all '-' characters to prevent having to loop back through
|
|
// here every single time.
|
|
$char .= $this->data->consumeWhile('-');
|
|
for ($i = 0; $i < strlen($char); $i++) {
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '-');
|
|
}
|
|
|
|
$token->data .= $char;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Append two "-" (U+002D) characters and the current input character
|
|
# to the comment token's data. Switch to the comment state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
|
|
$token->data .= '--'.$char;
|
|
$this->state = self::COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.51 Comment end bang state
|
|
elseif ($this->state === self::COMMENT_END_BANG_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "-" (U+002D)
|
|
if ($char === '-') {
|
|
# Append two "-" (U+002D) characters and a "!" (U+0021) character to the comment
|
|
# token's data. Switch to the comment end dash state.
|
|
$token->data .= '--!';
|
|
$this->state = self::COMMENT_END_DASH_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Switch to the data state. Emit the comment token.
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
|
|
# character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append two "-" (U+002D) characters, a "!" (U+0021) character, and the current
|
|
# input character to the comment token's data. Switch to the comment state.
|
|
$token->data .= '--!'.$char;
|
|
$this->state = self::COMMENT_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.52 DOCTYPE state
elseif ($this->state === self::DOCTYPE_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE name state.

        // No token is created yet: the before DOCTYPE name state creates it once
        // the first character of the name (or ">" / EOF) has been seen.
        $this->state = self::BEFORE_DOCTYPE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Switch to the before DOCTYPE name state. Reconsume the character.

        // The reconsumed character must go through the before DOCTYPE name state
        // because that is where the DOCTYPE token gets created; going directly to
        // the DOCTYPE name state would append to a token which does not exist.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BEFORE_DOCTYPE_NAME_STATE;
        $this->data->unconsume();
    }

    continue;
}

# 8.2.4.53 Before DOCTYPE name state
elseif ($this->state === self::BEFORE_DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Create a new DOCTYPE token. Set the token's name to the lowercase version of the
        # current input character (add 0x0020 to the character's code point). Switch to
        # the DOCTYPE name state.
        $token = new DOCTYPEToken(strtolower($char));
        // The state lives on the tokenizer, not on the token.
        $this->state = self::DOCTYPE_NAME_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch
        # to the data state. Emit the token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Create a new DOCTYPE token. Set the token's name to the current input character.
        # Switch to the DOCTYPE name state.
        $token = new DOCTYPEToken($char);
        $this->state = self::DOCTYPE_NAME_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.54 DOCTYPE name state
|
|
elseif ($this->state === self::DOCTYPE_NAME_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# Switch to the after DOCTYPE name state.
|
|
$this->state = self::AFTER_DOCTYPE_NAME_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Switch to the data state. Emit the current DOCTYPE token.
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# Uppercase ASCII letter
|
|
elseif (ctype_alpha($char)) {
|
|
# Append the lowercase version of the current input character (add 0x0020 to the
|
|
# character's code point) to the current DOCTYPE token's name.
|
|
|
|
// OPTIMIZATION: Will just check for alpha characters and strtolower the
|
|
// characters.
|
|
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
|
|
// to loop back through here every single time.
|
|
$token->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
|
|
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$token->forceQuirks = true;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the current DOCTYPE token's name.
|
|
|
|
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
|
// to loop back through here every single time.
|
|
$token->name .= $char.$this->data->consumeUntil("\t\n\x0c >".self::CTYPE_ALPHA);
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.55 After DOCTYPE name state
elseif ($this->state === self::AFTER_DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        // Look ahead at the five characters after the current one instead of
        // consuming them and rewinding afterwards: nothing has to be undone when
        // there is no match, including when fewer than five characters remain.
        $keyword = strtolower($char.$this->data->peek(5));

        # If the six characters starting from the current input character are an ASCII
        # case-insensitive match for the word "PUBLIC", then consume those characters and
        # switch to the after DOCTYPE public keyword state.
        if ($keyword === 'public') {
            $this->data->consume(5);
            $this->state = self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
        }
        # Otherwise, if the six characters starting from the current input character are
        # an ASCII case-insensitive match for the word "SYSTEM", then consume those
        # characters and switch to the after DOCTYPE system keyword state.
        elseif ($keyword === 'system') {
            $this->data->consume(5);
            $this->state = self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
        }
        # Otherwise, this is a parse error. Set the DOCTYPE token's force-quirks flag to
        # on. Switch to the bogus DOCTYPE state.
        else {
            // Report the whole offending character; indexing the string by byte
            // would truncate a multibyte character.
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
            $token->forceQuirks = true;
            $this->state = self::BOGUS_DOCTYPE_STATE;
        }
    }

    continue;
}
|
|
|
|
# 8.2.4.56 After DOCTYPE public keyword state
|
|
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# Switch to the before DOCTYPE public identifier state.
|
|
$this->state = self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
|
|
}
|
|
# U+0022 QUOTATION MARK (")
|
|
elseif ($char === '"') {
|
|
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not
|
|
# missing), then switch to the DOCTYPE public identifier (double-quoted) state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '"');
|
|
$token->public = '';
|
|
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
|
|
}
|
|
# "'" (U+0027)
|
|
elseif ($char === "'") {
|
|
# Parse error. Set the DOCTYPE token's public identifier to the empty string (not
|
|
# missing), then switch to the DOCTYPE public identifier (single-quoted) state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, "'");
|
|
$token->public = '';
|
|
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
|
|
# state. Emit that DOCTYPE token.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
|
|
$token->forceQuirks = true;
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
|
|
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$token->forceQuirks = true;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
|
|
# bogus DOCTYPE state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
|
|
$token->forceQuirks = true;
|
|
$this->state = self::BOGUS_DOCTYPE_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.57 Before DOCTYPE public identifier state
|
|
elseif ($this->state === self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# "tab" (U+0009)
|
|
# "LF" (U+000A)
|
|
# "FF" (U+000C)
|
|
# U+0020 SPACE
|
|
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
|
|
# Ignore the character.
|
|
}
|
|
# U+0022 QUOTATION MARK (")
|
|
elseif ($char === '"') {
|
|
# Set the DOCTYPE token's public identifier to the empty string (not missing),
|
|
# then switch to the DOCTYPE public identifier (double-quoted) state.
|
|
$token->public = '';
|
|
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
|
|
}
|
|
# "'" (U+0027)
|
|
elseif ($char === "'") {
|
|
# Set the DOCTYPE token's public identifier to the empty string (not missing),
|
|
# then switch to the DOCTYPE public identifier (single-quoted) state.
|
|
$token->public = '';
|
|
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
|
|
# state. Emit that DOCTYPE token.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
|
|
$token->forceQuirks = true;
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
|
|
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$token->forceQuirks = true;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
|
|
# bogus DOCTYPE state.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, $char);
|
|
$token->forceQuirks = true;
|
|
$this->state = self::BOGUS_DOCTYPE_STATE;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.58 DOCTYPE public identifier (double-quoted) state
|
|
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
|
|
# Consume the next input character
|
|
$char = $this->data->consume();
|
|
|
|
# U+0022 QUOTATION MARK (")
|
|
if ($char === '"') {
|
|
# Switch to the after DOCTYPE public identifier state.
|
|
$this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
|
|
}
|
|
# ">" (U+003E)
|
|
elseif ($char === '>') {
|
|
# Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
|
|
# state. Emit that DOCTYPE token.
|
|
$this->error(ParseError::UNEXPECTED_CHARACTER, '>');
|
|
$token->forceQuirks = true;
|
|
$this->state = self::DATA_STATE;
|
|
return $token;
|
|
}
|
|
# EOF
|
|
elseif ($char === '') {
|
|
# Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
|
|
# to on. Emit that DOCTYPE token. Reconsume the EOF character.
|
|
$this->error(ParseError::UNEXPECTED_EOF);
|
|
$this->state = self::DATA_STATE;
|
|
$token->forceQuirks = true;
|
|
$this->data->unconsume();
|
|
return $token;
|
|
}
|
|
# Anything else
|
|
else {
|
|
# Append the current input character to the current DOCTYPE token's public identifier.
|
|
|
|
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
|
|
// to loop back through here every single time.
|
|
$token->public .= $char.$this->data->consumeUntil('">');
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
# 8.2.4.59 DOCTYPE public identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after DOCTYPE public identifier state.
        $this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // An unterminated public identifier forces quirks mode, exactly as in the
        // double-quoted variant of this state.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's public identifier.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->public .= $char.$this->data->consumeUntil("'>");
    }

    continue;
}
|
|
|
|
# 8.2.4.60 After DOCTYPE public identifier state
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
    // Entered once the closing quote of the public identifier has been seen. Decides
    // whether a system identifier follows, the DOCTYPE ends, or the input is bogus.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the between DOCTYPE public and system identifiers state.
        $this->state = self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.

        // The identifier belongs to the DOCTYPE token, not to the tokenizer; writing it
        // to $this would leave the token's system identifier "missing".
        // NOTE(review): the HTML specification appears to treat a quote directly after
        // the public identifier as a parse error; none is reported here -- confirm
        // whether that omission is intentional.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.

        // Same as the double-quoted case: the empty identifier goes on the token.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.61 Between DOCTYPE public and system identifiers state
elseif ($this->state === self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE) {
    // Skips whitespace after the public identifier and then either starts the system
    // identifier, ends the DOCTYPE, or falls into the bogus DOCTYPE state.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.

        // The identifier belongs to the DOCTYPE token, not to the tokenizer; writing it
        // to $this would leave the token's system identifier "missing".
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.

        // Same as the double-quoted case: the empty identifier goes on the token.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF, 'DOCTYPE public identifier');
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char, 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.62 After DOCTYPE system keyword state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
    // Handles the single character consumed in the after DOCTYPE system keyword state.
    $char = $this->data->consume();

    // Tab, line feed, form feed or space: proceed to the before DOCTYPE system
    // identifier state.
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        $this->state = self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
        continue;
    }

    // A quote with no whitespace in front of it is a parse error, but the system
    // identifier still begins (as the empty string, not missing) in the state that
    // matches the quote character.
    if ($char === '"' || $char === "'") {
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->system = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
        continue;
    }

    // ">" ends the DOCTYPE early: parse error, force-quirks on, back to the data state,
    // and the token is emitted.
    if ($char === '>') {
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }

    // EOF (the stream yields the empty string): parse error, data state, force-quirks
    // on, and the EOF is handed back to the stream before the token is emitted.
    if ($char === '') {
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }

    // Anything else: parse error, force-quirks on, continue in the bogus DOCTYPE state.
    $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    $token->forceQuirks = true;
    $this->state = self::BOGUS_DOCTYPE_STATE;
    continue;
}
|
|
|
|
# 8.2.4.63 Before DOCTYPE system identifier state
elseif ($this->state === self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
    // Skips whitespace and waits for the quote that opens the system identifier.
    $char = $this->data->consume();

    // Tab, line feed, form feed and space are ignored; the state is left unchanged.
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        continue;
    }

    // An opening quote: the system identifier becomes the empty string (not missing)
    // and collection continues in the state that matches the quote character.
    if ($char === '"' || $char === "'") {
        $token->system = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
        continue;
    }

    // ">" before any identifier: parse error, force-quirks on, back to the data state,
    // and the token is emitted.
    if ($char === '>') {
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }

    // EOF (the stream yields the empty string): parse error, data state, force-quirks
    // on, and the EOF is handed back to the stream before the token is emitted.
    if ($char === '') {
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }

    // Anything else: parse error, force-quirks on, continue in the bogus DOCTYPE state.
    $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    $token->forceQuirks = true;
    $this->state = self::BOGUS_DOCTYPE_STATE;
    continue;
}
|
|
|
|
# 8.2.4.64 DOCTYPE system identifier (double-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
    // Accumulates the double-quoted system identifier on the DOCTYPE token until the
    // closing quote, a premature ">", or the end of the input is met.

    # Consume the next input character
    $char = $this->data->consume();

    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the after DOCTYPE system identifier state.
        $this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // The flag must be raised before the token is emitted; an identifier cut short
        // by ">" would otherwise be reported as a well-formed DOCTYPE.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's system identifier.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->system .= $char.$this->data->consumeUntil('">');
    }

    continue;
}
|
|
|
|
# 8.2.4.65 DOCTYPE system identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
    // Accumulates the single-quoted system identifier on the DOCTYPE token until the
    // closing quote, a premature ">", or the end of the input is met.

    # Consume the next input character
    $char = $this->data->consume();

    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after DOCTYPE system identifier state.
        $this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // The flag must be raised before the token is emitted; an identifier cut short
        // by ">" would otherwise be reported as a well-formed DOCTYPE.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's system identifier.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->system .= $char.$this->data->consumeUntil("'>");
    }

    continue;
}
|
|
|
|
# 8.2.4.66 After DOCTYPE system identifier state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
    // Entered once the closing quote of the system identifier has been seen. Nothing
    // but whitespace may legitimately appear before the closing ">".
    //
    // This branch used to mirror the "after DOCTYPE public identifier" state: it
    // switched to a state that no branch handles on whitespace, restarted the system
    // identifier on a quote, and forced quirks mode on stray characters. None of that
    // matches the specification for this state.

    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Switch to the bogus DOCTYPE state. (This does not set the DOCTYPE
        # token's force-quirks flag to on.)

        // Quotes land here as well: a second system identifier is not permitted, so the
        // identifier that was already collected must be left untouched.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }

    continue;
}
|
|
|
|
# 8.2.4.67 Bogus DOCTYPE state
elseif ($this->state === self::BOGUS_DOCTYPE_STATE) {
    // Discards input until the DOCTYPE is closed by ">" or the input ends, then emits
    // the DOCTYPE token as it stands.
    $char = $this->data->consume();

    // Every character other than ">" and EOF (the empty string) is ignored.
    if ($char !== '>' && $char !== '') {
        continue;
    }

    // Both remaining cases switch to the data state and emit the DOCTYPE token.
    $this->state = self::DATA_STATE;

    // Only the EOF is handed back to the stream so that it is seen again afterwards.
    if ($char === '') {
        $this->data->unconsume();
    }

    return $token;
}
|
|
|
|
# 8.2.4.68 CDATA section state
elseif ($this->state === self::CDATA_SECTION_STATE) {
    // Collects everything up to the next "]]>" (or the end of the input) and emits it
    // as a single character token. The terminating "]]>" itself is consumed but is not
    // part of the emitted text.

    # Switch to the data state.
    $this->state = self::DATA_STATE;

    $buffer = '';
    do {
        // Take everything in front of the next "]" and inspect what follows.
        $buffer .= $this->data->consumeUntil(']');
        $lookahead = $this->data->peek(3);

        // Terminator found: swallow it and stop collecting.
        if ($lookahead === ']]>') {
            $this->data->consume(3);
            break;
        }

        // Fewer than three bytes of lookahead: whatever is left is literal text (there
        // may be nothing left at all), after which the stream is stepped back once.
        // NOTE(review): nothing on this path visibly consumes an EOF before unconsume()
        // is called -- confirm that the data stream tolerates this.
        $available = strlen($lookahead);
        if ($available < 3) {
            if ($available > 0) {
                $buffer .= $this->data->consume($available);
            }
            $this->data->unconsume();
            break;
        }

        // The "]" did not start a terminator: keep it as text and search again.
        $buffer .= $this->data->consume();
    } while (true);

    return new CharacterToken($buffer);
}
|
|
|
|
// If this point is reached, none of the state branches handled the current value of
// $this->state. The tokenizer cannot make progress from here, so fail immediately.
|
|
throw new Exception(Exception::TOKENIZER_INVALID_STATE);
|
|
}
|
|
}
|
|
}
|