HTML-DOM/lib/Tokenizer.php
J. King 6b42f08fbc Change some if-the-exception blocks to assertions
This has only been done in some parts of the code that are internal
to the parser at large.
2019-12-12 17:35:24 -05:00

3073 lines
146 KiB
PHP

<?php
declare(strict_types=1);
namespace dW\HTML5;
class Tokenizer {
// The visible code calls $this->error() and assigns $this->errorHandler, neither of
// which is declared in this class; presumably both come from this trait.
// TODO(review): confirm against ParseErrorEmitter.
use ParseErrorEmitter;
// Current tokenizer state: one of the *_STATE constants. The constructor sets it to
// DATA_STATE and createToken() both reads and reassigns it. It is public;
// presumably so that code outside the tokenizer can switch states -- confirm
// against callers.
public $state;
// Input source (a Data instance). createToken() reads from it via consume(),
// unconsume(), consumeUntil(), consumeWhile() and consumeCharacterReference().
protected $data;
// Stack of open elements. createToken() compares its currentNodeName with the name
// of a pending end tag token in the RCDATA, RAWTEXT and script data end tag name
// states.
protected $stack;
// When true, createToken() echoes the name of the current state (looked up in
// STATE_NAMES) at the top of every loop iteration.
public static $debug = false;
// Tokenizer states. Each constant is a distinct integer that createToken()
// compares against $this->state, and that also serves as a key of STATE_NAMES.

// Data, RCDATA, RAWTEXT, script data and PLAINTEXT states
const DATA_STATE = 0;
const RCDATA_STATE = 1;
const RAWTEXT_STATE = 2;
const SCRIPT_DATA_STATE = 3;
const PLAINTEXT_STATE = 4;
// Tag open, end tag open and tag name states
const TAG_OPEN_STATE = 5;
const END_TAG_OPEN_STATE = 6;
const TAG_NAME_STATE = 7;
// RCDATA end tag recognition states
const RCDATA_LESS_THAN_SIGN_STATE = 8;
const RCDATA_END_TAG_OPEN_STATE = 9;
const RCDATA_END_TAG_NAME_STATE = 10;
// RAWTEXT end tag recognition states
const RAWTEXT_LESS_THAN_SIGN_STATE = 11;
const RAWTEXT_END_TAG_OPEN_STATE = 12;
const RAWTEXT_END_TAG_NAME_STATE = 13;
// Script data end tag recognition, escape and double escape states
const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 14;
const SCRIPT_DATA_END_TAG_OPEN_STATE = 15;
const SCRIPT_DATA_END_TAG_NAME_STATE = 16;
const SCRIPT_DATA_ESCAPE_START_STATE = 17;
const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 18;
const SCRIPT_DATA_ESCAPED_STATE = 19;
const SCRIPT_DATA_ESCAPED_DASH_STATE = 20;
const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 21;
const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 22;
const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 23;
const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 24;
const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 25;
const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 26;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 27;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 28;
const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 29;
const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 30;
// Attribute name and attribute value states
const BEFORE_ATTRIBUTE_NAME_STATE = 31;
const ATTRIBUTE_NAME_STATE = 32;
const AFTER_ATTRIBUTE_NAME_STATE = 33;
const BEFORE_ATTRIBUTE_VALUE_STATE = 34;
const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 35;
const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 36;
const ATTRIBUTE_VALUE_UNQUOTED_STATE = 37;
const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 38;
const SELF_CLOSING_START_TAG_STATE = 39;
// Bogus comment, markup declaration and comment states
const BOGUS_COMMENT_STATE = 40;
const MARKUP_DECLARATION_OPEN_STATE = 41;
const COMMENT_START_STATE = 42;
const COMMENT_START_DASH_STATE = 43;
const COMMENT_STATE = 44;
const COMMENT_END_DASH_STATE = 45;
const COMMENT_END_STATE = 46;
const COMMENT_END_BANG_STATE = 47;
// DOCTYPE states
const DOCTYPE_STATE = 48;
const BEFORE_DOCTYPE_NAME_STATE = 49;
const DOCTYPE_NAME_STATE = 50;
const AFTER_DOCTYPE_NAME_STATE = 51;
const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 52;
const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 53;
const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 54;
const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 55;
const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 56;
const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 57;
const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 58;
const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 59;
const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 60;
const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 61;
const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 62;
const BOGUS_DOCTYPE_STATE = 63;
// CDATA section state
const CDATA_SECTION_STATE = 64;
// Human-readable label for every tokenizer state, keyed by the state constant.
// In the visible code it is read only by the debug trace at the top of the
// createToken() loop, which echoes the label of the current state when
// self::$debug is true. Each label is the specification's name for the state
// without the trailing word "state", so every state must have its own label.
const STATE_NAMES = [
    self::DATA_STATE => "Data",
    self::RCDATA_STATE => "RCDATA",
    self::RAWTEXT_STATE => "RAWTEXT",
    self::SCRIPT_DATA_STATE => "Script data",
    self::PLAINTEXT_STATE => "PLAINTEXT",
    self::TAG_OPEN_STATE => "Tag open",
    self::END_TAG_OPEN_STATE => "End tag open",
    self::TAG_NAME_STATE => "Tag name",
    self::RCDATA_LESS_THAN_SIGN_STATE => "RCDATA less-than sign",
    self::RCDATA_END_TAG_OPEN_STATE => "RCDATA end tag open",
    self::RCDATA_END_TAG_NAME_STATE => "RCDATA end tag name",
    self::RAWTEXT_LESS_THAN_SIGN_STATE => "RAWTEXT less than sign",
    self::RAWTEXT_END_TAG_OPEN_STATE => "RAWTEXT end tag open",
    self::RAWTEXT_END_TAG_NAME_STATE => "RAWTEXT end tag name",
    self::SCRIPT_DATA_LESS_THAN_SIGN_STATE => "Script data less-than sign",
    self::SCRIPT_DATA_END_TAG_OPEN_STATE => "Script data end tag open",
    self::SCRIPT_DATA_END_TAG_NAME_STATE => "Script data end tag name",
    self::SCRIPT_DATA_ESCAPE_START_STATE => "Script data escape start",
    self::SCRIPT_DATA_ESCAPE_START_DASH_STATE => "Script data escape start dash",
    self::SCRIPT_DATA_ESCAPED_STATE => "Script data escaped",
    self::SCRIPT_DATA_ESCAPED_DASH_STATE => "Script data escaped dash",
    self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE => "Script data escaped dash dash",
    self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE => "Script data escaped less-than sign",
    self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE => "Script data escaped end tag open",
    self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE => "Script data escaped end tag name",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE => "Script data double escape start",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE => "Script data double escaped",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE => "Script data double escaped dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE => "Script data double escaped dash dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE => "Script data double escaped less-than sign",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE => "Script data double escape end",
    // Was "Before attribute"; the state is the "before attribute name" state.
    self::BEFORE_ATTRIBUTE_NAME_STATE => "Before attribute name",
    self::ATTRIBUTE_NAME_STATE => "Attribute name",
    self::AFTER_ATTRIBUTE_NAME_STATE => "After attribute name",
    self::BEFORE_ATTRIBUTE_VALUE_STATE => "Before attribute value",
    self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => "Attribute value (double quoted)",
    self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => "Attribute value (single quoted)",
    self::ATTRIBUTE_VALUE_UNQUOTED_STATE => "Attribute value (unquoted)",
    self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => "After attribute value (quoted)",
    self::SELF_CLOSING_START_TAG_STATE => "Self-closing start tag",
    self::BOGUS_COMMENT_STATE => "Bogus comment",
    self::MARKUP_DECLARATION_OPEN_STATE => "Markup declaration open",
    self::COMMENT_START_STATE => "Comment start",
    self::COMMENT_START_DASH_STATE => "Comment start dash",
    self::COMMENT_STATE => "Comment",
    self::COMMENT_END_DASH_STATE => "Comment end dash",
    self::COMMENT_END_STATE => "Comment end",
    self::COMMENT_END_BANG_STATE => "Comment end bang",
    self::DOCTYPE_STATE => "DOCTYPE",
    self::BEFORE_DOCTYPE_NAME_STATE => "Before DOCTYPE name",
    self::DOCTYPE_NAME_STATE => "DOCTYPE name",
    self::AFTER_DOCTYPE_NAME_STATE => "After DOCTYPE name",
    self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE => "After DOCTYPE public keyword",
    self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "Before DOCTYPE public identifier",
    self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE public identifier (double quoted)",
    self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE public identifier (single quoted)",
    self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "After DOCTYPE public identifier",
    self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE => "Between DOCTYPE public and system identifiers",
    self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE => "After DOCTYPE system keyword",
    self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "Before DOCTYPE system identifier",
    self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE system identifier (double-quoted)",
    self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE system identifier (single-quoted)",
    self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "After DOCTYPE system identifier",
    // Was "Bogus comment", duplicating the BOGUS_COMMENT_STATE label and making the
    // two states indistinguishable in the debug trace.
    self::BOGUS_DOCTYPE_STATE => "Bogus DOCTYPE",
    self::CDATA_SECTION_STATE => "CDATA section",
];
// Ctype constants
// ASCII character sets passed as the character list to the Data object's
// consumeWhile() / consumeUntil() calls in createToken(): CTYPE_ALPHA to gather a run
// of ASCII letters, CTYPE_UPPER to gather (or stop at) a run of uppercase ASCII
// letters.
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
/**
 * Wire the tokenizer to its collaborators and place it in the data state.
 *
 * @param Data $data Input source that createToken() consumes characters from.
 * @param OpenElementsStack $stack Stack of open elements; its currentNodeName is
 *        compared with pending end tag names in the RCDATA, RAWTEXT and script
 *        data end tag name states.
 * @param ParseError $errorHandler Stored on $this->errorHandler. Presumably read by
 *        the error() method from the ParseErrorEmitter trait -- TODO confirm.
 */
public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
    // Collaborators first; none of these assignments depends on another.
    $this->errorHandler = $errorHandler;
    $this->stack = $stack;
    $this->data = $data;

    // Tokenizing always begins in the data state.
    $this->state = self::DATA_STATE;
}
public function createToken(): Token {
while (true) {
if (self::$debug) {
$state = self::STATE_NAMES[$this->state] ?? "";
assert($state, new Exception(Exception::UNKNOWN_ERROR));
echo "State: $state\n";
unset($state);
}
# 12.2.4.1 Data state
if ($this->state === self::DATA_STATE) {
# Consume the next input character
$char = $this->data->consume();
# U+0026 AMPERSAND (&)
if ($char === '&') {
# Switch to the character reference in data state.
# 8.2.4.2 Character reference in data state:
# Switch to the data state.
# Attempt to consume a character reference, with no additional allowed character.
# If nothing is returned, emit a U+0026 AMPERSAND character (&) token.
# Otherwise, emit the character tokens that were returned.
// DEVIATION: This implementation delegates character reference consumption to a
// dedicated function, which is better suited to the task.
return new CharacterToken($this->data->consumeCharacterReference());
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
# Switch to the tag open state.
$this->state = self::TAG_OPEN_STATE;
continue;
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
return new EOFToken;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil('&<'));
}
}
# 12.2.4.2 Character reference in data state
// OPTIMIZATION: This is instead done in the block above.
# 12.2.4.3 RCDATA state
elseif ($this->state === self::RCDATA_STATE) {
# Consume the next input character
$char = $this->data->consume();
# U+0026 AMPERSAND (&)
if ($char === '&') {
# Switch to the character reference in RCDATA state.
# 8.2.4.4 Character reference in RCDATA state:
# Switch to the RCDATA state.
# Attempt to consume a character reference, with no additional allowed character.
# If nothing is returned, emit a U+0026 AMPERSAND character (&) token.
# Otherwise, emit the character tokens that were returned.
// DEVIATION: This implementation delegates character reference consumption to a
// dedicated function, which is better suited to the task.
return new CharacterToken($this->data->consumeCharacterReference());
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
# Switch to the RCDATA less-than sign state.
$this->state = self::RCDATA_LESS_THAN_SIGN_STATE;
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
return new EOFToken;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil('&<'));
}
continue;
}
# 12.2.4.4 Character reference in RCDATA state
// OPTIMIZATION: This is instead done in the block above.
# 12.2.4.5 RAWTEXT state
elseif ($this->state === self::RAWTEXT_STATE) {
# Consume the next input character
$char = $this->data->consume();
# U+003C LESS-THAN SIGN (<)
if ($char === '<') {
# Switch to the RAWTEXT less-than sign state.
$this->state = self::RAWTEXT_LESS_THAN_SIGN_STATE;
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
return new EOFToken;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil('<'));
}
continue;
}
# 12.2.4.6 Script data state
elseif ($this->state === self::SCRIPT_DATA_STATE) {
# Consume the next input character
$char = $this->data->consume();
# U+003C LESS-THAN SIGN (<)
if ($char === '<') {
# Switch to the script data less-than sign state.
$this->state = self::SCRIPT_DATA_LESS_THAN_SIGN_STATE;
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
return new EOFToken;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil('<'));
}
continue;
}
# 12.2.4.7 PLAINTEXT state
elseif ($this->state === self::PLAINTEXT_STATE) {
# Consume the next input character
$char = $this->data->consume();
# EOF
if ($char === '') {
# Emit an end-of-file token.
return new EOFToken;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil(''));
}
}
# 12.2.4.8 Tag open state
elseif ($this->state === self::TAG_OPEN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# U+0021 EXCLAMATION MARK (!)
if ($char === '!') {
# Switch to the markup declaration open state.
$this->state = self::MARKUP_DECLARATION_OPEN_STATE;
}
# U+002F SOLIDUS (/)
elseif ($char === '/') {
# Switch to the end tag open state.
$this->state = self::END_TAG_OPEN_STATE;
}
# Uppercase ASCII letter
# Lowercase ASCII letter
elseif (ctype_alpha($char)) {
# Uppercase:
# Create a new start tag token, set its tag name to the lowercase version of the
# current input character (add 0x0020 to the character's code point), then switch
# to the tag name state. (Don't emit the token yet; further details will be filled
# in before it is emitted.)
# Lowercase:
# Create a new start tag token, set its tag name to the current input character,
# then switch to the tag name state. (Don't emit the token yet; further details
# will be filled in before it is emitted.)
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token = new StartTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
$this->state = self::TAG_NAME_STATE;
}
# U+003F QUESTION MARK (?)
elseif ($char === '?') {
# Parse error. Switch to the bogus comment state.
// Making errors more expressive.
if ($char !== '') {
$this->error(ParseError::TAG_NAME_EXPECTED);
} else {
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::BOGUS_COMMENT_STATE;
}
# Anything else
else {
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
# token. Reconsume the current input character.
// Making errors more expressive.
if ($char !== '') {
$this->error(ParseError::TAG_NAME_EXPECTED);
} else {
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
continue;
}
# 8.2.4.9 End tag open state
elseif ($this->state === self::END_TAG_OPEN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# Uppercase ASCII letter
# Lowercase ASCII letter
if (ctype_alpha($char)) {
# Uppercase:
# Create a new end tag token, set its tag name to the lowercase version of the
# current input character (add 0x0020 to the character's code point), then switch
# to the tag name state. (Don't emit the token yet; further details will be filled
# in before it is emitted.)
# Lowercase:
# Create a new end tag token, set its tag name to the current input character,
# then switch to the tag name state. (Don't emit the token yet; further details
# will be filled in before it is emitted.)
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token = new EndTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
$this->state = self::TAG_NAME_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Parse error. Switch to the data state.
$this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::DATA_STATE;
}
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character
# token and a U+002F SOLIDUS character token. Reconsume the EOF character.
// Making errors more expressive.
$this->error(ParseError::UNEXPECTED_EOF);
$this->state = self::DATA_STATE;
$this->data->unconsume();
return new CharacterToken('</');
}
# Anything else
else {
# Parse error. Switch to the bogus comment state.
$this->error(ParseError::TAG_NAME_EXPECTED);
$this->state = self::BOGUS_COMMENT_STATE;
}
continue;
}
# 8.2.4.10 Tag name state
elseif ($this->state === self::TAG_NAME_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
# Switch to the before attribute name state.
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
}
# "/" (U+002F)
elseif ($char === '/') {
# Switch to the self-closing start tag state.
$this->state = self::SELF_CLOSING_START_TAG_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
return $token;
}
# Uppercase ASCII letter
elseif (ctype_upper($char)) {
# Append the lowercase version of the current input character (add 0x0020 to the
# character's code point) to the current tag token's tag name.
// OPTIMIZATION: Consume all characters that are Uppercase ASCII characters to
// prevent having to loop back through here every single time.
$token->name = $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
}
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
// Making errors more expressive.
if ($char !== '') {
$this->error(ParseError::TAG_NAME_EXPECTED, $char);
} else {
$this->error(ParseError::UNEXPECTED_EOF);
}
$this->state = self::DATA_STATE;
$this->data->unconsume();
}
# Anything else
else {
# Append the current input character to the current tag token's tag name.
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->name = $token->name.$char.$this->data->consumeUntil("\t\n\x0c />".self::CTYPE_UPPER);
}
continue;
}
# 8.2.4.11 RCDATA less-than sign state
elseif ($this->state === self::RCDATA_LESS_THAN_SIGN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string. Switch to the RCDATA end tag open
# state.
$temporaryBuffer = '';
$this->state = self::RCDATA_END_TAG_OPEN_STATE;
}
# Anything else
else {
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token.
# Reconsume the current input character.
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('<');
}
continue;
}
# 8.2.4.12 RCDATA end tag open state
elseif ($this->state === self::RCDATA_END_TAG_OPEN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# Uppercase ASCII letter
# Lowercase ASCII letter
if (ctype_alpha($char)) {
# Uppercase:
# Create a new end tag token, and set its tag name to the lowercase version of the
# current input character (add 0x0020 to the character's code point). Append the
# current input character to the temporary buffer. Finally, switch to the RCDATA
# end tag name state. (Don't emit the token yet; further details will be filled in
# before it is emitted.)
# Lowercase:
# Create a new end tag token, and set its tag name to the current input character.
# Append the current input character to the temporary buffer. Finally, switch to
# the RCDATA end tag name state. (Don't emit the token yet; further details will
# be filled in before it is emitted.)
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token = new EndTagToken(strtolower($char));
$temporaryBuffer .= $char;
$this->state = self::RCDATA_END_TAG_NAME_STATE;
continue;
}
# Anything else
else {
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token and a
# U+002F SOLIDUS character token. Reconsume the current input character.
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('</');
}
}
# 8.2.4.13 RCDATA end tag name state
elseif ($this->state === self::RCDATA_END_TAG_NAME_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
# If the current end tag token is an appropriate end tag token, then switch to the
# before attribute name state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token, then switch to the
# self-closing start tag state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token, then switch to the
# data state and emit the current tag token. Otherwise, treat it as per the
# "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
return $token;
} else {
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
}
# Uppercase ASCII letter
# Lowercase ASCII letter
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character (add 0x0020 to the
# character's code point) to the current tag token's tag name. Append the current
# input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current tag token's tag name. Append
# the current input character to the temporary buffer.
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token->name .= $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
$temporaryBuffer .= $char;
}
# Anything else
else {
# Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token, a
# U+002F SOLIDUS character token, and a character token for each of the characters
# in the temporary buffer (in the order they were added to the buffer). Reconsume
# the current input character.
$this->state = self::RCDATA_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
continue;
}
# 8.2.4.14 RAWTEXT less-than sign state
elseif ($this->state === self::RAWTEXT_LESS_THAN_SIGN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string. Switch to the RAWTEXT end tag open
# state.
$temporaryBuffer = '';
$this->state = self::RAWTEXT_END_TAG_OPEN_STATE;
}
# Anything else
else {
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token.
# Reconsume the current input character.
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('<');
}
continue;
}
# 8.2.4.15 RAWTEXT end tag open state
elseif ($this->state === self::RAWTEXT_END_TAG_OPEN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# Uppercase ASCII letter
# Lowercase ASCII letter
if (ctype_alpha($char)) {
# Uppercase:
# Create a new end tag token, and set its tag name to the lowercase version of the
# current input character (add 0x0020 to the character's code point). Append the
# current input character to the temporary buffer. Finally, switch to the RAWTEXT
# end tag name state. (Don't emit the token yet; further details will be filled in
# before it is emitted.)
# Lowercase:
# Create a new end tag token, and set its tag name to the current input character.
# Append the current input character to the temporary buffer. Finally, switch to
# the RAWTEXT end tag name state. (Don't emit the token yet; further details will
# be filled in before it is emitted.)
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
$token = new EndTagToken(strtolower($char));
$temporaryBuffer .= $char;
$this->state = self::RAWTEXT_END_TAG_NAME_STATE;
}
# Anything else
else {
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token and a
# U+002F SOLIDUS character token. Reconsume the current input character.
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('</');
}
continue;
}
# 8.2.4.16 RAWTEXT end tag name state
elseif ($this->state === self::RAWTEXT_END_TAG_NAME_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
# If the current end tag token is an appropriate end tag token, then switch to the
# before attribute name state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
continue;
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token, then switch to the
# self-closing start tag state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
continue;
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token, then switch to the
# data state and emit the current tag token. Otherwise, treat it as per the
# "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
return $token;
} else {
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
}
continue;
}
# Uppercase ASCII letter
# Lowercase ASCII letter
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character (add 0x0020 to the
# character's code point) to the current tag token's tag name. Append the current
# input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current tag token's tag name. Append
# the current input character to the temporary buffer.
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token->name .= $token->name.strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
$temporaryBuffer .= $char;
}
# Anything else
else {
# Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token, a
# U+002F SOLIDUS character token, and a character token for each of the characters
# in the temporary buffer (in the order they were added to the buffer). Reconsume
# the current input character.
$this->state = self::RAWTEXT_STATE;
$this->data->unconsume();
return new CharacterToken('</'.$temporaryBuffer);
continue;
}
}
# 8.2.4.17 Script data less-than sign state
elseif ($this->state === self::SCRIPT_DATA_LESS_THAN_SIGN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string. Switch to the script data end tag
# open state.
$temporaryBuffer = '';
$this->state = self::SCRIPT_DATA_END_TAG_OPEN_STATE;
continue;
}
# "!" (U+0021)
elseif ($char === '!') {
# Switch to the script data escape start state. Emit a U+003C LESS-THAN SIGN
# character token and a U+0021 EXCLAMATION MARK character token.
$this->state = self::SCRIPT_DATA_ESCAPE_START_STATE;
return new CharacterToken('<!');
}
# Anything else
else {
# Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token.
# Reconsume the current input character.
$this->state = self::SCRIPT_DATA_STATE;
$this->data->unconsume();
return new CharacterToken('<');
continue;
}
}
# 8.2.4.18 Script data end tag open state
elseif ($this->state === self::SCRIPT_DATA_END_TAG_OPEN_STATE) {
# Consume the next input character
$char = $this->data->consume();
# Uppercase ASCII letter
# Lowercase ASCII letter
if (ctype_alpha($char)) {
# Uppercase:
# Create a new end tag token, and set its tag name to the lowercase version of the
# current input character (add 0x0020 to the character's code point). Append the
# current input character to the temporary buffer. Finally, switch to the script
# data end tag name state. (Don't emit the token yet; further details will be
# filled in before it is emitted.)
# Lowercase:
# Create a new end tag token, and set its tag name to the current input character.
# Append the current input character to the temporary buffer. Finally, switch to
# the script data end tag name state. (Don't emit the token yet; further details
# will be filled in before it is emitted.)
// OPTIMIZATION: Will just check for alpha characters and strtolower the
// characters.
$token = new EndTagToken(strtolower($char));
$temporaryBuffer .= $char;
$this->state = self::SCRIPT_DATA_END_TAG_NAME_STATE;
}
# Anything else
else {
# Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token
# and a U+002F SOLIDUS character token. Reconsume the current input character.
$this->state = self::SCRIPT_DATA_STATE;
$this->data->unconsume();
return new CharacterToken('</');
}
continue;
}
# 8.2.4.19 Script data end tag name state
elseif ($this->state === self::SCRIPT_DATA_END_TAG_NAME_STATE) {
    // Builds up the name of a candidate end tag. When the name is terminated the
    // token is only accepted if its name equals the stack's current node name;
    // otherwise "</" plus the temporary buffer is emitted as character data.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::SELF_CLOSING_START_TAG_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::DATA_STATE;
            return $token;
        } else {
            $this->state = self::SCRIPT_DATA_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.
        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        // The whole run of letters is taken at once: the tag name receives the
        // lowercased run exactly once, while the temporary buffer receives the run
        // unmodified so the fallback character token reproduces the input as written.
        $char .= $this->data->consumeWhile(self::CTYPE_ALPHA);
        $token->name .= strtolower($char);
        $temporaryBuffer .= $char;
    }
    # Anything else
    else {
        # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token, a
        # U+002F SOLIDUS character token, and a character token for each of the characters
        # in the temporary buffer (in the order they were added to the buffer). Reconsume
        # the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();
        return new CharacterToken('</'.$temporaryBuffer);
    }
    continue;
}
# 8.2.4.20 Script data escape start state
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_STATE) {
    // A hyphen is emitted and moves on to the escape start dash state; any other
    // character is pushed back for the script data state to handle.
    # Consume the next input character
    $char = $this->data->consume();
    # Anything other than "-" (U+002D)
    if ($char !== '-') {
        # Switch to the script data state. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # "-" (U+002D)
    # Switch to the script data escape start dash state. Emit a U+002D HYPHEN-MINUS
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPE_START_DASH_STATE;
    return new CharacterToken('-');
}
# 8.2.4.21 Script data escape start dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_DASH_STATE) {
    // A second hyphen is emitted and enters the escaped dash dash state; any other
    // character is pushed back for the script data state to handle.
    # Consume the next input character
    $char = $this->data->consume();
    # Anything other than "-" (U+002D)
    if ($char !== '-') {
        # Switch to the script data state. Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # "-" (U+002D)
    # Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
    return new CharacterToken('-');
}
# 8.2.4.22 Script data escaped state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_STATE) {
    // Plain text is emitted in bulk; only "-", "<" and EOF change the state.
    # Consume the next input character
    $char = $this->data->consume();
    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();
        continue;
    }
    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
        continue;
    }
    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->state = self::SCRIPT_DATA_ESCAPED_DASH_STATE;
        return new CharacterToken('-');
    }
    # Anything else
    # Emit the current input character as a character token.
    // OPTIMIZATION: The characters following the current one are consumed with
    // consumeUntil('-<') and emitted in the same token, which avoids a trip around
    // the loop for each of them.
    return new CharacterToken($char.$this->data->consumeUntil('-<'));
}
# 8.2.4.23 Script data escaped dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_STATE) {
    // One hyphen has been seen; a second one advances to the dash dash state and
    // ordinary text falls back to the escaped state.
    # Consume the next input character
    $char = $this->data->consume();
    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();
        continue;
    }
    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
        continue;
    }
    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->state = self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
        return new CharacterToken('-');
    }
    # Anything else
    # Switch to the script data escaped state. Emit the current input character as a
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
    return new CharacterToken($char);
}
# 8.2.4.24 Script data escaped dash dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE) {
    // Two hyphens have been seen: further hyphens keep the state, ">" returns to
    // script data, and ordinary text falls back to the escaped state.
    # Consume the next input character
    $char = $this->data->consume();
    # EOF
    if ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->state = self::DATA_STATE;
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->data->unconsume();
        continue;
    }
    # "<" (U+003C)
    if ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
        continue;
    }
    # "-" (U+002D)
    if ($char === '-') {
        # Emit a U+002D HYPHEN-MINUS character token.
        return new CharacterToken('-');
    }
    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the script data state. Emit a U+003E GREATER-THAN SIGN character
        # token.
        $this->state = self::SCRIPT_DATA_STATE;
        return new CharacterToken('>');
    }
    # Anything else
    # Switch to the script data escaped state. Emit the current input character as a
    # character token.
    $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
    return new CharacterToken($char);
}
# 8.2.4.25 Script data escaped less-than sign state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE) {
    // A "<" has been consumed inside escaped script data. "/" may begin an end
    // tag, a letter may begin a nested "<script", and anything else means the "<"
    // was ordinary text.
    # Consume the next input character
    $char = $this->data->consume();
    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data escaped
        # end tag open state.
        // The buffer must be reset (assigned, not appended to): the end tag states
        // append to it and emit its contents on fallback.
        $temporaryBuffer = '';
        $this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Set the temporary buffer to the empty string. Append the lowercase version of
        # the current input character (add 0x0020 to the character's code point) to the
        # temporary buffer. Switch to the script data double escape start state. Emit a
        # U+003C LESS-THAN SIGN character token and the current input character as a
        # character token.
        # Lowercase:
        # Set the temporary buffer to the empty string. Append the current input character
        # to the temporary buffer. Switch to the script data double escape start state.
        # Emit a U+003C LESS-THAN SIGN character token and the current input character as
        # a character token.
        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        $temporaryBuffer = strtolower($char);
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
        return new CharacterToken('<'.$char);
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token. Reconsume the current input character.
        // Only the "<" is emitted here: the current character has been pushed back
        // and will be emitted by the state that reconsumes it.
        $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        $this->data->unconsume();
        return new CharacterToken('<');
    }
    continue;
}
# 8.2.4.26 Script data escaped end tag open state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) {
    // A letter starts a new end tag token; anything else means "</" was text.
    # Consume the next input character
    $char = $this->data->consume();
    # Anything else
    if (!ctype_alpha($char)) {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token and a U+002F SOLIDUS character token. Reconsume the current input
        # character.
        $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        $this->data->unconsume();
        return new CharacterToken('</');
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    # Create a new end tag token, and set its tag name to the lowercase version of the
    # current input character. Append the current input character to the temporary
    # buffer. Finally, switch to the script data escaped end tag name state. (Don't
    # emit the token yet; further details will be filled in before it is emitted.)
    // OPTIMIZATION: Both letter cases are handled by one ctype_alpha() test and a
    // strtolower() on the tag name; the temporary buffer keeps the character as
    // written.
    $token = new EndTagToken(strtolower($char));
    $temporaryBuffer .= $char;
    $this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;
    continue;
}
# 8.2.4.27 Script data escaped end tag name state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE) {
    // Builds up the name of a candidate end tag inside escaped script data. When
    // the name is terminated the token is only accepted if its name equals the
    // stack's current node name; otherwise "</" plus the temporary buffer is
    // emitted as character data and tokenizing resumes in the escaped state.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::SELF_CLOSING_START_TAG_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
        if ($token->name === $this->stack->currentNodeName) {
            $this->state = self::DATA_STATE;
            return $token;
        } else {
            $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
            $this->data->unconsume();
            return new CharacterToken('</'.$temporaryBuffer);
        }
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.
        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        // The whole run of letters is taken at once: the tag name receives the
        // lowercased run exactly once, while the temporary buffer receives the run
        // unmodified so the fallback character token reproduces the input as written.
        $char .= $this->data->consumeWhile(self::CTYPE_ALPHA);
        $token->name .= strtolower($char);
        $temporaryBuffer .= $char;
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token, a U+002F SOLIDUS character token, and a character token for each of the
        # characters in the temporary buffer (in the order they were added to the
        # buffer). Reconsume the current input character.
        $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        $this->data->unconsume();
        return new CharacterToken('</'.$temporaryBuffer);
    }
    continue;
}
# 8.2.4.29 Script data double escaped state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE) {
    // Emits script text one character at a time while double escaped, switching
    // state on "-", "<", ">" and EOF.
    // NOTE(review): the HTML5 tokenizer specification sends "-" from this state to
    // the script data double escaped dash state and treats ">" as ordinary text;
    // the "-" and ">" transitions below differ from that. They are left untouched
    // because the handlers for the dash states are not visible from here —
    // confirm against the specification.
    # Consume the next input character
    $char = $this->data->consume();
    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data double escaped dash dash state. Emit a U+002D
        # HYPHEN-MINUS character token.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
        return new CharacterToken('-');
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data double escaped less-than sign state. Emit a U+003C
        # LESS-THAN SIGN character token.
        // The constant carries the SCRIPT_DATA_ prefix like every other state
        // constant of this class; it is the one the less-than sign handler tests.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
        return new CharacterToken('<');
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the script data state. Emit a U+003E GREATER-THAN SIGN character
        # token.
        $this->state = self::SCRIPT_DATA_STATE;
        return new CharacterToken('>');
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Emit the current input character
        # as a character token.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        return new CharacterToken($char);
    }
    continue;
}
# 8.2.4.32 Script data double escaped less-than sign state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE) {
    // A "<" has been emitted while double escaped. "/" may begin the "</script"
    // that ends the double escape; anything else returns to the double escaped
    // state.
    # Consume the next input character
    $char = $this->data->consume();
    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data double
        # escape end state. Emit a U+002F SOLIDUS character token.
        $temporaryBuffer = '';
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
        return new CharacterToken('/');
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.
        // The state has to be assigned before the character is pushed back; if it
        // were left unchanged this branch would consume and push back the same
        // character on every pass of the loop.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        $this->data->unconsume();
    }
    continue;
}
# 8.2.4.33 Script data double escape end state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
    // Collects the letters following "</" in the temporary buffer (lowercased)
    // while emitting them as text. When the name ends, the buffer decides whether
    // the double escape is over ("script") or continues.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    # "/" (U+002F)
    # ">" (U+003E)
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>') {
        # If the temporary buffer is the string "script", then switch to the script data
        # escaped state. Otherwise, switch to the script data double escaped state. Emit
        # the current input character as a character token.
        if ($temporaryBuffer === 'script') {
            $this->state = self::SCRIPT_DATA_ESCAPED_STATE;
        } else {
            $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        }
        // The character is emitted whichever state was chosen.
        return new CharacterToken($char);
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the temporary buffer. Emit the current input
        # character as a character token.
        # Lowercase:
        # Append the current input character to the temporary buffer. Emit the current
        # input character as a character token.
        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $char = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
        $temporaryBuffer .= strtolower($char);
        return new CharacterToken($char);
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.
        $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        $this->data->unconsume();
    }
    continue;
}
# 8.2.4.34 Before attribute name state
elseif ($this->state === self::BEFORE_ATTRIBUTE_NAME_STATE) {
    // Skips whitespace between attributes, ends the tag on ">" or starts a new
    // attribute. An attribute that is still held in $attribute is stored on the
    // token before the tag is emitted or a new attribute replaces it.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        // Need to add the current attribute to the token, if necessary. This state
        // is also reached after a complete attribute (e.g. a quoted value followed
        // by whitespace), whose attribute has not been stored on the token yet.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        return $token;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # lowercase version of the current input character (add 0x0020 to the character's
        # code point), and its value to the empty string. Switch to the attribute name
        # state.
        // Need to add the current attribute to the token, if necessary.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        $attribute = new TokenAttr(strtolower($char), '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # Anything else
    else {
        # Quotes, less than sign, equals:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # current input character, and its value to the empty string. Switch to the
        # attribute name state.
        if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }
        // Need to add the current attribute to the token, if necessary.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        $attribute = new TokenAttr($char, '');
        $this->state = self::ATTRIBUTE_NAME_STATE;
    }
    continue;
}
# 8.2.4.35 Attribute name state
elseif ($this->state === self::ATTRIBUTE_NAME_STATE) {
    // Extends the name of the attribute held in $attribute until a character that
    // ends the name is met.
    # Consume the next input character
    $char = $this->data->consume();
    # "=" (U+003D)
    if ($char === '=') {
        if ($token instanceof StartTagToken && $token->hasAttribute($attribute->name)) {
            $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
        }
        # Switch to the before attribute value state.
        $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
    }
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    # "/" (U+002F)
    # U+003E GREATER-THAN SIGN (>)
    # EOF
    elseif (in_array($char, ["\t", "\n", "\x0c", ' ', '/', '>', ''], true)) {
        // NOTE(review): unlike the "=" case above, this check is not restricted to
        // StartTagToken — confirm whether that difference is intended.
        if ($token->hasAttribute($attribute->name)) {
            $this->error(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
        }
        # Reconsume in the after attribute name state.
        $this->data->unconsume();
        $this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current attribute's name.
        // OPTIMIZATION: The whole run of uppercase ASCII letters is consumed and
        // lowercased in one step.
        $attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # Anything else
    else {
        # Quotes, less than sign:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's name.
        // "=" never reaches this branch; it is handled at the top of the chain.
        if ($char === '"' || $char === "'" || $char === '<') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }
        // OPTIMIZATION: Everything up to the next character that one of the branches
        // of this state treats specially is appended in one step.
        $attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER);
    }
    # When the user agent leaves the attribute name state (and before emitting the tag
    # token, if appropriate), the complete attribute's name must be compared to the
    # other attributes on the same token; if there is already an attribute on the
    # token with the exact same name, then this is a parse error and the new attribute
    # must be removed from the token.
    // DEVIATION: The attribute is kept in a separate variable while it is being
    // built rather than on the token. This state only reports the duplicate; it
    // does not discard the attribute itself.
    continue;
}
# 8.2.4.36 After attribute name state
elseif ($this->state === self::AFTER_ATTRIBUTE_NAME_STATE) {
    // An attribute name is complete. The input either supplies a value ("="),
    // ends the tag, or begins a further attribute.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Ignore the character.
        continue;
    }
    # U+002F SOLIDUS (/)
    if ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
        continue;
    }
    # U+003D EQUALS SIGN (=)
    if ($char === '=') {
        # Switch to the before attribute value state.
        $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
        continue;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # Parse error. Treat it as per the "anything else" entry below.
    // "=" cannot occur at this point; it was handled above.
    if ($char === '"' || $char === "'" || $char === '<') {
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    }
    // Every remaining case ends the tag or starts another attribute, so the
    // attribute built so far is added to the token first, if there is one.
    if ($attribute) {
        $token->attributes[] = $attribute;
        $attribute = null;
    }
    # U+003E GREATER-THAN SIGN (>)
    if ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # Uppercase ASCII letter
    # Anything else
    # Start a new attribute in the current tag token. Set that attribute's name to the
    # current input character (lowercased if it is an uppercase ASCII letter), and its
    # value to the empty string. Switch to the attribute name state.
    $attribute = new TokenAttr(ctype_upper($char) ? strtolower($char) : $char, '');
    $this->state = self::ATTRIBUTE_NAME_STATE;
    continue;
}
# 8.2.4.37 Before attribute value state
elseif ($this->state === self::BEFORE_ATTRIBUTE_VALUE_STATE) {
    // Decides which kind of attribute value follows: double-quoted,
    // single-quoted or unquoted.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Ignore the character.
        continue;
    }
    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the attribute value (double-quoted) state.
        $this->state = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
        continue;
    }
    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the attribute value (single-quoted) state.
        $this->state = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
        continue;
    }
    # U+0026 AMPERSAND (&)
    if ($char === '&') {
        # Switch to the attribute value (unquoted) state. Reconsume the current input
        # character.
        $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
        $this->data->unconsume();
        continue;
    }
    # ">" (U+003E)
    if ($char === '>') {
        # Parse error. Switch to the data state. Emit the current tag token.
        $this->error(ParseError::UNEXPECTED_END_OF_TAG);
        $this->state = self::DATA_STATE;
        // The attribute under construction is added to the token before the token
        // is emitted.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # "<" (U+003C)
    # "=" (U+003D)
    # "`" (U+0060)
    # Parse error. Treat it as per the "anything else" entry below.
    if ($char === '<' || $char === '=' || $char === '`') {
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    }
    # Anything else
    # Append the current input character to the current attribute's value. Switch to
    # the attribute value (unquoted) state.
    $attribute->value .= $char;
    $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
    continue;
}
# 8.2.4.38 Attribute value (double-quoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
    // Appends to the current attribute's value until the closing quotation mark.
    # Consume the next input character
    $char = $this->data->consume();
    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the after attribute value (quoted) state.
        $this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
        continue;
    }
    # U+0026 AMPERSAND (&)
    if ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being U+0022 QUOTATION MARK (").
        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.
        // DEVIATION: There is no separate state for this; the data stream's
        // consumeCharacterReference() does the work and its result is appended
        // as-is. Presumably it returns "&" when no reference matches — verify.
        $attribute->value .= $this->data->consumeCharacterReference('"', true);
        continue;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # Anything else
    # Append the current input character to the current attribute's value.
    // OPTIMIZATION: The characters that follow are consumed with
    // consumeUntil('"&') and appended in the same step.
    $attribute->value .= $char.$this->data->consumeUntil('"&');
    continue;
}
# 8.2.4.39 Attribute value (single-quoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
    // Appends to the current attribute's value until the closing apostrophe.
    # Consume the next input character
    $char = $this->data->consume();
    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after attribute value (quoted) state.
        $this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
        continue;
    }
    # U+0026 AMPERSAND (&)
    if ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being "'" (U+0027).
        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.
        // DEVIATION: There is no separate state for this; the data stream's
        // consumeCharacterReference() does the work and its result is appended
        // as-is. Presumably it returns "&" when no reference matches — verify.
        $attribute->value .= $this->data->consumeCharacterReference("'", true);
        continue;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        continue;
    }
    # Anything else
    # Append the current input character to the current attribute's value.
    // OPTIMIZATION: The characters that follow are consumed with
    // consumeUntil("'&") and appended in the same step.
    $attribute->value .= $char.$this->data->consumeUntil("'&");
    continue;
}
# 8.2.4.40 Attribute value (unquoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_UNQUOTED_STATE) {
    // Appends to the current attribute's value until whitespace or ">" ends it.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before attribute name state.
        // Whitespace ends an unquoted value; whatever follows belongs to the next
        // attribute, not to this value.
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
    }
    # U+0026 AMPERSAND (&)
    elseif ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being ">" (U+003E).
        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.
        // DEVIATION: This implementation does the character reference consuming in a
        // function for which it is more suited for.
        $attribute->value .= $this->data->consumeCharacterReference('>', true);
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        // Need to add the current attribute to the token, if necessary.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # "`" (U+0060)
    # Anything else
    else {
        # Quotes, less than sign, equals, tick:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's value.
        if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }
        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`");
    }
    continue;
}
# 8.2.4.42 After attribute value (quoted) state
elseif ($this->state === self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
    // A quoted value has just been closed. Only whitespace, "/" and ">" may
    // follow without a parse error.
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->state = self::DATA_STATE;
        // The attribute under construction is added to the token before the token
        // is emitted.
        if ($attribute) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        return $token;
    }
    # "/" (U+002F)
    if ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->state = self::SELF_CLOSING_START_TAG_STATE;
        continue;
    }
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Switch to the before attribute name state.
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
        continue;
    }
    // What remains is a parse error either way; the input is pushed back and only
    // the error code and the next state differ.
    if ($char === '') {
        # EOF
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
    } else {
        # Anything else
        # Parse error. Switch to the before attribute name state. Reconsume the character.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
    }
    $this->data->unconsume();
    continue;
}
# 8.2.4.43 Self-closing start tag state
elseif ($this->state === self::SELF_CLOSING_START_TAG_STATE) {
    // A "/" has been seen inside a tag; only ">" completes the self-closing tag.
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Set the self-closing flag of the current tag token. Switch to the data state.
        # Emit the current tag token.
        $token->selfClosing = true;
        $this->state = self::DATA_STATE;
        // An attribute still under construction is added to the token before the
        // token is emitted; there may be none at all.
        if ($attribute ?? null) {
            $token->attributes[] = $attribute;
            $attribute = null;
        }
        return $token;
    }
    // What remains is a parse error either way; the input is pushed back and only
    // the error code and the next state differ.
    if ($char === '') {
        # EOF
        # Parse error. Switch to the data state. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
    } else {
        # Anything else
        # Parse error. Switch to the before attribute name state. Reconsume the character.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
    }
    $this->data->unconsume();
    continue;
}
# 8.2.4.44 Bogus comment state
elseif ($this->state === self::BOGUS_COMMENT_STATE) {
    # Consume every character up to and including the first ">" (U+003E) character or
    # the end of the file (EOF), whichever comes first. Emit a comment token whose
    # data is the concatenation of all the characters starting from and including the
    # character that caused the state machine to switch into the bogus comment state,
    # up to and including the character immediately before the last consumed character
    # (i.e. up to the character just before the U+003E or EOF character), but with any
    # U+0000 NULL characters replaced by U+FFFD REPLACEMENT CHARACTER characters. (If
    # the comment was started by the end of the file (EOF), the token is empty.
    # Similarly, the token is empty if it was generated by the string "<!>".)

    // $char is NOT consumed here: it still holds the character that the
    // previous state consumed before switching to this one, and it becomes the
    // first character of the comment.
    // NOTE(review): no U+0000 -> U+FFFD replacement happens in this block;
    // confirm whether NULL characters are handled elsewhere.

    // If that carried-over character is itself ">" (the "<!>" case), the
    // comment has already been terminated: emit an empty comment instead of
    // treating ">" as data and swallowing input up to the next ">".
    if ($char === '>') {
        # Switch to the data state.
        $this->state = self::DATA_STATE;
        return new CommentToken();
    }

    $char = $char.$this->data->consumeUntil('>');
    // Consume the terminating ">" (or hit EOF).
    $nextChar = $this->data->consume();
    # Switch to the data state.
    $this->state = self::DATA_STATE;
    # If the end of the file was reached, reconsume the EOF character.
    if ($nextChar === '') {
        $this->data->unconsume();
    }
    return new CommentToken($char);
}
# 8.2.4.45 Markup declaration open state
elseif ($this->state === self::MARKUP_DECLARATION_OPEN_STATE) {
    // Nothing is consumed until a look-ahead has identified which construct
    // follows: comment, DOCTYPE, CDATA section, or (failing all) a bogus comment.

    # If the next two characters are both "-" (U+002D) characters, consume those two
    # characters, create a comment token whose data is the empty string, and switch to
    # the comment start state.
    if ($this->data->peek(2) === '--') {
        $this->data->consume(2);
        $token = new CommentToken();
        $this->state = self::COMMENT_START_STATE;
    }
    # Otherwise, if the next seven characters are an ASCII case-insensitive match for
    # the word "DOCTYPE", then consume those characters and switch to the DOCTYPE
    # state.
    elseif (strtolower($this->data->peek(7)) === 'doctype') {
        $this->data->consume(7);
        $this->state = self::DOCTYPE_STATE;
    }
    # Otherwise, if there is an adjusted current node and it is not an element in the
    # HTML namespace and the next seven characters are a case-sensitive match for the
    # string "[CDATA[" (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE
    # BRACKET character before and after), then consume those characters and switch to
    # the CDATA section state.
    elseif (
        ($adjustedCurrentNode = $this->stack->adjustedCurrentNode)
        && $adjustedCurrentNode->namespace !== Parser::HTML_NAMESPACE
        && $this->data->peek(7) === '[CDATA['
    ) {
        $this->data->consume(7);
        $this->state = self::CDATA_SECTION_STATE;
    }
    # Otherwise, this is a parse error. Switch to the bogus comment state. The next
    # character that is consumed, if any, is the first character that will be in the
    # comment.
    else {
        // The character is consumed here so it can be named in the error; the
        // bogus comment state picks it up again from $char.
        $char = $this->data->consume();
        if ($char === '') {
            $this->error(ParseError::UNEXPECTED_EOF);
        } else {
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        }
        $this->state = self::BOGUS_COMMENT_STATE;
    }
    continue;
}
# 8.2.4.46 Comment start state
elseif ($this->state === self::COMMENT_START_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    switch ($char) {
        # "-" (U+002D)
        case '-':
            # Switch to the comment start dash state.
            $this->state = self::COMMENT_START_DASH_STATE;
            break;
        # ">" (U+003E)
        case '>':
            # Parse error. Switch to the data state. Emit the comment token.
            $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
            $this->state = self::DATA_STATE;
            return $token;
        # EOF
        case '':
            # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
            # character.
            $this->error(ParseError::UNEXPECTED_EOF);
            $this->state = self::DATA_STATE;
            $this->data->unconsume();
            return $token;
        # Anything else
        default:
            # Append the current input character to the comment token's data. Switch to the
            # comment state.
            $token->data .= $char;
            $this->state = self::COMMENT_STATE;
    }
    continue;
}
# 8.2.4.47 Comment start dash state
elseif ($this->state === self::COMMENT_START_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the comment end state.
        $this->state = self::COMMENT_END_STATE;
        continue;
    }
    // ">" and EOF both end the comment prematurely: report the error, return
    // to the data state and emit what has been collected so far.
    if ($char === '>' || $char === '') {
        if ($char === '>') {
            # Parse error. Switch to the data state. Emit the comment token.
            $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
            $this->state = self::DATA_STATE;
        } else {
            # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
            # character.
            $this->error(ParseError::UNEXPECTED_EOF);
            $this->state = self::DATA_STATE;
            $this->data->unconsume();
        }
        return $token;
    }
    # Anything else
    # Append a "-" (U+002D) character and the current input character to the comment
    # token's data. Switch to the comment state.
    $token->data .= '-'.$char;
    $this->state = self::COMMENT_STATE;
    continue;
}
# 8.2.4.48 Comment state
elseif ($this->state === self::COMMENT_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        return $token;
    }
    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the comment end dash state
        $this->state = self::COMMENT_END_DASH_STATE;
        continue;
    }
    # Anything else
    # Append the current input character to the comment token's data.
    // OPTIMIZATION: Take the whole run up to the next "-" in one go instead of
    // coming back through this state once per character.
    $token->data .= $char.$this->data->consumeUntil('-');
    continue;
}
# 8.2.4.49 Comment end dash state
elseif ($this->state === self::COMMENT_END_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        return $token;
    }
    if ($char === '-') {
        # "-" (U+002D)
        # Switch to the comment end state
        $this->state = self::COMMENT_END_STATE;
    } else {
        # Anything else
        # Append a "-" (U+002D) character and the current input character to the comment
        # token's data. Switch to the comment state.
        // The single "-" seen before this state turned out to be ordinary data.
        $token->data .= '-'.$char;
        $this->state = self::COMMENT_STATE;
    }
    continue;
}
# 8.2.4.50 Comment end state
elseif ($this->state === self::COMMENT_END_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the data state. Emit the comment token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        return $token;
    }

    if ($char === '!') {
        # "!" (U+0021)
        # Parse error. Switch to the comment end bang state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '!');
        $this->state = self::COMMENT_END_BANG_STATE;
    } elseif ($char === '-') {
        # "-" (U+002D)
        # Parse error. Append a "-" (U+002D) character to the comment token's data.
        // OPTIMIZATION: Swallow the entire run of dashes at once, then report
        // one parse error per dash exactly as the one-at-a-time path would.
        $char .= $this->data->consumeWhile('-');
        for ($i = strlen($char); $i > 0; $i--) {
            $this->error(ParseError::UNEXPECTED_CHARACTER, '-');
        }
        $token->data .= $char;
    } else {
        # Anything else
        # Parse error. Append two "-" (U+002D) characters and the current input character
        # to the comment token's data. Switch to the comment state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->data .= '--'.$char;
        $this->state = self::COMMENT_STATE;
    }
    continue;
}
# 8.2.4.51 Comment end bang state
elseif ($this->state === self::COMMENT_END_BANG_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the data state. Emit the comment token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $this->data->unconsume();
        return $token;
    }
    // The "--!" seen so far was not a terminator after all, so it becomes data.
    if ($char === '-') {
        # "-" (U+002D)
        # Append two "-" (U+002D) characters and a "!" (U+0021) character to the comment
        # token's data. Switch to the comment end dash state.
        $token->data .= '--!';
        $this->state = self::COMMENT_END_DASH_STATE;
    } else {
        # Anything else
        # Append two "-" (U+002D) characters, a "!" (U+0021) character, and the current
        # input character to the comment token's data. Switch to the comment state.
        $token->data .= '--!'.$char;
        $this->state = self::COMMENT_STATE;
    }
    continue;
}
# 8.2.4.52 DOCTYPE state
elseif ($this->state === self::DOCTYPE_STATE) {
    // This state only checks what immediately follows the DOCTYPE keyword.
    // The DOCTYPE token is created by the before DOCTYPE name state below
    // (or right here when the input ends).
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE name state.
        // Going through the before DOCTYPE name state (rather than straight to
        // the DOCTYPE name state) lets any further whitespace be skipped and
        // guarantees a token exists before the name is appended to.
        $this->state = self::BEFORE_DOCTYPE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Switch to the before DOCTYPE name state. Reconsume the character.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BEFORE_DOCTYPE_NAME_STATE;
        $this->data->unconsume();
    }
    continue;
}
# 8.2.4.53 Before DOCTYPE name state
elseif ($this->state === self::BEFORE_DOCTYPE_NAME_STATE) {
    // Skips leading whitespace, then creates the DOCTYPE token from the first
    // character of the name (or emits a force-quirks token if there is none).
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Create a new DOCTYPE token. Set the token's name to the lowercase version of the
        # current input character (add 0x0020 to the character's code point). Switch to
        # the DOCTYPE name state.
        $token = new DOCTYPEToken(strtolower($char));
        // The state lives on the tokenizer, not on the token.
        $this->state = self::DOCTYPE_NAME_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch
        # to the data state. Emit the token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Create a new DOCTYPE token. Set the token's name to the current input character.
        # Switch to the DOCTYPE name state.
        $token = new DOCTYPEToken($char);
        $this->state = self::DOCTYPE_NAME_STATE;
    }
    continue;
}
# 8.2.4.54 DOCTYPE name state
elseif ($this->state === self::DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the after DOCTYPE name state.
        $this->state = self::AFTER_DOCTYPE_NAME_STATE;
        continue;
    }

    if (ctype_alpha($char)) {
        # Uppercase ASCII letter
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current DOCTYPE token's name.
        // OPTIMIZATION: Any letter (not just uppercase) takes this path; the
        // whole run of letters is consumed at once and lowercased in one call.
        $token->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA));
    } else {
        # Anything else
        # Append the current input character to the current DOCTYPE token's name.
        // OPTIMIZATION: Consume up to the next character that one of the
        // other branches of this state has to look at.
        $token->name .= $char.$this->data->consumeUntil("\t\n\x0c >".self::CTYPE_ALPHA);
    }
    continue;
}
# 8.2.4.55 After DOCTYPE name state
elseif ($this->state === self::AFTER_DOCTYPE_NAME_STATE) {
    // After the DOCTYPE name: skip whitespace, finish on ">" or EOF, otherwise
    // expect one of the keywords PUBLIC or SYSTEM.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        // Look ahead at the next five characters without consuming them, so
        // nothing has to be put back when the keyword does not match (putting
        // back a fixed count is wrong when fewer characters remain).
        $keyword = strtolower($char.$this->data->peek(5));
        # If the six characters starting from the current input character are an ASCII
        # case-insensitive match for the word "PUBLIC", then consume those characters and
        # switch to the after DOCTYPE public keyword state.
        if ($keyword === 'public') {
            $this->data->consume(5);
            $this->state = self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
        }
        # Otherwise, if the six characters starting from the current input character are
        # an ASCII case-insensitive match for the word "SYSTEM", then consume those
        # characters and switch to the after DOCTYPE system keyword state.
        elseif ($keyword === 'system') {
            $this->data->consume(5);
            $this->state = self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
        }
        # Otherwise, this is a parse error. Set the DOCTYPE token's force-quirks flag to
        # on. Switch to the bogus DOCTYPE state.
        else {
            // Only the current character stays consumed; report it whole
            // rather than just its first byte.
            $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
            $token->forceQuirks = true;
            $this->state = self::BOGUS_DOCTYPE_STATE;
        }
    }
    continue;
}
# 8.2.4.56 After DOCTYPE public keyword state
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    if ($char === '"' || $char === "'") {
        # Parse error. Set the DOCTYPE token's public identifier to the empty string (not
        # missing), then switch to the DOCTYPE public identifier (double-quoted or
        # single-quoted, matching the quote character) state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->public = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE public identifier state.
        $this->state = self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }
    continue;
}
# 8.2.4.57 Before DOCTYPE public identifier state
elseif ($this->state === self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        continue;
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    if ($char === '"' || $char === "'") {
        # Set the DOCTYPE token's public identifier to the empty string (not missing),
        # then switch to the DOCTYPE public identifier state matching the quote character.
        $token->public = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
        continue;
    }
    # ">" (U+003E)
    if ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
    # bogus DOCTYPE state.
    $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    $token->forceQuirks = true;
    $this->state = self::BOGUS_DOCTYPE_STATE;
    continue;
}
# 8.2.4.58 DOCTYPE public identifier (double-quoted) state
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # ">" (U+003E)
    if ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    if ($char === '"') {
        # U+0022 QUOTATION MARK (")
        # Switch to the after DOCTYPE public identifier state.
        $this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    } else {
        # Anything else
        # Append the current input character to the current DOCTYPE token's public identifier.
        // OPTIMIZATION: Take everything up to the closing quote or ">" at once
        // rather than re-entering this state for each character.
        $token->public .= $char.$this->data->consumeUntil('">');
    }
    continue;
}
# 8.2.4.59 DOCTYPE public identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
    // Collects the characters of a single-quoted public identifier until the
    // closing quote; ">" or EOF abort the DOCTYPE in force-quirks mode.
    # Consume the next input character
    $char = $this->data->consume();
    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after DOCTYPE public identifier state.
        $this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // The flag must be set here just as in the double-quoted variant.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's public identifier.
        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->public .= $char.$this->data->consumeUntil("'>");
    }
    continue;
}
# 8.2.4.60 After DOCTYPE public identifier state
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
    // The public identifier has been closed; what follows is either the end
    // of the DOCTYPE, whitespace, or the opening quote of a system identifier.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the between DOCTYPE public and system identifiers state.
        $this->state = self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.
        // The identifier belongs to the token, not to the tokenizer.
        // NOTE(review): the HTML5 spec appears to treat a quote directly after
        // the public identifier as a parse error; none is reported here -- confirm.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }
    continue;
}
# 8.2.4.61 Between DOCTYPE public and system identifiers state
elseif ($this->state === self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE) {
    // Whitespace has separated the public identifier from whatever follows;
    // skip more whitespace, end the DOCTYPE, or start a system identifier.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.
        // The identifier belongs to the token, not to the tokenizer.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.
        $token->system = '';
        $this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        // Same argument list as the EOF error in the neighbouring DOCTYPE states.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }
    continue;
}
# 8.2.4.62 After DOCTYPE system keyword state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    if ($char === '"' || $char === "'") {
        # Parse error. Set the DOCTYPE token's system identifier to the empty string (not
        # missing), then switch to the DOCTYPE system identifier (double-quoted or
        # single-quoted, matching the quote character) state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->system = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE system identifier state.
        $this->state = self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $token->forceQuirks = true;
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }
    continue;
}
# 8.2.4.63 Before DOCTYPE system identifier state
elseif ($this->state === self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        continue;
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    if ($char === '"' || $char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier state matching the quote character.
        $token->system = '';
        $this->state = ($char === '"')
            ? self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE
            : self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
        continue;
    }
    # ">" (U+003E)
    if ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    if ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
    # bogus DOCTYPE state.
    $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
    $token->forceQuirks = true;
    $this->state = self::BOGUS_DOCTYPE_STATE;
    continue;
}
# 8.2.4.64 DOCTYPE system identifier (double-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
    // Collects the characters of a double-quoted system identifier until the
    // closing quote; ">" or EOF abort the DOCTYPE in force-quirks mode.
    # Consume the next input character
    $char = $this->data->consume();
    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the after DOCTYPE system identifier state.
        $this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // The identifier was cut short, so the token must be flagged.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's system identifier.
        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->system .= $char.$this->data->consumeUntil('">');
    }
    continue;
}
# 8.2.4.65 DOCTYPE system identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
    // Collects the characters of a single-quoted system identifier until the
    // closing quote; ">" or EOF abort the DOCTYPE in force-quirks mode.
    # Consume the next input character
    $char = $this->data->consume();
    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after DOCTYPE system identifier state.
        $this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.
        $this->error(ParseError::UNEXPECTED_CHARACTER, '>');
        // The identifier was cut short, so the token must be flagged.
        $token->forceQuirks = true;
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's system identifier.
        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->system .= $char.$this->data->consumeUntil("'>");
    }
    continue;
}
# 8.2.4.66 After DOCTYPE system identifier state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
    // Entered once the closing quote of the system identifier has been consumed.
    // Only whitespace, ">" and EOF are meaningful here. Unlike the state that
    // follows the *public* identifier, a quote does not open another identifier:
    // it is just an unexpected character, and the token's identifiers are left
    // untouched.
    # Consume the next input character
    $char = $this->data->consume();
    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        // Stay in this state; the loop comes back here for the next character.
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->state = self::DATA_STATE;
        return $token;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        $this->error(ParseError::UNEXPECTED_EOF);
        $this->state = self::DATA_STATE;
        $token->forceQuirks = true;
        $this->data->unconsume();
        return $token;
    }
    # Anything else
    else {
        # Parse error. Switch to the bogus DOCTYPE state. (This does not set the DOCTYPE
        # token's force-quirks flag to on.)
        $this->error(ParseError::UNEXPECTED_CHARACTER, $char);
        $this->state = self::BOGUS_DOCTYPE_STATE;
    }
    continue;
}
# 8.2.4.67 Bogus DOCTYPE state
elseif ($this->state === self::BOGUS_DOCTYPE_STATE) {
    // Skips over the remainder of a malformed DOCTYPE. The token is emitted as it
    // stands as soon as ">" or EOF is reached; nothing further is recorded on it.
    # Consume the next input character
    $char = $this->data->consume();
    # Anything else
    # Ignore the character.
    if ($char !== '>' && $char !== '') {
        continue;
    }
    # ">" (U+003E) or EOF
    # Switch to the data state. Emit the DOCTYPE token.
    $this->state = self::DATA_STATE;
    if ($char === '') {
        // Only the EOF case is pushed back onto the input; the ">" is consumed
        // for good.
        $this->data->unconsume();
    }
    return $token;
}
# 8.2.4.68 CDATA section state
elseif ($this->state === self::CDATA_SECTION_STATE) {
    // Collects the whole CDATA section into a single character token. Every path
    // through this branch returns, so the tokenizer is switched back to the data
    // state up front.
    # Switch to the data state.
    $this->state = self::DATA_STATE;
    # Consume every character up to the next occurrence of the three character
    # sequence U+005D RIGHT SQUARE BRACKET U+005D RIGHT SQUARE BRACKET U+003E
    # GREATER-THAN SIGN (]]>), or the end of the file (EOF), whichever comes first.
    # Emit a series of character tokens consisting of all the characters consumed
    # except the matching three character sequence at the end (if one was found before
    # the end of the file).
    $text = '';
    do {
        // Take everything up to the next "]" candidate, then look at what follows
        // without consuming it.
        $text .= $this->data->consumeUntil(']');
        $ahead = $this->data->peek(3);
        if ($ahead === ']]>') {
            // Terminator found: swallow it without adding it to the output.
            $this->data->consume(3);
            break;
        }
        $remaining = strlen($ahead);
        if ($remaining < 3) {
            // Fewer than three bytes are left, so the terminator cannot occur any
            // more: keep whatever remains as text.
            if ($remaining > 0) {
                $text .= $this->data->consume($remaining);
            }
            # If the end of the file was reached, reconsume the EOF character.
            $this->data->unconsume();
            break;
        }
        // A "]" that does not start "]]>" is ordinary text; keep it and go on.
        $text .= $this->data->consume();
    } while (true);
    return new CharacterToken($text);
}
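// If this point is reached, none of the state handlers above returned or
// continued, which presumably means $this->state holds a value the tokenizer
// does not know. Throw rather than let the loop spin forever.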
throw new Exception(Exception::TOKENIZER_INVALID_STATE);
}
}
}