A modern, accurate HTML parser and serializer for PHP
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

3700 lines
173 KiB

<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\Parser;
use MensBeam\HTML\Parser;
use MensBeam\Intl\Encoding\UTF8;
class Tokenizer {
// NOTE(review): presumably supplies the error() method and the $errorHandler
// property used by this class; the trait itself is not visible here -- confirm.
use ParseErrorEmitter;
// Current tokenizer state: one of the *_STATE constants below. Initialised to
// DATA_STATE by the constructor and reassigned by tokenize() on each transition.
// It is public, so code outside this class can also read or set it.
public $state;
// Human-readable trace of tokenize(). It is appended to only from inside
// assert() calls, so it stays empty unless PHP is evaluating assertions.
public $debugLog = '';
// Counter used to number the "TOKEN n" entries written to $debugLog.
public $debugCount = 0;
// Input source (a Data instance) from which tokenize() consumes characters.
protected $data;
// Stack of open elements; its currentNodeName is compared against end tag
// names to decide whether an end tag token is an "appropriate end tag token".
protected $stack;
// The HTML Standard's "temporary buffer": reset in the *_LESS_THAN_SIGN states
// and appended to in the *_END_TAG_NAME states so that the raw characters can
// be re-emitted as text if the end tag turns out not to be appropriate.
protected $temporaryBuffer = "";
// Tokenizer state identifiers, stored in $this->state and used as the keys of
// STATE_NAMES. The values are consecutive integers; for the states handled in
// the visible part of tokenize() the value equals the final component of the
// section number quoted in that state's comment (e.g. "13.2.5.1 Data state"
// is 1 and "13.2.5.20 Script data escaped state" is 20).
const DATA_STATE = 1;
const RCDATA_STATE = 2;
const RAWTEXT_STATE = 3;
const SCRIPT_DATA_STATE = 4;
const PLAINTEXT_STATE = 5;
const TAG_OPEN_STATE = 6;
const END_TAG_OPEN_STATE = 7;
const TAG_NAME_STATE = 8;
const RCDATA_LESS_THAN_SIGN_STATE = 9;
const RCDATA_END_TAG_OPEN_STATE = 10;
const RCDATA_END_TAG_NAME_STATE = 11;
const RAWTEXT_LESS_THAN_SIGN_STATE = 12;
const RAWTEXT_END_TAG_OPEN_STATE = 13;
const RAWTEXT_END_TAG_NAME_STATE = 14;
const SCRIPT_DATA_LESS_THAN_SIGN_STATE = 15;
const SCRIPT_DATA_END_TAG_OPEN_STATE = 16;
const SCRIPT_DATA_END_TAG_NAME_STATE = 17;
const SCRIPT_DATA_ESCAPE_START_STATE = 18;
const SCRIPT_DATA_ESCAPE_START_DASH_STATE = 19;
const SCRIPT_DATA_ESCAPED_STATE = 20;
const SCRIPT_DATA_ESCAPED_DASH_STATE = 21;
const SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 22;
const SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 23;
const SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 24;
const SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 25;
const SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 26;
const SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 27;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 28;
const SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 29;
const SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 30;
const SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 31;
const BEFORE_ATTRIBUTE_NAME_STATE = 32;
const ATTRIBUTE_NAME_STATE = 33;
const AFTER_ATTRIBUTE_NAME_STATE = 34;
const BEFORE_ATTRIBUTE_VALUE_STATE = 35;
const ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 36;
const ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 37;
const ATTRIBUTE_VALUE_UNQUOTED_STATE = 38;
const AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 39;
const SELF_CLOSING_START_TAG_STATE = 40;
const BOGUS_COMMENT_STATE = 41;
const MARKUP_DECLARATION_OPEN_STATE = 42;
const COMMENT_START_STATE = 43;
const COMMENT_START_DASH_STATE = 44;
const COMMENT_STATE = 45;
const COMMENT_LESS_THAN_SIGN_STATE = 46;
const COMMENT_LESS_THAN_SIGN_BANG_STATE = 47;
const COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE = 48;
const COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE = 49;
const COMMENT_END_DASH_STATE = 50;
const COMMENT_END_STATE = 51;
const COMMENT_END_BANG_STATE = 52;
const DOCTYPE_STATE = 53;
const BEFORE_DOCTYPE_NAME_STATE = 54;
const DOCTYPE_NAME_STATE = 55;
const AFTER_DOCTYPE_NAME_STATE = 56;
const AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 57;
const BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 58;
const DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 59;
const DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 60;
const AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 61;
const BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 62;
const AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 63;
const BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 64;
const DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 65;
const DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 66;
const AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 67;
const BOGUS_DOCTYPE_STATE = 68;
const CDATA_SECTION_STATE = 69;
const CDATA_SECTION_BRACKET_STATE = 70;
const CDATA_SECTION_END_STATE = 71;
const CHARACTER_REFERENCE_STATE = 72;
const NAMED_CHARACTER_REFERENCE_STATE = 73;
const AMBIGUOUS_AMPERSAND_STATE = 74;
const NUMERIC_CHARACTER_REFERENCE_STATE = 75;
const HEXADECIMAL_CHARACTER_REFERENCE_START_STATE = 76;
const DECIMAL_CHARACTER_REFERENCE_START_STATE = 77;
const HEXADECIMAL_CHARACTER_REFERENCE_STATE = 78;
const DECIMAL_CHARACTER_REFERENCE_STATE = 79;
const NUMERIC_CHARACTER_REFERENCE_END_STATE = 80;
/**
 * Human-readable name for every tokenizer state, keyed by state constant.
 *
 * tokenize() looks the current state up here when writing its debug log, and
 * falls back to the raw integer if a state has no entry. Each label is the
 * state's name as given in the HTML Standard, minus the trailing " state",
 * and must be unique so that the log identifies the state unambiguously.
 */
const STATE_NAMES = [
    self::DATA_STATE => "Data",
    self::RCDATA_STATE => "RCDATA",
    self::RAWTEXT_STATE => "RAWTEXT",
    self::SCRIPT_DATA_STATE => "Script data",
    self::PLAINTEXT_STATE => "PLAINTEXT",
    self::TAG_OPEN_STATE => "Tag open",
    self::END_TAG_OPEN_STATE => "End tag open",
    self::TAG_NAME_STATE => "Tag name",
    self::RCDATA_LESS_THAN_SIGN_STATE => "RCDATA less-than sign",
    self::RCDATA_END_TAG_OPEN_STATE => "RCDATA end tag open",
    self::RCDATA_END_TAG_NAME_STATE => "RCDATA end tag name",
    // Hyphenated like every other "less-than sign" label
    self::RAWTEXT_LESS_THAN_SIGN_STATE => "RAWTEXT less-than sign",
    self::RAWTEXT_END_TAG_OPEN_STATE => "RAWTEXT end tag open",
    self::RAWTEXT_END_TAG_NAME_STATE => "RAWTEXT end tag name",
    self::SCRIPT_DATA_LESS_THAN_SIGN_STATE => "Script data less-than sign",
    self::SCRIPT_DATA_END_TAG_OPEN_STATE => "Script data end tag open",
    self::SCRIPT_DATA_END_TAG_NAME_STATE => "Script data end tag name",
    self::SCRIPT_DATA_ESCAPE_START_STATE => "Script data escape start",
    self::SCRIPT_DATA_ESCAPE_START_DASH_STATE => "Script data escape start dash",
    self::SCRIPT_DATA_ESCAPED_STATE => "Script data escaped",
    self::SCRIPT_DATA_ESCAPED_DASH_STATE => "Script data escaped dash",
    self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE => "Script data escaped dash dash",
    self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE => "Script data escaped less-than sign",
    self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE => "Script data escaped end tag open",
    self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE => "Script data escaped end tag name",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE => "Script data double escape start",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE => "Script data double escaped",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE => "Script data double escaped dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE => "Script data double escaped dash dash",
    self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE => "Script data double escaped less-than sign",
    self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE => "Script data double escape end",
    // Previously truncated to "Before attribute"
    self::BEFORE_ATTRIBUTE_NAME_STATE => "Before attribute name",
    self::ATTRIBUTE_NAME_STATE => "Attribute name",
    self::AFTER_ATTRIBUTE_NAME_STATE => "After attribute name",
    self::BEFORE_ATTRIBUTE_VALUE_STATE => "Before attribute value",
    // "(double-quoted)" / "(single-quoted)" are hyphenated throughout, matching
    // the DOCTYPE system identifier labels and the HTML Standard
    self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE => "Attribute value (double-quoted)",
    self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE => "Attribute value (single-quoted)",
    self::ATTRIBUTE_VALUE_UNQUOTED_STATE => "Attribute value (unquoted)",
    self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE => "After attribute value (quoted)",
    self::SELF_CLOSING_START_TAG_STATE => "Self-closing start tag",
    self::BOGUS_COMMENT_STATE => "Bogus comment",
    self::MARKUP_DECLARATION_OPEN_STATE => "Markup declaration open",
    self::COMMENT_START_STATE => "Comment start",
    self::COMMENT_START_DASH_STATE => "Comment start dash",
    self::COMMENT_STATE => "Comment",
    self::COMMENT_LESS_THAN_SIGN_STATE => "Comment less-than sign",
    self::COMMENT_LESS_THAN_SIGN_BANG_STATE => "Comment less-than sign bang",
    self::COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE => "Comment less-than sign bang dash",
    self::COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE => "Comment less-than sign bang dash dash",
    self::COMMENT_END_DASH_STATE => "Comment end dash",
    self::COMMENT_END_STATE => "Comment end",
    self::COMMENT_END_BANG_STATE => "Comment end bang",
    self::DOCTYPE_STATE => "DOCTYPE",
    self::BEFORE_DOCTYPE_NAME_STATE => "Before DOCTYPE name",
    self::DOCTYPE_NAME_STATE => "DOCTYPE name",
    self::AFTER_DOCTYPE_NAME_STATE => "After DOCTYPE name",
    self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE => "After DOCTYPE public keyword",
    self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "Before DOCTYPE public identifier",
    self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE public identifier (double-quoted)",
    self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE public identifier (single-quoted)",
    self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE => "After DOCTYPE public identifier",
    self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE => "Between DOCTYPE public and system identifiers",
    self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE => "After DOCTYPE system keyword",
    self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "Before DOCTYPE system identifier",
    self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => "DOCTYPE system identifier (double-quoted)",
    self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => "DOCTYPE system identifier (single-quoted)",
    self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE => "After DOCTYPE system identifier",
    self::BOGUS_DOCTYPE_STATE => "Bogus DOCTYPE",
    self::CDATA_SECTION_STATE => "CDATA section",
    self::CDATA_SECTION_BRACKET_STATE => "CDATA section bracket",
    self::CDATA_SECTION_END_STATE => "CDATA section end",
    self::CHARACTER_REFERENCE_STATE => "Character reference",
    self::NAMED_CHARACTER_REFERENCE_STATE => "Named character reference",
    self::AMBIGUOUS_AMPERSAND_STATE => "Ambiguous ampersand",
    self::NUMERIC_CHARACTER_REFERENCE_STATE => "Numeric character reference",
    self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE => "Hexadecimal character reference start",
    self::DECIMAL_CHARACTER_REFERENCE_START_STATE => "Decimal character reference start",
    self::HEXADECIMAL_CHARACTER_REFERENCE_STATE => "Hexadecimal character reference",
    self::DECIMAL_CHARACTER_REFERENCE_STATE => "Decimal character reference",
    // Previously shared its label with NUMERIC_CHARACTER_REFERENCE_STATE,
    // which made the two states indistinguishable in the debug log
    self::NUMERIC_CHARACTER_REFERENCE_END_STATE => "Numeric character reference end",
];
// The three attribute value states, gathered into one list so that a return
// state can be tested for membership in a single check.
// NOTE(review): no use of this constant appears in the visible part of the
// file; presumably the character reference handling consults it -- confirm.
const ATTRIBUTE_VALUE_STATE_SET = [
# A character reference is said to be consumed as part of an attribute
# if the return state is either attribute value (double-quoted) state,
# attribute value (single-quoted) state or attribute value (unquoted) state.
self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
self::ATTRIBUTE_VALUE_UNQUOTED_STATE
];
// Ctype constants
// ASCII character classes written out as literal character lists. tokenize()
// passes them to $this->data->consumeWhile() and ->consumeUntil() as the set
// of characters to match, and concatenates them with other characters to
// build larger sets.
// Uppercase ASCII letters
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
// ASCII letters of either case
const CTYPE_ALPHA = self::CTYPE_UPPER.'abcdefghijklmnopqrstuvwxyz';
// ASCII decimal digits
const CTYPE_NUM = '0123456789';
// ASCII letters and decimal digits
const CTYPE_ALNUM = self::CTYPE_ALPHA.self::CTYPE_NUM;
// ASCII hexadecimal digits, with A-F in both cases
const CTYPE_HEX = self::CTYPE_NUM.'ABCDEFabcdef';
/**
 * Create a tokenizer positioned in the data state.
 *
 * @param Data $data Input source from which tokenize() consumes characters
 * @param OpenElementsStack $stack Stack of open elements; tokenize() reads its
 *     current node name when deciding whether an end tag is appropriate
 * @param ParseError|null $errorHandler Stored as-is on the instance and may be
 *     null. NOTE(review): presumably read by the ParseErrorEmitter trait when
 *     error() is called -- the trait is not visible here, confirm.
 */
public function __construct(Data $data, OpenElementsStack $stack, ?ParseError $errorHandler) {
    // Collaborators first, then the initial state of the state machine
    $this->errorHandler = $errorHandler;
    $this->stack = $stack;
    $this->data = $data;
    $this->state = self::DATA_STATE;
}
/**
 * Strip the parts of a tag token which are not allowed on end tags.
 *
 * Tokens which are not end tags are left untouched. For an end tag, each
 * offending feature is reported as a parse error and then removed, so the
 * token is modified in place.
 *
 * @param TagToken $token The tag token about to be emitted
 */
protected function sanitizeTag(TagToken $token): void {
    // Only end tags are subject to the checks below
    if (!($token instanceof EndTagToken)) {
        return;
    }

    # When an end tag token is emitted with attributes,
    # that is an end-tag-with-attributes parse error.
    if ($token->attributes) {
        $this->error(ParseError::END_TAG_WITH_ATTRIBUTES);
        $token->attributes = [];
    }

    # When an end tag token is emitted with its self-closing
    # flag set, that is an end-tag-with-trailing-solidus parse error.
    if ($token->selfClosing) {
        $this->error(ParseError::END_TAG_WITH_TRAILING_SOLIDUS);
        $token->selfClosing = false;
    }
}
/**
 * Add a completed attribute to a tag token unless its name is already taken.
 *
 * If the token already has an attribute of the same name, the new attribute is
 * not added and a duplicate-attribute parse error is reported instead, with
 * the attribute name passed along as the error's argument.
 *
 * @param TagToken $token The tag token being built
 * @param TokenAttr $attribute The attribute which has just been completed
 */
protected function keepOrDiscardAttribute(TagToken $token, TokenAttr $attribute): void {
    // See 13.2.5.33 Attribute name state
    # When the user agent leaves the attribute name state
    # (and before emitting the tag token, if appropriate),
    # the complete attribute's name must be compared to the
    # other attributes on the same token; if there is already
    # an attribute on the token with the exact same name,
    # then this is a duplicate-attribute parse error and the
    # new attribute must be removed from the token.
    // The attribute was never attached to the token in the first place,
    // so "removing" it amounts to simply not appending it
    if (!$token->hasAttribute($attribute->name)) {
        $token->attributes[] = $attribute;
        return;
    }

    $this->error(ParseError::DUPLICATE_ATTRIBUTE, $attribute->name);
}
public function tokenize(): \Generator {
Consume:
assert((function() {
$this->debugLog .= "TOKEN ".++$this->debugCount."\n";
return true;
})());
while (true) {
// OPTIMIZATION: All but one state consumes; we instead do so
// here unless the state is the exception; this allows us to
// reconsume more efficiently when needed
if ($this->state !== self::MARKUP_DECLARATION_OPEN_STATE) {
$char = $this->data->consume();
}
Reconsume:
assert((function() use ($char) {
$state = self::STATE_NAMES[$this->state] ?? $this->state;
$this->debugLog .= " State: $state ($char)\n";
return true;
})());
# 13.2.5.1 Data state
if ($this->state === self::DATA_STATE) {
# Consume the next input character
# U+0026 AMPERSAND (&)
if ($char === '&') {
# Set the return state to the data state.
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
$outChar = $this->switchToCharacterReferenceState(self::DATA_STATE);
if (strspn($outChar, Data::WHITESPACE)) {
yield new WhitespaceToken($outChar); // a character reference is either all whitespace or is not whitespace
} else {
yield new CharacterToken($outChar);
}
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
# Switch to the tag open state.
$this->state = self::TAG_OPEN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit the current input character as a character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new NullCharacterToken($char);
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("&<\0"));
}
}
}
# 13.2.5.2 RCDATA state
elseif ($this->state === self::RCDATA_STATE) {
# Consume the next input character
# U+0026 AMPERSAND (&)
if ($char === '&') {
# Set the return state to the RCDATA state.
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
$outChar = $this->switchToCharacterReferenceState(self::RCDATA_STATE);
if (strspn($outChar, Data::WHITESPACE)) {
yield new WhitespaceToken($outChar); // a character reference is either all whitespace or is not whitespace
} else {
yield new CharacterToken($outChar);
}
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
# Switch to the RCDATA less-than sign state.
$this->state = self::RCDATA_LESS_THAN_SIGN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("&<\0"));
}
}
}
# 13.2.5.3 RAWTEXT state
elseif ($this->state === self::RAWTEXT_STATE) {
# Consume the next input character
# U+003C LESS-THAN SIGN (<)
if ($char === '<') {
# Switch to the RAWTEXT less-than sign state.
$this->state = self::RAWTEXT_LESS_THAN_SIGN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("<\0"));
}
}
}
# 13.2.5.4 Script data state
elseif ($this->state === self::SCRIPT_DATA_STATE) {
# Consume the next input character
# U+003C LESS-THAN SIGN (<)
if ($char === '<') {
# Switch to the script data less-than sign state.
$this->state = self::SCRIPT_DATA_LESS_THAN_SIGN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("<\0"));
}
}
}
# 13.2.5.5 PLAINTEXT state
elseif ($this->state === self::PLAINTEXT_STATE) {
# Consume the next input character
# U+0000 NULL
if ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# Emit an end-of-file token.
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("\0"));
}
}
}
# 13.2.5.6 Tag open state
elseif ($this->state === self::TAG_OPEN_STATE) {
# Consume the next input character
# U+0021 EXCLAMATION MARK (!)
if ($char === '!') {
# Switch to the markup declaration open state.
$this->state = self::MARKUP_DECLARATION_OPEN_STATE;
}
# U+002F SOLIDUS (/)
elseif ($char === '/') {
# Switch to the end tag open state.
$this->state = self::END_TAG_OPEN_STATE;
}
# ASCII alpha
elseif (ctype_alpha($char)) {
# Create a new start tag token, set its tag name to the empty string.
# Reconsume in the tag name state.
// OPTIMIZATION:
// Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token = new StartTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
$this->state = self::TAG_NAME_STATE;
}
# U+003F QUESTION MARK (?)
elseif ($char === '?') {
# This is an unexpected-question-mark-instead-of-tag-name parse error.
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
$token = new ProcessingInstructionToken("");
$this->state = self::BOGUS_COMMENT_STATE;
goto Reconsume;
}
# EOF
elseif ($char === '') {
# This is an eof-before-tag-name parse error.
# Emit a U+003C LESS-THAN SIGN character token and an end-of-file token.
$this->error(ParseError::EOF_BEFORE_TAG_NAME);
yield new CharacterToken('<');
yield new EOFToken;
return;
}
# Anything else
else {
# This is an invalid-first-character-of-tag-name parse error.
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the data state.
$this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char);
$this->state = self::DATA_STATE;
yield new CharacterToken('<');
goto Reconsume;
}
}
# 13.2.5.7 End tag open state
elseif ($this->state === self::END_TAG_OPEN_STATE) {
# Consume the next input character
# ASCII alpha
if (ctype_alpha($char)) {
# Create a new end tag token, set its tag name to the empty string.
# Reconsume in the tag name state.
// OPTIMIZATION:
// Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$token = new EndTagToken(strtolower($char.$this->data->consumeWhile(self::CTYPE_ALPHA)));
$this->state = self::TAG_NAME_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-end-tag-name parse error.
# Switch to the data state.
$this->error(ParseError::MISSING_END_TAG_NAME);
$this->state = self::DATA_STATE;
}
# EOF
elseif ($char === '') {
# This is an eof-before-tag-name parse error.
# Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character token and an end-of-file token.
// Making errors more expressive.
$this->error(ParseError::EOF_BEFORE_TAG_NAME);
yield new CharacterToken('</');
yield new EOFToken;
return;
}
# Anything else
else {
# This is an invalid-first-character-of-tag-name parse error.
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char);
$token = new CommentToken();
$this->state = self::BOGUS_COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.8 Tag name state
elseif ($this->state === self::TAG_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Switch to the before attribute name state.
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
}
# "/" (U+002F)
elseif ($char === '/') {
# Switch to the self-closing start tag state.
$this->state = self::SELF_CLOSING_START_TAG_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# Uppercase ASCII letter
elseif (ctype_upper($char)) {
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the current
# tag token's tag name.
// OPTIMIZATION:
// Consume all characters that are Uppercase ASCII characters to
// prevent having to loop back through here every single time.
$token->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to
# the current tag token's tag name.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->name .= "\u{FFFD}";
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the current tag token's tag name.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->name .= $char.$this->data->consumeUntil("\0\t\n\x0c />".self::CTYPE_UPPER);
}
}
# 13.2.5.9 RCDATA less-than sign state
elseif ($this->state === self::RCDATA_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string.
# Switch to the RCDATA end tag open state.
$this->temporaryBuffer = '';
$this->state = self::RCDATA_END_TAG_OPEN_STATE;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the RCDATA state.
$this->state = self::RCDATA_STATE;
yield new CharacterToken('<');
goto Reconsume;
}
}
# 13.2.5.10 RCDATA end tag open state
elseif ($this->state === self::RCDATA_END_TAG_OPEN_STATE) {
# Consume the next input character
# ASCII alpha
if (ctype_alpha($char)) {
# Create a new end tag token, set its tag name to the empty string.
# Reconsume in the RCDATA end tag name state.
$token = new EndTagToken("");
$this->state = self::RCDATA_END_TAG_NAME_STATE;
goto Reconsume;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token.
# Reconsume in the RCDATA state.
$this->state = self::RCDATA_STATE;
yield new CharacterToken('</');
goto Reconsume;
}
}
# 13.2.5.11 RCDATA end tag name state
elseif ($this->state === self::RCDATA_END_TAG_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# If the current end tag token is an appropriate end tag token, then switch to the
# before attribute name state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
goto RCDATA_end_tag_name_state_anything_else;
}
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token, then switch to the
# self-closing start tag state. Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
goto RCDATA_end_tag_name_state_anything_else;
}
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token, then switch to the
# data state and emit the current tag token. Otherwise, treat it as per the
# "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
} else {
goto RCDATA_end_tag_name_state_anything_else;
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$char .= $this->data->consumeWhile(self::CTYPE_ALPHA);
$token->name .= strtolower($char);
$this->temporaryBuffer .= $char;
}
# Anything else
else {
RCDATA_end_tag_name_state_anything_else:
# Emit a U+003C LESS-THAN SIGN character token,
# a U+002F SOLIDUS character token, and a character
# token for each of the characters in the temporary
# buffer (in the order they were added to the buffer).
# Reconsume in the RCDATA state.
$this->state = self::RCDATA_STATE;
yield new CharacterToken('</'.$this->temporaryBuffer);
goto Reconsume;
}
}
# 13.2.5.12 RAWTEXT less-than sign state
elseif ($this->state === self::RAWTEXT_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string.
# Switch to the RAWTEXT end tag open state.
$this->temporaryBuffer = '';
$this->state = self::RAWTEXT_END_TAG_OPEN_STATE;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the RAWTEXT state.
$this->state = self::RAWTEXT_STATE;
yield new CharacterToken('<');
goto Reconsume;
}
}
# 13.2.5.13 RAWTEXT end tag open state
elseif ($this->state === self::RAWTEXT_END_TAG_OPEN_STATE) {
# Consume the next input character
# ASCII alpha
if (ctype_alpha($char)) {
# Create a new end tag token, set its tag name to the empty string.
# Reconsume in the RAWTEXT end tag name state.
$token = new EndTagToken("");
$this->state = self::RAWTEXT_END_TAG_NAME_STATE;
goto Reconsume;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token.
# Reconsume in the RAWTEXT state.
$this->state = self::RAWTEXT_STATE;
yield new CharacterToken('</');
goto Reconsume;
}
}
# 13.2.5.14 RAWTEXT end tag name state
elseif ($this->state === self::RAWTEXT_END_TAG_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# If the current end tag token is an appropriate end tag token,
# then switch to the before attribute name state.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
goto RAWTEXT_end_tag_name_state_anything_else;
}
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token,
# then switch to the self-closing start tag state.
# Otherwise, treat it as per the "anything else"
# entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
goto RAWTEXT_end_tag_name_state_anything_else;
}
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token,
# then switch to the data state and emit the current tag token.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
} else {
goto RAWTEXT_end_tag_name_state_anything_else;
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
// to loop back through here every single time.
$char .= $this->data->consumeWhile(self::CTYPE_ALPHA);
$token->name .= strtolower($char);
$this->temporaryBuffer .= $char;
}
# Anything else
else {
RAWTEXT_end_tag_name_state_anything_else:
# Emit a U+003C LESS-THAN SIGN character token,
# a U+002F SOLIDUS character token, and a character
# token for each of the characters in the temporary
# buffer (in the order they were added to the buffer).
# Reconsume in the RAWTEXT state.
$this->state = self::RAWTEXT_STATE;
yield new CharacterToken('</'.$this->temporaryBuffer);
goto Reconsume;
}
}
# 13.2.5.15 Script data less-than sign state
elseif ($this->state === self::SCRIPT_DATA_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string.
# Switch to the script data end tag open state.
$this->temporaryBuffer = '';
$this->state = self::SCRIPT_DATA_END_TAG_OPEN_STATE;
}
# "!" (U+0021)
elseif ($char === '!') {
# Switch to the script data escape start state.
# Emit a U+003C LESS-THAN SIGN character token
# and a U+0021 EXCLAMATION MARK character token.
$this->state = self::SCRIPT_DATA_ESCAPE_START_STATE;
yield new CharacterToken('<!');
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the script data state.
$this->state = self::SCRIPT_DATA_STATE;
yield new CharacterToken('<');
goto Reconsume;
}
}
# 13.2.5.16 Script data end tag open state
elseif ($this->state === self::SCRIPT_DATA_END_TAG_OPEN_STATE) {
# Consume the next input character
# ASCII alpha
if (ctype_alpha($char)) {
# Create a new end tag token, set its tag name to the empty string.
# Reconsume in the script data end tag name state.
$token = new EndTagToken("");
$this->state = self::SCRIPT_DATA_END_TAG_NAME_STATE;
goto Reconsume;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token.
# Reconsume in the script data state.
$this->state = self::SCRIPT_DATA_STATE;
yield new CharacterToken('</');
goto Reconsume;
}
}
# 13.2.5.17 Script data end tag name state
elseif ($this->state === self::SCRIPT_DATA_END_TAG_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# If the current end tag token is an appropriate end tag token,
# then switch to the before attribute name state.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
goto script_data_end_tag_name_state_anything_else;
}
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token,
# then switch to the self-closing start tag state.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
goto script_data_end_tag_name_state_anything_else;
}
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token,
# then switch to the data state and emit the current tag token.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
} else {
goto script_data_end_tag_name_state_anything_else;
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION: Consume all subsequent ASCII alpha characters at once to prevent
// having to loop back through here for every single character.
$char = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
$token->name .= strtolower($char);
$this->temporaryBuffer .= $char;
}
# Anything else
else {
script_data_end_tag_name_state_anything_else:
# Emit a U+003C LESS-THAN SIGN character token,
# a U+002F SOLIDUS character token, and a character
# token for each of the characters in the temporary
# buffer (in the order they were added to the buffer).
# Reconsume in the script data state.
$this->state = self::SCRIPT_DATA_STATE;
yield new CharacterToken('</'.$this->temporaryBuffer);
goto Reconsume;
}
}
# 13.2.5.18 Script data escape start state
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data escape start dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_ESCAPE_START_DASH_STATE;
yield new CharacterToken('-');
}
# Anything else
else {
# Switch to the script data state.
# Reconsume the current input character.
$this->state = self::SCRIPT_DATA_STATE;
goto Reconsume;
}
}
# 13.2.5.19 Script data escape start dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPE_START_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data escaped dash dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
yield new CharacterToken('-');
}
# Anything else
else {
# Reconsume in the script data state.
$this->state = self::SCRIPT_DATA_STATE;
goto Reconsume;
}
}
# 13.2.5.20 Script data escaped state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data escaped dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_ESCAPED_DASH_STATE;
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data escaped less-than sign state.
$this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("-<\0"));
}
}
}
# 13.2.5.21 Script data escaped dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data escaped dash dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data escaped less-than sign state.
$this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Switch to the script data escaped state.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Switch to the script data escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
}
# 13.2.5.22 Script data escaped dash dash state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Emit a U+002D HYPHEN-MINUS character token.
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data escaped less-than sign state.
$this->state = self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the script data state.
# Emit a U+003E GREATER-THAN SIGN character token.
$this->state = self::SCRIPT_DATA_STATE;
yield new CharacterToken('>');
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Switch to the script data escaped state.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Switch to the script data escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
}
# 13.2.5.23 Script data escaped less-than sign state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string.
# Switch to the script data escaped end tag open state.
$this->temporaryBuffer = '';
$this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
}
# ASCII alpha
elseif (ctype_alpha($char)) {
# Set the temporary buffer to the empty string.
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the script data double escape start state.
$this->temporaryBuffer = '';
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
yield new CharacterToken('<');
goto Reconsume;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the script data escaped state.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
yield new CharacterToken("<");
goto Reconsume;
}
}
# 13.2.5.24 Script data escaped end tag open state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) {
# Consume the next input character
# ASCII alpha
if (ctype_alpha($char)) {
# Create a new end tag token, set its tag name to the empty string.
# Reconsume in the script data escaped end tag name state.
// OPTIMIZATION: Avoid reconsuming by handling the current character here:
// set the tag name to the lowercased character, and
// set the temporary buffer to the character as it appeared in the input
$token = new EndTagToken(strtolower($char));
$this->temporaryBuffer = $char;
$this->state = self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;
}
# Anything else
else {
# Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token.
# Reconsume in the script data escaped state.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
yield new CharacterToken('</');
goto Reconsume;
}
}
# 13.2.5.25 Script data escaped end tag name state
elseif ($this->state === self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# If the current end tag token is an appropriate end tag token,
# then switch to the before attribute name state.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
} else {
goto script_data_escaped_end_tag_name_state_anything_else;
}
}
# "/" (U+002F)
elseif ($char === '/') {
# If the current end tag token is an appropriate end tag token,
# then switch to the self-closing start tag state.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::SELF_CLOSING_START_TAG_STATE;
} else {
goto script_data_escaped_end_tag_name_state_anything_else;
}
}
# ">" (U+003E)
elseif ($char === '>') {
# If the current end tag token is an appropriate end tag token,
# then switch to the data state and emit the current tag token.
# Otherwise, treat it as per the "anything else" entry below.
if ($token->name === $this->stack->currentNodeName) {
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
} else {
goto script_data_escaped_end_tag_name_state_anything_else;
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the current
# tag token's tag name.
# Append the current input character to the temporary buffer.
# Lowercase:
# Append the current input character to the current tag
# token's tag name.
# Append the current input character to the temporary buffer.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION: Consume all subsequent ASCII alpha characters at once to prevent
// having to loop back through here for every single character.
$char .= $this->data->consumeWhile(self::CTYPE_ALPHA);
$token->name .= strtolower($char);
$this->temporaryBuffer .= $char;
}
# Anything else
else {
script_data_escaped_end_tag_name_state_anything_else:
# Emit a U+003C LESS-THAN SIGN character token,
# a U+002F SOLIDUS character token, and a character token
# for each of the characters in the temporary buffer
# (in the order they were added to the buffer).
# Reconsume in the script data escaped state.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
yield new CharacterToken('</'.$this->temporaryBuffer);
goto Reconsume;
}
}
# 13.2.5.26 Script data double escape start state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) {
# Consume the next input character
# U+0009 CHARACTER TABULATION (tab)
# U+000A LINE FEED (LF)
# U+000C FORM FEED (FF)
# U+0020 SPACE
# U+002F SOLIDUS (/)
# U+003E GREATER-THAN SIGN (>)
if (strspn($char, " />\t\n\x0C")) {
# If the temporary buffer is the string "script",
# then switch to the script data double escaped state.
# Otherwise, switch to the script data escaped state.
# Emit the current input character as a character token.
if ($this->temporaryBuffer === 'script') {
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
} else {
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
}
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the temporary buffer.
# Emit the current input character as a character token.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION:
// Consume all subsequent ASCII alpha characters at once to prevent
// having to loop back through here for every single character.
$char = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
$this->temporaryBuffer .= strtolower($char);
yield new CharacterToken($char);
}
# Anything else
else {
# Reconsume in the script data escaped state.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
goto Reconsume;
}
}
# 13.2.5.27 Script data double escaped state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data double escaped dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data double escaped less-than sign state.
# Emit a U+003C LESS-THAN SIGN character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
yield new CharacterToken('<');
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("-<\0"));
}
}
}
# 13.2.5.28 Script data double escaped dash state
elseif ($this->state == self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the script data double escaped dash dash state.
# Emit a U+002D HYPHEN-MINUS character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data double escaped less-than sign state.
# Emit a U+003C LESS-THAN SIGN character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
yield new CharacterToken('<');
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Switch to the script data double escaped state.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Switch to the script data double escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
}
# 13.2.5.29 Script data double escaped dash dash state
elseif ($this->state == self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Emit a U+002D HYPHEN-MINUS character token.
yield new CharacterToken('-');
}
# "<" (U+003C)
elseif ($char === '<') {
# Switch to the script data double escaped less-than sign state.
# Emit a U+003C LESS-THAN SIGN character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
yield new CharacterToken('<');
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the script data state.
# Emit a U+003E GREATER-THAN SIGN character token.
$this->state = self::SCRIPT_DATA_STATE;
yield new CharacterToken('>');
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Switch to the script data double escaped state.
# Emit a U+FFFD REPLACEMENT CHARACTER character token.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
yield new CharacterToken("\u{FFFD}");
}
# EOF
elseif ($char === '') {
# This is an eof-in-script-html-comment-like-text parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
yield new EOFToken;
return;
}
# Anything else
else {
# Switch to the script data double escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
}
# 13.2.5.30 Script data double escaped less-than sign state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# "/" (U+002F)
if ($char === '/') {
# Set the temporary buffer to the empty string.
# Switch to the script data double escape end state.
# Emit a U+002F SOLIDUS character token.
$this->temporaryBuffer = '';
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
yield new CharacterToken('/');
}
# Anything else
else {
# Reconsume in the script data double escaped state.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
goto Reconsume;
}
}
# 13.2.5.31 Script data double escape end state
elseif ($this->state === self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
# "/" (U+002F)
# ">" (U+003E)
if (strspn($char, " />\t\n\x0C")) {
# If the temporary buffer is the string "script",
# then switch to the script data escaped state.
# Otherwise, switch to the script data double escaped state.
# Emit the current input character as a character token.
if ($this->temporaryBuffer === 'script') {
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
} else {
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
}
if (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char);
} else {
yield new CharacterToken($char);
}
}
# ASCII upper alpha
# ASCII lower alpha
elseif (ctype_alpha($char)) {
# Uppercase:
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the temporary buffer.
# Emit the current input character as a character token.
# Lowercase:
# Append the current input character to the temporary buffer.
# Emit the current input character as a character token.
// OPTIMIZATION: Combine upper and lower alpha
// OPTIMIZATION: Consume all subsequent ASCII alpha characters at once to prevent
// having to loop back through here for every single character.
$char = $char.$this->data->consumeWhile(self::CTYPE_ALPHA);
$this->temporaryBuffer .= strtolower($char);
yield new CharacterToken($char);
}
# Anything else
else {
# Reconsume in the script data double escaped state.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
goto Reconsume;
}
}
# 13.2.5.32 Before attribute name state
elseif ($this->state === self::BEFORE_ATTRIBUTE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Ignore the character.
}
# "/" (U+002F)
# ">" (U+003E)
# EOF
elseif ($char === '/' || $char === '>' || $char === '') {
# Reconsume in the after attribute name state.
$this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
# "=" (U+003D)
elseif ($char === '=') {
# This is an unexpected-equals-sign-before-attribute-name parse error.
# Start a new attribute in the current tag token.
# Set that attribute's name to the current input character,
# and its value to the empty string.
# Switch to the attribute name state.
$this->error(ParseError::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
$attribute = new TokenAttr($char, '');
$this->state = self::ATTRIBUTE_NAME_STATE;
}
# Anything else
else {
# Start a new attribute in the current tag token.
# Set that attribute name and value to the empty string.
# Reconsume in the attribute name state.
$attribute = new TokenAttr('', '');
$this->state = self::ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
}
# 13.2.5.33 Attribute name state
elseif ($this->state === self::ATTRIBUTE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
# "/" (U+002F)
# U+003E GREATER-THAN SIGN (>)
# EOF
if (strspn($char, " />\t\n\x0C") || $char === '') {
# Reconsume in the after attribute name state.
$this->keepOrDiscardAttribute($token, $attribute);
$this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
# "=" (U+003D)
elseif ($char === '=') {
# Switch to the before attribute value state.
$this->keepOrDiscardAttribute($token, $attribute);
$this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
}
# ASCII upper alpha
elseif (ctype_upper($char)) {
# Append the lowercase version of the current input character
# (add 0x0020 to the character's code point) to the
# current attribute's name.
// OPTIMIZATION:
// Consume all characters that are uppercase ASCII letters to prevent
// having to loop back through here every single time.
$attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's name.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$attribute->name .= "\u{FFFD}";
}
# U+0022 QUOTATION MARK (")
# "'" (U+0027)
# "<" (U+003C)
elseif ($char === '"' || $char === "'" || $char === '<') {
# This is an unexpected-character-in-attribute-name parse error.
# Treat it as per the "anything else" entry below.
$this->error(ParseError::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME, $char);
goto attribute_name_state_anything_else;
}
# Anything else
else {
attribute_name_state_anything_else:
# Append the current input character to the current attribute's name.
$attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\0\"'<".self::CTYPE_UPPER);
}
}
# 13.2.5.34 After attribute name state
elseif ($this->state === self::AFTER_ATTRIBUTE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Ignore the character.
}
# U+002F SOLIDUS (/)
elseif ($char === '/') {
# Switch to the self-closing start tag state.
$this->state = self::SELF_CLOSING_START_TAG_STATE;
}
# U+003D EQUALS SIGN (=)
elseif ($char === '=') {
# Switch to the before attribute value state.
$this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
}
# U+003E GREATER-THAN SIGN (>)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current tag token.
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# Start a new attribute in the current tag token.
# Set that attribute name and value to the empty string.
# Reconsume in the attribute name state.
$attribute = new TokenAttr('', '');
$this->state = self::ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
}
# 13.2.5.35 Before attribute value state
elseif ($this->state === self::BEFORE_ATTRIBUTE_VALUE_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Ignore the character.
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# Switch to the attribute value (double-quoted) state.
$this->state = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# Switch to the attribute value (single-quoted) state.
$this->state = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-attribute-value parse error.
# Switch to the data state.
# Emit the current tag token.
$this->error(ParseError::MISSING_ATTRIBUTE_VALUE);
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# Anything else
else {
# Reconsume in the attribute value (unquoted) state.
$this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
goto Reconsume;
}
}
# 13.2.5.36 Attribute value (double-quoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
# Consume the next input character
# U+0022 QUOTATION MARK (")
if ($char === '"') {
# Switch to the after attribute value (quoted) state.
$this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
}
# U+0026 AMPERSAND (&)
elseif ($char === '&') {
# Set the return state to the attribute value (double-quoted) state.
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE);
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$attribute->value .= "\u{FFFD}";
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the current attribute's value.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$attribute->value .= $char.$this->data->consumeUntil("\"&\0");
}
}
# 13.2.5.37 Attribute value (single-quoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
# Consume the next input character
# U+0027 APOSTROPHE (')
if ($char === "'") {
# Switch to the after attribute value (quoted) state.
$this->state = self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
}
# U+0026 AMPERSAND (&)
elseif ($char === '&') {
# Set the return state to the attribute value (single-quoted) state.
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE);
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$attribute->value .= "\u{FFFD}";
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the current attribute's value.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$attribute->value .= $char.$this->data->consumeUntil("'&\0");
}
}
# 13.2.5.38 Attribute value (unquoted) state
elseif ($this->state === self::ATTRIBUTE_VALUE_UNQUOTED_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Switch to the before attribute name state.
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
}
# U+0026 AMPERSAND (&)
elseif ($char === '&') {
# Set the return state to the attribute value (unquoted) state.
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_UNQUOTED_STATE);
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$attribute->value .= "\u{FFFD}";
}
# U+0022 QUOTATION MARK (")
# "'" (U+0027)
# "<" (U+003C)
# "=" (U+003D)
# "`" (U+0060)
elseif (strspn($char,"\"'<=`")) {
# This is an unexpected-character-in-unquoted-attribute-value parse error.
# Treat it as per the "anything else" entry below.
$this->error(ParseError::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE, $char);
goto attribute_value_unquoted_state_anything_else;
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
attribute_value_unquoted_state_anything_else:
# Append the current input character to the current attribute's value.
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\0\"'<=`");
}
}
# 13.2.5.39 After attribute value (quoted) state
elseif ($this->state === self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, " \t\n\x0C")) {
# Switch to the before attribute name state.
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
}
# "/" (U+002F)
elseif ($char === '/') {
# Switch to the self-closing start tag state.
$this->state = self::SELF_CLOSING_START_TAG_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current tag token.
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-whitespace-between-attributes parse error.
# Reconsume in the before attribute name state.
$this->error(ParseError::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
}
# 13.2.5.40 Self-closing start tag state
elseif ($this->state === self::SELF_CLOSING_START_TAG_STATE) {
# Consume the next input character
# ">" (U+003E)
if ($char === '>') {
# Set the self-closing flag of the current tag token.
# Switch to the data state.
# Emit the current tag token.
$token->selfClosing = true;
$this->state = self::DATA_STATE;
$this->sanitizeTag($token);
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-tag parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_TAG);
yield new EOFToken;
return;
}
# Anything else
else {
# This is an unexpected-solidus-in-tag parse error.
# Reconsume in the before attribute name state.
$this->error(ParseError::UNEXPECTED_SOLIDUS_IN_TAG);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
goto Reconsume;
}
}
# 13.2.5.41 Bogus comment state
elseif ($this->state === self::BOGUS_COMMENT_STATE) {
# Consume the next input character
# U+003E GREATER-THAN SIGN (>)
if ($char === '>') {
# Switch to the data state.
# Emit the comment token.
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# Emit the comment.
# Emit an end-of-file token.
yield $token;
yield new EOFToken;
return;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->data .= "\u{FFFD}";
}
# Anything else
else {
# Append the current input character to the comment token's data.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->data .= $char.$this->data->consumeUntil(">\0");
}
}
# 13.2.5.42 Markup declaration open state
elseif ($this->state === self::MARKUP_DECLARATION_OPEN_STATE) {
# If the next few characters are:
# Two U+002D HYPHEN-MINUS characters (-)
if ($this->data->peek(2) === '--') {
# Consume those two characters,
# create a comment token whose data is the empty string,
# and switch to the comment start state.
$this->data->consumeWhile("-", 2);
$token = new CommentToken('');
$this->state = self::COMMENT_START_STATE;
}
//OPTIMIZATION: Peek seven characters only once
else {
$peek = $this->data->peek(7);
# ASCII case-insensitive match for the word "DOCTYPE"
if (strtoupper($peek) === 'DOCTYPE') {
# Consume those characters and switch to the DOCTYPE state.
$this->data->consumeWhile(self::CTYPE_ALPHA, 7);
$this->state = self::DOCTYPE_STATE;
}
# Case-sensitive match for the string "[CDATA["
elseif ($peek === '[CDATA[') {
# Consume those characters.
# If there is an adjusted current node and it is not an
# element in the HTML namespace, then switch to the
# CDATA section state.
# Otherwise, this is a cdata-in-html-content parse error.
# Create a comment token whose data is the "[CDATA[" string.
# Switch to the bogus comment state.
$this->data->consumeWhile(self::CTYPE_ALPHA."[", 7);
if ($this->stack->adjustedCurrentNode && ($this->stack->adjustedCurrentNode->namespaceURI ?? Parser::HTML_NAMESPACE) !== Parser::HTML_NAMESPACE) {
$this->state = self::CDATA_SECTION_STATE;
} else {
$this->error(ParseError::CDATA_IN_HTML_CONTENT);
$token = new CommentToken('[CDATA[');
$this->state = self::BOGUS_COMMENT_STATE;
}
}
# Anything else
else {
# This is an incorrectly-opened-comment parse error.
# Create a comment token whose data is the empty string.
# Switch to the bogus comment state
# (don't consume anything in the current state).
$this->error(ParseError::INCORRECTLY_OPENED_COMMENT);
$token = new CommentToken('');
$this->state = self::BOGUS_COMMENT_STATE;
}
}
}
# 13.2.5.43 Comment start state
elseif ($this->state === self::COMMENT_START_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the comment start dash state.
$this->state = self::COMMENT_START_DASH_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-closing-of-empty-comment parse error.
# Switch to the data state.
# Emit the comment token.
$this->error(ParseError::ABRUPT_CLOSING_OF_EMPTY_COMMENT);
$this->state = self::DATA_STATE;
yield $token;
}
# Anything else
else {
# Reconsume in the comment state.
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.44 Comment start dash state
elseif ($this->state === self::COMMENT_START_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the comment end state.
$this->state = self::COMMENT_END_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-closing-of-empty-comment parse error.
# Switch to the data state.
# Emit the comment token.
$this->error(ParseError::ABRUPT_CLOSING_OF_EMPTY_COMMENT);
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-comment parse error.
# Emit the comment token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_COMMENT);
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
# Reconsume in the comment state.
$token->data .= '-';
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.45 Comment state
elseif ($this->state === self::COMMENT_STATE) {
# Consume the next input character
# "<" (U+003C)
if ($char === '<') {
# Append the current input character to the comment token's data.
# Switch to the comment less-than sign state.
$token->data .= $char;
$this->state = self::COMMENT_LESS_THAN_SIGN_STATE;
}
# "-" (U+002D)
elseif ($char === '-') {
# Switch to the comment end dash state
$this->state = self::COMMENT_END_DASH_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->data .= "\u{FFFD}";
}
# EOF
elseif ($char === '') {
# This is an eof-in-comment parse error.
# Emit the comment token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_COMMENT);
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the comment token's data.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->data .= $char.$this->data->consumeUntil("<-\0");
}
}
# 13.2.5.46 Comment less-than sign state
elseif ($this->state === self::COMMENT_LESS_THAN_SIGN_STATE) {
# Consume the next input character
# U+0021 EXCLAMATION MARK (!)
if ($char === '!') {
# Append the current input character to the comment token's data.
# Switch to the comment less-than sign bang state.
$token->data .= $char;
$this->state = self::COMMENT_LESS_THAN_SIGN_BANG_STATE;
}
# U+003C LESS-THAN SIGN (<)
elseif ($char ==='<') {
# Append the current input character to the comment token's data.
$token->data .= $char;
}
# Anything else
else {
# Reconsume in the comment state
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.47 Comment less-than sign bang state
elseif ($this->state === self::COMMENT_LESS_THAN_SIGN_BANG_STATE) {
# Consume the next input character
# U+002D HYPHEN-MINUS (-)
if ($char === '-') {
# Switch to the comment less-than sign bang dash state.
$this->state = self::COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE;
}
# Anything else
else {
# Reconsume in the comment state
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.48 Comment less-than sign bang dash state
elseif ($this->state === self::COMMENT_LESS_THAN_SIGN_BANG_DASH_STATE) {
# Consume the next input character
# U+002D HYPHEN-MINUS (-)
if ($char === '-') {
# Switch to the comment less-than sign bang dash dash state.
$this->state = self::COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE;
}
# Anything else
else {
# Reconsume in the comment end dash state
$this->state = self::COMMENT_END_DASH_STATE;
goto Reconsume;
}
}
# 13.2.5.49 Comment less-than sign bang dash dash state
elseif ($this->state === self::COMMENT_LESS_THAN_SIGN_BANG_DASH_DASH_STATE) {
# Consume the next input character
# U+003E GREATER-THAN SIGN (>)
# EOF
if ($char === '>' || $char === '') {
# Reconsume in the comment end state.
$this->state = self::COMMENT_END_STATE;
goto Reconsume;
}
# Anything else
else {
# This is a nested-comment parse error.
# Reconsume in the comment end state.
$this->error(ParseError::NESTED_COMMENT);
$this->state = self::COMMENT_END_STATE;
goto Reconsume;
}
}
# 13.2.5.50 Comment end dash state
elseif ($this->state === self::COMMENT_END_DASH_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Switch to the comment end state
$this->state = self::COMMENT_END_STATE;
}
# EOF
elseif ($char === '') {
# This is an eof-in-comment parse error.
# Emit the comment token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_COMMENT);
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append a "-" (U+002D) character to the comment token's data.
# Reconsume in the comment state.
$token->data .= '-';
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.51 Comment end state
elseif ($this->state === self::COMMENT_END_STATE) {
# Consume the next input character
# ">" (U+003E)
if ($char === '>') {
# Switch to the data state.
# Emit the comment token.
$this->state = self::DATA_STATE;
yield $token;
}
# "!" (U+0021)
elseif ($char === '!') {
# Switch to the comment end bang state.
$this->state = self::COMMENT_END_BANG_STATE;
}
# "-" (U+002D)
elseif ($char === '-') {
# Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
// OPTIMIZATION:
// Consume all '-' characters to prevent having to loop back through
// here every single time.
$token->data .= $char.$this->data->consumeWhile('-');
}
# EOF
elseif ($char === '') {
# This is an eof-in-comment parse error.
# Emit the comment token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_COMMENT);
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append two U+002D HYPHEN-MINUS characters (-) to the comment token's data.
# Reconsume in the comment state.
$token->data .= '--';
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.52 Comment end bang state
elseif ($this->state === self::COMMENT_END_BANG_STATE) {
# Consume the next input character
# "-" (U+002D)
if ($char === '-') {
# Append two U+002D HYPHEN-MINUS characters (-)
# and a U+0021 EXCLAMATION MARK character (!)
# to the comment token's data.
# Switch to the comment end dash state.
$token->data .= '--!';
$this->state = self::COMMENT_END_DASH_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an incorrectly-closed-comment parse error.
# Switch to the data state.
# Emit the comment token.
$this->error(ParseError::INCORRECTLY_CLOSED_COMMENT);
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-comment parse error.
# Emit the comment token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_COMMENT);
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append two U+002D HYPHEN-MINUS characters (-)
# and a U+0021 EXCLAMATION MARK character (!)
# to the comment token's data.
# Reconsume in the comment state.
$token->data .= '--!';
$this->state = self::COMMENT_STATE;
goto Reconsume;
}
}
# 13.2.5.53 DOCTYPE state
elseif ($this->state === self::DOCTYPE_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Switch to the before DOCTYPE name state.
$this->state = self::BEFORE_DOCTYPE_NAME_STATE;
}
# U+003E GREATER-THAN SIGN (>)
elseif ($char === '>') {
# Reconsume in the before DOCTYPE name state.
$this->state = self::BEFORE_DOCTYPE_NAME_STATE;
goto Reconsume;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Create a new DOCTYPE token.
# Set its force-quirks flag to on.
# Emit the token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token = new DOCTYPEToken();
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-whitespace-before-doctype-name parse error.
# Reconsume in the before DOCTYPE name state.
$this->error(ParseError::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
$this->state = self::BEFORE_DOCTYPE_NAME_STATE;
goto Reconsume;
}
}
# 13.2.5.54 Before DOCTYPE name state
elseif ($this->state === self::BEFORE_DOCTYPE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character.
}
// See below for ASCII upper alpha
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Create a new DOCTYPE token.
# Set the token's name to a U+FFFD REPLACEMENT CHARACTER character.
# Switch to the DOCTYPE name state.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token = new DOCTYPEToken("\u{FFFD}");
$this->state = self::DOCTYPE_NAME_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-doctype-name parse error.
# Create a new DOCTYPE token.
# Set its force-quirks flag to on.
# Switch to the data state.
# Emit the token.
$this->error(ParseError::MISSING_DOCTYPE_NAME);
$token = new DOCTYPEToken();
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Create a new DOCTYPE token.
# Set its force-quirks flag to on.
# Emit the token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token = new DOCTYPEToken();
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# ASCII upper alpha
# Anything else
else {
# Create a new DOCTYPE token.
# Set the token's name to the current input character.
# Switch to the DOCTYPE name state.
// OPTIMIZATION: Also handle ASCII upper alpha
// OPTIMIZATION: Consume characters not explicitly handled by the "DOCTYPE name" state
$token = new DOCTYPEToken(strtolower($char.$this->data->consumeUntil("\t\n\x0c >\0")));
$this->state = self::DOCTYPE_NAME_STATE;
}
}
# 13.2.5.55 DOCTYPE name state
elseif ($this->state === self::DOCTYPE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Switch to the after DOCTYPE name state.
$this->state = self::AFTER_DOCTYPE_NAME_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
// See below for ASCII upper alpha
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character
# to the current DOCTYPE token's name.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->name .= "\u{FFFD}";
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# ASCII upper alpha
# Anything else
else {
# Append the current input character to the current DOCTYPE token's name.
// OPTIMIZATION: Also handle ASCII upper alpha
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->name .= strtolower($char.$this->data->consumeUntil("\t\n\x0c >\0"));
}
}
# 13.2.5.56 After DOCTYPE name state
elseif ($this->state === self::AFTER_DOCTYPE_NAME_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
// OPTIMIZATION: Peek only once; we peek because consuming could alter the order of errors
$peek = strtoupper($char.$this->data->peek(5));
# If the six characters starting from the current input
# character are an ASCII case-insensitive match for the
# word "PUBLIC", then consume those characters and
# switch to the after DOCTYPE public keyword state.
if($peek === 'PUBLIC') {
$this->data->consumeWhile(self::CTYPE_ALPHA, 5);
$this->state = self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
}
# Otherwise, if the six characters starting from the current input
# character are an ASCII case-insensitive match for the
# word "SYSTEM", then consume those characters and
# switch to the after DOCTYPE system keyword state.
elseif ($peek === 'SYSTEM') {
$this->data->consumeWhile(self::CTYPE_ALPHA, 5);
$this->state = self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
}
# Otherwise, this is an
# invalid-character-sequence-after-doctype-name
# parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
else {
$this->error(ParseError::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
}
# 13.2.5.57 After DOCTYPE public keyword state
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Switch to the before DOCTYPE public identifier state.
$this->state = self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# This is a missing-whitespace-after-doctype-public-keyword parse error.
# Set the DOCTYPE token's public identifier to the empty string (not missing),
# then switch to the DOCTYPE public identifier (double-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# This is a missing-whitespace-after-doctype-public-keyword parse error.
# Set the DOCTYPE token's public identifier to the empty string (not missing),
# then switch to the DOCTYPE public identifier (single-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.58 Before DOCTYPE public identifier state
elseif ($this->state === self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character.
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# Set the DOCTYPE token's public identifier to the empty string (not missing),
# then switch to the DOCTYPE public identifier (double-quoted) state.
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# Set the DOCTYPE token's public identifier to the empty string (not missing),
# then switch to the DOCTYPE public identifier (single-quoted) state.
$token->public = '';
$this->state = self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.59 DOCTYPE public identifier (double-quoted) state
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
# Consume the next input character
# U+0022 QUOTATION MARK (")
if ($char === '"') {
# Switch to the after DOCTYPE public identifier state.
$this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character
# to the current DOCTYPE token's public identifier.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->public .= "\u{FFFD}";
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the
# current DOCTYPE token's public identifier.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->public .= $char.$this->data->consumeUntil("\">\0");
}
}
# 13.2.5.60 DOCTYPE public identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
# Consume the next input character
# "'" (U+0027)
if ($char === "'") {
# Switch to the after DOCTYPE public identifier state.
$this->state = self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character
# to the current DOCTYPE token's public identifier.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->public .= "\u{FFFD}";
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the
# current DOCTYPE token's public identifier.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->public .= $char.$this->data->consumeUntil("'>\0");
}
}
# 13.2.5.61 After DOCTYPE public identifier state
elseif ($this->state === self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Switch to the between DOCTYPE public and system identifiers state.
$this->state = self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# This is a missing-whitespace-between-doctype-public-and-system-identifiers parse error.
# Set the DOCTYPE token's system identifier to the empty string (not missing),
# then switch to the DOCTYPE system identifier (double-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS);
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# This is a missing-whitespace-between-doctype-public-and-system-identifiers parse error.
# Set the DOCTYPE token's system identifier to the empty string (not missing),
# then switch to the DOCTYPE system identifier (single-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS);
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.62 Between DOCTYPE public and system identifiers state
elseif ($this->state === self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character.
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# Set the DOCTYPE token's system identifier to the
# empty string (not missing), then switch to the
# DOCTYPE system identifier (double-quoted) state.
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# Set the DOCTYPE token's system identifier to the
# empty string (not missing), then switch to the
# DOCTYPE system identifier (single-quoted) state.
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.63 After DOCTYPE system keyword state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Switch to the before DOCTYPE system identifier state.
$this->state = self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# This is a missing-whitespace-after-doctype-system-keyword parse error.
# Set the DOCTYPE token's system identifier to the empty string (not missing),
# then switch to the DOCTYPE system identifier (double-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# This is a missing-whitespace-after-doctype-system-keyword parse error.
# Set the DOCTYPE token's system identifier to the empty string (not missing),
# then switch to the DOCTYPE system identifier (single-quoted) state.
$this->error(ParseError::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.64 Before DOCTYPE system identifier state
elseif ($this->state === self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character.
}
# U+0022 QUOTATION MARK (")
elseif ($char === '"') {
# Set the DOCTYPE token's system identifier to the
# empty string (not missing), then switch to the
# DOCTYPE system identifier (double-quoted) state.
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
}
# "'" (U+0027)
elseif ($char === "'") {
# Set the DOCTYPE token's system identifier to the
# empty string (not missing), then switch to the
# DOCTYPE system identifier (single-quoted) state.
$token->system = '';
$this->state = self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
}
# ">" (U+003E)
elseif ($char === '>') {
# This is a missing-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.65 DOCTYPE system identifier (double-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
# Consume the next input character
# U+0022 QUOTATION MARK (")
if ($char === '"') {
# Switch to the after DOCTYPE system identifier state.
$this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character
# to the current DOCTYPE token's system identifier.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->system .= "\u{FFFD}";
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the current DOCTYPE token's system identifier.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->system .= $char.$this->data->consumeUntil("\"\0>");
}
}
# 13.2.5.66 DOCTYPE system identifier (single-quoted) state
elseif ($this->state === self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
# Consume the next input character
# "'" (U+0027)
if ($char === "'") {
# Switch to the after DOCTYPE system identifier state.
$this->state = self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Append a U+FFFD REPLACEMENT CHARACTER character
# to the current DOCTYPE token's system identifier.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
$token->system .= "\u{FFFD}";
}
# ">" (U+003E)
elseif ($char === '>') {
# This is an abrupt-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Switch to the data state.
# Emit that DOCTYPE token.
$this->error(ParseError::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# Append the current input character to the current DOCTYPE token's system identifier.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
$token->system .= $char.$this->data->consumeUntil("'\0>");
}
}
# 13.2.5.67 After DOCTYPE system identifier state
elseif ($this->state === self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
# Consume the next input character
# "tab" (U+0009)
# "LF" (U+000A)
# "FF" (U+000C)
# U+0020 SPACE
if (strspn($char, "\t\n\x0C ")) {
# Ignore the character
}
# ">" (U+003E)
elseif ($char === '>') {
# Switch to the data state.
# Emit the current DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
# EOF
elseif ($char === '') {
# This is an eof-in-doctype parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Emit that DOCTYPE token.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_DOCTYPE);
$token->forceQuirks = true;
yield $token;
yield new EOFToken;
return;
}
# Anything else
else {
# This is an unexpected-character-after-doctype-system-identifier parse error.
# Reconsume in the bogus DOCTYPE state.
# (This does not set the DOCTYPE token's force-quirks flag to on.)
$this->error(ParseError::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, $char);
$this->state = self::BOGUS_DOCTYPE_STATE;
goto Reconsume;
}
}
# 13.2.5.68 Bogus DOCTYPE state
elseif ($this->state === self::BOGUS_DOCTYPE_STATE) {
# Consume the next input character
# ">" (U+003E)
if ($char === '>') {
# Switch to the data state.
# Emit the DOCTYPE token.
$this->state = self::DATA_STATE;
yield $token;
}
# U+0000 NULL
elseif ($char === "\0") {
# This is an unexpected-null-character parse error.
# Ignore the character.
$this->error(ParseError::UNEXPECTED_NULL_CHARACTER);
}
# EOF
elseif ($char === '') {
# Emit the DOCTYPE token.
# Emit an end-of-file token.
yield $token;
yield new EOFToken;
return;
}
# Anything else
# Ignore the character.
}
# 13.2.5.69 CDATA section state
elseif ($this->state === self::CDATA_SECTION_STATE) {
# Consume the next input character
# U+005D RIGHT SQUARE BRACKET (])
if ($char === ']') {
# Switch to the CDATA section bracket state.
$this->state = self::CDATA_SECTION_BRACKET_STATE;
}
# EOF
elseif ($char === '') {
# This is an eof-in-cdata parse error.
# Emit an end-of-file token.
$this->error(ParseError::EOF_IN_CDATA);
yield new EOFToken;
return;
}
# Anything else
else {
# Emit the current input character as a character token.
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time; only null characters
// are emitted singly
if ($char === "\0") {
yield new NullCharacterToken($char);
} elseif (strspn($char, Data::WHITESPACE)) {
yield new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE_SAFE));
} else {
yield new CharacterToken($char.$this->data->consumeUntil("]\0"));
}
}
}
# 13.2.5.70 CDATA section bracket state
elseif ($this->state === self::CDATA_SECTION_BRACKET_STATE) {
# Consume the next input character
# U+005D RIGHT SQUARE BRACKET (])
if ($char === ']') {
# Switch to the CDATA section end state.
$this->state = self::CDATA_SECTION_END_STATE;
}
# Anything else
else {
# Emit a U+005D RIGHT SQUARE BRACKET character token.
# Reconsume in the CDATA section state.
$this->state = self::CDATA_SECTION_STATE;
yield new CharacterToken(']');
goto Reconsume;
}
}
# 13.2.5.71 CDATA section end state
elseif ($this->state === self::CDATA_SECTION_END_STATE) {
# Consume the next input character
# U+005D RIGHT SQUARE BRACKET (])
if ($char === ']') {
# Emit a U+005D RIGHT SQUARE BRACKET character token.
// OPTIMIZATION: Consume any additional right square brackets
yield new CharacterToken(']'.$this->data->consumeWhile(']'));
}
# U+003E GREATER-THAN SIGN character
elseif ($char === '>') {
# Switch to the data state.
$this->state = self::DATA_STATE;
}
# Anything else
else {
# Emit two U+005D RIGHT SQUARE BRACKET character tokens.
# Reconsume in the CDATA section state.
$this->state = self::CDATA_SECTION_STATE;
yield new CharacterToken(']]');
goto Reconsume;
}
}
# Not a valid state, unimplemented, or implemented elsewhere
else {
throw new \Exception("Invalid tokenizer state: ".(self::STATE_NAMES[$this->state] ?? $this->state)); // @codeCoverageIgnore
}
}
} // @codeCoverageIgnore
/**
 * Runs the character-reference sub-state-machine and returns the text it produces.
 *
 * Starting in the "Character reference" state, this loops through the named,
 * ambiguous-ampersand and numeric character reference states until one of them
 * switches to the return state, at which point the accumulated result is returned
 * instead of being flushed by this method.
 *
 * Side effects visible in this method: $this->state is overwritten (and finally set
 * to $returnState), $this->temporaryBuffer is overwritten, characters are consumed
 * from / given back to $this->data, parse errors are reported through $this->error(),
 * and, when assertions are enabled, a trace line is appended to $this->debugLog on
 * every iteration.
 *
 * NOTE(review): the buffer is seeded with "&" without consuming anything, so the
 * caller has presumably already consumed the ampersand — confirm against call sites.
 *
 * @param int $returnState The tokenizer state to restore in $this->state before
 *                         returning; it is also tested against
 *                         self::ATTRIBUTE_VALUE_STATE_SET to decide whether the
 *                         attribute-specific legacy rule for named references applies
 *
 * @return string Either the replacement text (the CharacterReference::NAMES entry for a
 *                named reference, or UTF8::encode() of the numeric code), or the
 *                literal text that was buffered (e.g. "&", "&#", "&#x", "&name") when
 *                no reference was recognised
 *
 * @throws \Exception If $this->state holds a value not handled by this method
 */
protected function switchToCharacterReferenceState(int $returnState): string {
// This function implements states 72 through 80,
// "Character reference" through "Numeric character reference end" states
$this->state = self::CHARACTER_REFERENCE_STATE;
$charRefCode = 0;
while (true) {
// Debug tracing only: the closure runs solely when assertions are evaluated
// and always returns true, so the assertion itself can never fail
assert((function() {
$state = self::STATE_NAMES[$this->state] ?? $this->state;
$char = bin2hex($this->data->peek(1));
$this->debugLog .= " State: $state ($char)\n";
return true;
})());
# 13.2.5.72 Character reference state
if ($this->state === self::CHARACTER_REFERENCE_STATE) {
# Set the temporary buffer to the empty string.
# Append a U+0026 AMPERSAND (&) character to the temporary buffer.
# Consume the next input character.
$this->temporaryBuffer = '&';
$char = $this->data->consume();
# ASCII alphanumeric
if (ctype_alnum($char)) {
# Reconsume in the named character reference state.
$this->state = self::NAMED_CHARACTER_REFERENCE_STATE;
$this->data->unconsume();
}
# U+0023 NUMBER SIGN (#)
elseif ($char === '#') {
# Append the current input character to the temporary buffer.
# Switch to the numeric character reference state.
$this->temporaryBuffer .= $char;
$this->state = self::NUMERIC_CHARACTER_REFERENCE_STATE;
}
# Anything else
else {
# Flush code points consumed as a character reference.
# Reconsume in the return state.
// The buffer holds just "&" here; the character is given back
// so the return state sees it again
$this->state = $returnState;
$this->data->unconsume();
return $this->temporaryBuffer;
}
}
# 13.2.5.73 Named character reference state
elseif ($this->state === self::NAMED_CHARACTER_REFERENCE_STATE) {
# Consume the maximum number of characters possible,
# with the consumed characters matching one of the
# identifiers in the first column of the named character
# references table (in a case-sensitive manner).
// DEVIATION:
// We consume all possible alphanumeric characters,
// up to the length of the longest in the table
$candidate = $this->data->consumeWhile(self::CTYPE_ALNUM, CharacterReference::LONGEST_NAME);
// Keep a record of the terminating character, which is used later
$next = $this->data->peek(1);
if ($next === ';') {
// consume the following character if it is a proper terminator
$candidate .= $this->data->consume();
}
// Look for an exact match; if not found look for a prefix match
$match = CharacterReference::NAMES[$candidate] ?? null;
if ($match === null) {
// $match doubles as preg_match()'s output array and is then overwritten
// with the matched prefix string, or null when the pattern does not match
$match = (preg_match(CharacterReference::PREFIX_PATTERN, $candidate, $match)) ? $match[0] : null;
// If a prefix match is found, unconsume to the end of the prefix and look up the entry in the table
if ($match !== null) {
// strlen() counts bytes; this assumes the candidate consists of
// single-byte characters (alphanumerics plus an optional ';') — TODO confirm
$this->data->unconsume(strlen($candidate) - strlen($match));
// The terminator becomes the first character following the prefix.
// NOTE(review): presumably the prefix is always shorter than the candidate
// here (the exact lookup failed), otherwise this offset would not exist
$next = $candidate[strlen($match)];
$candidate = $match;
$match = CharacterReference::NAMES[$match];
}
}
# Append each character to the temporary buffer when it's consumed.
$this->temporaryBuffer .= $candidate;
# If there is a match
if ($match !== null) {
# If the character reference was consumed as part of an attribute,
# and the last character matched is not a U+003B SEMICOLON character (;),
# and the next input character is either a U+003D EQUALS SIGN character (=)
# or an ASCII alphanumeric...
// NOTE(review): in_array() is used with loose comparison here
if (in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET) && $next !== ';' && ($next === '=' || ctype_alnum($next))) {
# ... then, for historical reasons, flush code points consumed
# as a character reference and switch to the return state.
// The literal text ("&" plus the name) is returned, not the replacement
$this->state = $returnState;
return $this->temporaryBuffer;
}
# Otherwise:
else {
# If the last character matched is not a U+003B SEMICOLON character (;),
# then this is a missing-semicolon-after-character-reference parse error.
if ($next !== ';') {
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
}
# Set the temporary buffer to the empty string.
# Append one or two characters corresponding to the
# character reference name (as given by the second
# column of the named character references table)
# to the temporary buffer.
# Flush code points consumed as a character reference.
# Switch to the return state.
// In other words: return the match
// (note that $this->temporaryBuffer itself is left holding the literal text)
$this->state = $returnState;
return $match;
}
}
# Otherwise:
else {
# Flush code points consumed as a character reference.
# Switch to the ambiguous ampersand state.
// DEVIATION: We flush only when switching to the return state
$this->state = self::AMBIGUOUS_AMPERSAND_STATE;
// If we consumed a semicolon earlier we need to undo this
// so that the ambiguous ampersand state gets to see the semicolon itself
if ($next === ';') {
$this->data->unconsume();
$this->temporaryBuffer = substr($this->temporaryBuffer, 0, -1);
}
}
}
# 13.2.5.74 Ambiguous ampersand state
elseif ($this->state === self::AMBIGUOUS_AMPERSAND_STATE) {
# Consume the next input character.
$char = $this->data->consume();
# ASCII alphanumeric
if (ctype_alnum($char)) {
# If the character reference was consumed as part of an attribute,
# then append the current input character to the current attribute's value.
# Otherwise, emit the current input character as a character token.
// DEVIATION: We just continue to buffer characters until it's time to return
$this->temporaryBuffer .= $char.$this->data->consumeWhile(self::CTYPE_ALNUM);
}
# U+003B SEMICOLON (;)
elseif ($char === ';') {
# This is an unknown-named-character-reference parse error.
# Reconsume in the return state.
// The semicolon is given back before the error is reported; it is only
// appended to the error's argument, not to the returned text
$this->data->unconsume();
$this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $this->temporaryBuffer.';');
$this->state = $returnState;
return $this->temporaryBuffer;
}
# Anything else
else {
# Reconsume in the return state.
$this->state = $returnState;
$this->data->unconsume();
return $this->temporaryBuffer;
}
}
# 13.2.5.75 Numeric character reference state
elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_STATE) {
# Set the character reference code to zero (0).
$charRefCode = 0;
# Consume the next input character.
$char = $this->data->consume();
# U+0078 LATIN SMALL LETTER X
# U+0058 LATIN CAPITAL LETTER X
if ($char === 'x' || $char === 'X') {
# Append the current input character to the temporary buffer.
# Switch to the hexadecimal character reference start state.
$this->temporaryBuffer .= $char;
$this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE;
}
# Anything else
else {
# Reconsume in the decimal character reference start state.
$this->state = self::DECIMAL_CHARACTER_REFERENCE_START_STATE;
$this->data->unconsume();
}
}
# 13.2.5.76 Hexadecimal character reference start state
elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE) {
# Consume the next input character.
$char = $this->data->consume();
# ASCII hex digit
if (ctype_xdigit($char)) {
# Reconsume in the hexadecimal character reference state.
// OPTIMIZATION:
// Just consume the digits here
// hexdec() returns a float when the value does not fit in an integer;
// such a value is still greater than 0x10FFFF and is replaced in the end state
$charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_HEX));
$this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_STATE;
}
# Anything else
else {
# This is an absence-of-digits-in-numeric-character-reference parse error.
# Flush code points consumed as a character reference.
# Reconsume in the return state.
// The buffer returned here is the literal "&#x" / "&#X" prefix
$this->data->unconsume();
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE);
$this->state = $returnState;
return $this->temporaryBuffer;
}
}
# 13.2.5.77 Decimal character reference start state
elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_START_STATE) {
# Consume the next input character.
$char = $this->data->consume();
# ASCII digit
if (ctype_digit($char)) {
# Reconsume in the decimal character reference state.
// OPTIMIZATION:
// Just consume the digits here
// NOTE(review): for very long digit runs this relies on PHP's string-to-int
// conversion producing a value above 0x10FFFF — confirm
$charRefCode = (int) ($char.$this->data->consumeWhile(self::CTYPE_NUM));
$this->state = self::DECIMAL_CHARACTER_REFERENCE_STATE;
}
# Anything else
else {
# This is an absence-of-digits-in-numeric-character-reference parse error.
# Flush code points consumed as a character reference.
# Reconsume in the return state.
// The buffer returned here is the literal "&#" prefix
$this->data->unconsume();
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE);
$this->state = $returnState;
return $this->temporaryBuffer;
}
}
# 13.2.5.78 Hexadecimal character reference state
elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_STATE) {
# Consume the next input character.
$char = $this->data->consume();
# ASCII digit
# ASCII upper hex digit
# ASCII lower hex digit
if (ctype_xdigit($char)) {
# Multiply the character reference code by 16.
# Add a numeric version of the current input
# character to the character reference code.
// OPTIMIZATION: Combine all digit types
// NOTE: This branch should never be reached
// (the start state is expected to have consumed the whole run of hex digits already)
$charRefCode = ($charRefCode * 16) + hexdec($char); // @codeCoverageIgnore
}
# U+003B SEMICOLON
elseif ($char === ';') {
# Switch to the numeric character reference end state.
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
}
# Anything else
else {
# This is a missing-semicolon-after-character-reference parse error.
# Reconsume in the numeric character reference end state.
$this->data->unconsume();
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
}
}
# 13.2.5.79 Decimal character reference state
elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_STATE) {
# Consume the next input character.
$char = $this->data->consume();
# ASCII digit
if (ctype_digit($char)) {
# Multiply the character reference code by 10.
# Add a numeric version of the current input
# character to the character reference code.
// OPTIMIZATION: Combine all digit types
// NOTE: This branch should never be reached
// (the start state is expected to have consumed the whole run of digits already)
$charRefCode = ($charRefCode * 10) + ((int) ($char)); // @codeCoverageIgnore
}
# U+003B SEMICOLON
elseif ($char === ';') {
# Switch to the numeric character reference end state.
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
}
# Anything else
else {
# This is a missing-semicolon-after-character-reference parse error.
# Reconsume in the numeric character reference end state.
$this->data->unconsume();
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
}
}
# 13.2.5.80 Numeric character reference end state
elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_END_STATE) {
// No input is consumed in this state; only the accumulated code is examined.
// The checks form a single elseif chain, so at most one error is reported
# Check the character reference code:
# If the number is 0x00, then this is a null-character-reference parse error.
# Set the character reference code to 0xFFFD.
if ($charRefCode === 0) {
$this->error(ParseError::NULL_CHARACTER_REFERENCE);
$charRefCode = 0xFFFD;
}
# If the number is greater than 0x10FFFF, then this is a
# character-reference-outside-unicode-range parse error.
# Set the character reference code to 0xFFFD.
elseif ($charRefCode > 0x10FFFF) {
$this->error(ParseError::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE);
$charRefCode = 0xFFFD;
}
# If the number is a surrogate, then this is a
# surrogate-character-reference parse error.
# Set the character reference code to 0xFFFD.
elseif ($charRefCode >= 0xD800 && $charRefCode <= 0xDFFF) {
$this->error(ParseError::SURROGATE_CHARACTER_REFERENCE);
$charRefCode = 0xFFFD;
}
# If the number is a noncharacter, then this is a
# noncharacter-character-reference parse error.
// The second test matches any code whose low 16 bits are 0xFFFE or 0xFFFF
// (bit 0 is masked out before comparing); the code itself is left unchanged
elseif (($charRefCode >= 0xFDD0 && $charRefCode <= 0xFDEF) || ($charRefCode % 0x10000 & 0xFFFE) === 0xFFFE) {
$this->error(ParseError::NONCHARACTER_CHARACTER_REFERENCE);
}
# If the number is 0x0D, or a control that's not ASCII whitespace, then
# this is a control-character-reference parse error.
# If the number is one of the numbers in the first column of the following
# table, then find the row with that number in the first column, and set
# the character reference code to the number in the second column of that row.
elseif (($charRefCode < 0x20 && !in_array($charRefCode, [0x9, 0xA, 0xC])) || ($charRefCode >= 0x7F && $charRefCode <= 0x9F)) {
// NOTE: Table elided
$this->error(ParseError::CONTROL_CHARACTER_REFERENCE);
// Codes without an entry in the table are kept as they are
$charRefCode = CharacterReference::C1_TABLE[$charRefCode] ?? $charRefCode;
}
// The buffer is replaced by (not appended with) the encoded character
$this->temporaryBuffer = UTF8::encode($charRefCode);
$this->state = $returnState;
return $this->temporaryBuffer;
}
# Not a valid state, unimplemented, or implemented elsewhere
else {
throw new \Exception("Invalid character reference state: ".(self::STATE_NAMES[$this->state] ?? $this->state)); // @codeCoverageIgnore
}
}
} // @codeCoverageIgnore
}