tokenizerState = static::DATA_STATE; $this->insertionMode = static::INITIAL_MODE; $this->quirksMode = static::QUIRKS_MODE_OFF; $this->stack = new Stack(); }

    /**
     * Tokenizes HTML supplied either directly as a string or as a file path.
     *
     * When called from parseFragment() the instance that method prepared (flagged
     * with fragmentCase) is reused; otherwise a fresh instance is created.
     *
     * @param string $data The markup, or the path handed to DataStream when $file is true.
     * @param bool   $file When true, $data is passed to DataStream as its second
     *                     argument and an empty string as its first.
     *
     * @return string Currently always the placeholder string 'OOK!'.
     */
    public static function parse(string $data, bool $file = false)
    {
        // parseFragment() has already built and configured an instance (context,
        // tokenizer state, insertion mode, fragment) before calling in here, and marks
        // it with fragmentCase. Only create a new instance when that is NOT the case;
        // creating one here would throw the fragment setup away.
        // NOTE(review): fragmentCase is never reset after parseFragment() returns, so
        // a later plain parse() call reuses that instance -- confirm whether the
        // flag should be cleared once fragment parsing has finished.
        $c = __CLASS__;
        if (!(static::$self instanceof $c) || !static::$self->fragmentCase) {
            static::$self = new $c;
        }

        // Process the input stream.
        if ($file === true) {
            static::$self->data = new DataStream('', $data);
        } else {
            static::$self->data = new DataStream($data, 'STDIN');
        }

        // Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
        // work on basic latin characters. Used extensively when tokenizing.
        setlocale(LC_CTYPE, 'en_US.UTF8');

        static::$self->tokenize();

        // TODO: return static::$self->fixDOM() once tree construction is in place; the
        // string below is a placeholder.
        return 'OOK!';
    }

    /**
     * Parses a string (or file) of HTML as a fragment and returns it as a
     * DOMDocumentFragment.
     *
     * @param string            $data    The markup, or a path when $file is true.
     * @param \DOMDocument|null $dom     Document that will own the fragment; a new one
     *                                   is created when omitted.
     * @param \DOMElement|null  $context Context element. Ignored (treated as null)
     *                                   unless $dom is its owner document.
     * @param bool              $file    Forwarded to parse().
     *
     * @return \DOMDocumentFragment The fragment created on the owning document.
     */
    public static function parseFragment(string $data, \DOMDocument $dom = null, \DOMElement $context = null, bool $file = false): \DOMDocumentFragment
    {
        // If a context is provided and either the DOM isn't provided or the DOM isn't
        // the owner document of the provided context then the context is invalid and
        // should be set to null.
        if (!is_null($context) && (is_null($dom) || !$dom->isSameNode($context->ownerDocument))) {
            $context = null;
        }

        // Create an instance of this class to use the non static properties.
        $c = __CLASS__;
        static::$self = new $c;

        if (is_null($dom)) {
            // NOTE(review): DOMImplementation is referenced without a leading
            // backslash while \DOMDocument and \DOMElement have one -- confirm it
            // resolves to the intended class in this file's namespace.
            $imp = new DOMImplementation;
            static::$self->DOM = $imp->createDocument();
        } else {
            static::$self->DOM = $dom;
        }

        static::$self->DOMFragment = static::$self->DOM->createDocumentFragment();

        // DEVIATION: The spec says to let the document be in quirks mode if the
        // DOMDocument is in quirks mode. Cannot check whether the context element is in
        // quirks mode, so going to assume it isn't.

        // DEVIATION: The spec's version of parsing fragments isn't remotely useful in
        // the context this library is intended for use in. This implementation uses a
        // DOMDocumentFragment for inserting nodes into. There's no need to have a
        // different process for when there isn't a context. There will always be one:
        // the DOMDocumentFragment.
        static::$self->fragmentContext = (!is_null($context)) ? $context : static::$self->DOMFragment;

        $name = static::$self->fragmentContext->nodeName;

        # Set the state of the HTML parser's tokenization stage as follows:
        switch ($name) {
            case 'title':
            case 'textarea':
                static::$self->tokenizerState = static::RCDATA_STATE;
                break;
            case 'style':
            case 'xmp':
            case 'iframe':
            case 'noembed':
            case 'noframes':
                static::$self->tokenizerState = static::RAWTEXT_STATE;
                break;
            case 'script':
                // The tokenizer's script state is named SCRIPT_DATA_STATE.
                static::$self->tokenizerState = static::SCRIPT_DATA_STATE;
                break;
            case 'plaintext':
                static::$self->tokenizerState = static::PLAINTEXT_STATE;
                break;
            default:
                // Includes 'noscript': the spec only switches it to RAWTEXT when the
                // scripting flag is enabled, and this implementation has no scripting,
                // so it stays in the data state like any other element.
                static::$self->tokenizerState = static::DATA_STATE;
        }

        // DEVIATION: Since this implementation uses a DOMDocumentFragment for insertion
        // there is no need to create an html element for inserting stuff into.

        # If the context element is a template element, push "in template" onto the
        # stack of template insertion modes so that it is the new current template
        # insertion mode.
        if ($name === 'template') {
            static::$self->templateInsertionModeStack[] = static::IN_TEMPLATE_MODE;
        }

        # Reset the parser's insertion mode appropriately.
        // DEVIATION: The insertion mode will be always 'in body', not 'before head' if
        // there isn't a context. There isn't a need to reconstruct a valid HTML
        // document when using a DOMDocumentFragment.
        static::$self->resetInsertionMode();

        # Set the parser's form element pointer to the nearest node to the context element
        # that is a form element (going straight up the ancestor chain, and including the
        # element itself, if it is a form element), if any. (If there is no such form
        # element, the form element pointer keeps its initial value, null.)
        if (is_null($context)) {
            // Without a context element there is no ancestor chain to search.
            static::$self->formElement = null;
        } elseif ($name === 'form') {
            static::$self->formElement = $context;
        } else {
            static::$self->formElement = DOM::getAncestor('form', $context);
        }

        # Start the parser and let it run until it has consumed all the characters just
        # inserted into the input stream.
        static::$self->fragmentCase = true;
        static::parse($data, $file);

        # If there is a context element, return the child nodes of root, in tree order.
        # Otherwise, return the children of the Document object, in tree order.
        // DEVIATION: This method will always return a DOMDocumentFragment.
        return static::$self->DOMFragment;
    }

    /**
     * Post-processes a DOM before it is handed back to the caller.
     *
     * Outside the fragment case it runs relaxNGValidateSource() on the document,
     * then normalizes the document in every case.
     *
     * @param \DOMDocument|null $dom Document to fix; defaults to $this->DOM.
     *
     * @return \DOMDocument The same document after normalization.
     */
    protected function fixDOM($dom = null)
    {
        if (is_null($dom)) {
            $dom = &$this->DOM;
        }

        // TODO: Take fragments, append them to a document, fix them, and then return a
        // fragment so selecting id attributes works on fragments.

        // Fix id attributes so they may be selected by the DOM. Fix the PHP id attribute
        // bug. Allows DOMDocument->getElementById() to work on id attributes.
        // NOTE(review): the schema string passed here is a single space, which does not
        // look like a usable RELAX NG grammar -- confirm whether the original grammar
        // text was lost.
        if (!static::$self->fragmentCase) {
            $dom->relaxNGValidateSource(' ');
        }

        # Normalize the document before outputting.
        $dom->normalize();

        return $dom;
    }

    protected function tokenize() { # The tokenizer state machine consists of the states defined in the following # subsections. // DEVIATION: The tokenizer spec has it work around NULL characters. // HTML5DataStream removes all NULL characters from the document instead. There // isn't a need to work around them when there isn't any scripting in this // implementation. the HTML5DataStream class removes them and triggers parse errors // then instead. So, all mentions of "U+0000 NULL" in the spec are ignored.
while (true) { if (static::$debug) { echo "State: "; switch ($this->tokenizerState) { case static::DATA_STATE: echo "Data\n"; break; case static::RCDATA_STATE: echo "RCDATA\n"; break; case static::RAWTEXT_STATE: echo "RAWTEXT\n"; break; case static::SCRIPT_DATA_STATE: echo "Script data\n"; break; case static::PLAINTEXT_STATE: echo "PLAINTEXT\n"; break; case static::TAG_OPEN_STATE: echo "Tag open\n"; break; case static::END_TAG_OPEN_STATE: echo "End tag open\n"; break; case static::TAG_NAME_STATE: echo "Tag name\n"; break; case static::RCDATA_LESS_THAN_SIGN_STATE: echo "RCDATA less-than sign\n"; break; case static::RCDATA_END_TAG_OPEN_STATE: echo "RCDATA end tag open\n"; break; case static::RCDATA_END_TAG_NAME_STATE: echo "RCDATA end tag name\n"; break; case static::RAWTEXT_LESS_THAN_SIGN_STATE: echo "RAWTEXT less than sign\n"; break; case static::RAWTEXT_END_TAG_OPEN_STATE: echo "RAWTEXT end tag open\n"; break; case static::RAWTEXT_END_TAG_NAME_STATE: echo "RAWTEXT end tag name\n"; break; case static::SCRIPT_DATA_LESS_THAN_SIGN_STATE: echo "Script data less-than sign\n"; break; case static::SCRIPT_DATA_END_TAG_OPEN_STATE: echo "Script data end tag open\n"; break; case static::SCRIPT_DATA_END_TAG_NAME_STATE: echo "Script data end tag name\n"; break; case static::SCRIPT_DATA_ESCAPE_START_STATE: echo "Script data escape start\n"; break; case static::SCRIPT_DATA_ESCAPE_START_DASH_STATE: echo "Script data escape start dash\n"; break; case static::SCRIPT_DATA_ESCAPED_STATE: echo "Script data escaped\n"; break; case static::SCRIPT_DATA_ESCAPED_DASH_STATE: echo "Script data escaped dash\n"; break; case static::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: echo "Script data escaped dash dash\n"; break; case static::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data escaped less-than sign\n"; break; case static::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: echo "Script data escaped end tag open\n"; break; case static::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: echo "Script data 
escaped end tag name\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: echo "Script data double escape start\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: echo "Script data double escaped\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: echo "Script data double escaped dash\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: echo "Script data double escaped dash dash\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data double escaped less-than sign\n"; break; case static::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: echo "Script data double escape end\n"; break; case static::BEFORE_ATTRIBUTE_NAME_STATE: echo "Before attribute\n"; break; case static::ATTRIBUTE_NAME_STATE: echo "Attribute name\n"; break; case static::AFTER_ATTRIBUTE_NAME_STATE: echo "After attribute name\n"; break; case static::BEFORE_ATTRIBUTE_VALUE_STATE: echo "Before attribute value\n"; break; case static::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: echo "Attribute value (double quoted)\n"; break; case static::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: echo "Attribute value (single quoted)\n"; break; case static::ATTRIBUTE_VALUE_UNQUOTED_STATE: echo "Attribute value (unquoted)\n"; break; case static::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: echo "After attribute value (quoted)\n"; break; case static::SELF_CLOSING_START_TAG_STATE: echo "Self-closing start tag\n"; break; case static::BOGUS_COMMENT_STATE: echo "Bogus comment\n"; break; case static::MARKUP_DECLARATION_OPEN_STATE: echo "Markup declaration open\n"; break; case static::COMMENT_START_STATE: echo "Comment start\n"; break; case static::COMMENT_START_DASH_STATE: echo "Comment start dash\n"; break; case static::COMMENT_STATE: echo "Comment\n"; break; case static::COMMENT_END_DASH_STATE: echo "Comment end dash\n"; break; case static::COMMENT_END_STATE: echo "Comment end\n"; break; case static::COMMENT_END_BANG_STATE: echo "Comment end bang\n"; break; case static::DOCTYPE_STATE: 
echo "DOCTYPE\n"; break; case static::BEFORE_DOCTYPE_NAME_STATE: echo "Before DOCTYPE name\n"; break; case static::DOCTYPE_NAME_STATE: echo "DOCTYPE name\n"; break; case static::AFTER_DOCTYPE_NAME_STATE: echo "After DOCTYPE name\n"; break; case static::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: echo "After DOCTYPE public keyword\n"; break; case static::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "Before DOCTYPE public identifier\n"; break; case static::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE public identifier (double quoted)\n"; break; case static::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE public identifier (single quoted)\n"; break; case static::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "After DOCTYPE public identifier\n"; break; case static::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: echo "Between DOCTYPE public and system identifiers\n"; break; case static::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: echo "After DOCTYPE system keyword\n"; break; case static::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "Before DOCTYPE system identifier\n"; break; case static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE system identifier (double-quoted)\n"; break; case static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE system identifier (single-quoted)\n"; break; case static::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "After DOCTYPE system identifier\n"; break; case static::BOGUS_DOCTYPE_STATE: echo "Bogus comment\n"; break; case static::CDATA_SECTION_STATE: echo "CDATA section\n"; } } # 12.2.4.1 Data state if ($this->tokenizerState === static::DATA_STATE) { # Consume the next input character $char = $this->data->consume(); # U+0026 AMPERSAND (&) if ($char === '&') { # Switch to the character reference in data state. # 8.2.4.2 Character reference in data state: # Switch to the data state. # Attempt to consume a character reference, with no additional allowed character. 
# If nothing is returned, emit a U+0026 AMPERSAND character (&) token. # Otherwise, emit the character tokens that were returned. // DEVIATION: This implementation does the character reference consuming in a // function for which it is more suited for. $this->emitToken(new CharacterToken($this->data->consumeCharacterReference())); } # U+003C LESS-THAN SIGN (<) elseif ($char === '<') { # Switch to the tag open state. $this->tokenizerState = static::TAG_OPEN_STATE; } # EOF elseif ($char === '') { # Emit an end-of-file token. $token = new EOFToken(); $this->emitToken($token); break; } # Anything else else { # Emit the current input character as a character token. // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. $this->emitToken(new CharacterToken($char.$this->data->consumeUntil('&<'))); } } # 12.2.4.2 Character reference in data state // OPTIMIZATION: This is instead done in the block above. # 12.2.4.3 RCDATA state elseif ($this->tokenizerState === static::RCDATA_STATE) { # Consume the next input character $char = $this->data->consume(); # U+0026 AMPERSAND (&) if ($char === '&') { # Switch to the character reference in RCDATA state. # 8.2.4.4 Character reference in RCDATA state: # Switch to the RCDATA state. # Attempt to consume a character reference, with no additional allowed character. # If nothing is returned, emit a U+0026 AMPERSAND character (&) token. # Otherwise, emit the character tokens that were returned. // DEVIATION: This implementation does the character reference consuming in a // function for which it is more suited for. $this->emitToken(new CharacterToken($this->data->consumeCharacterReference())); } # U+003C LESS-THAN SIGN (<) elseif ($char === '<') { # Switch to the RCDATA less-than sign state. $this->tokenizerState = static::RCDATA_LESS_THAN_SIGN_STATE; } # EOF elseif ($char === '') { # Emit an end-of-file token. 
$this->emitToken(new EOFToken()); break; } # Anything else else { # Emit the current input character as a character token. // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. $this->emitToken(new CharacterToken($char.$this->data->consumeUntil('&<'))); } } # 12.2.4.4 Character reference in RCDATA state // OPTIMIZATION: This is instead done in the block above. # 12.2.4.5 RAWTEXT state elseif ($this->tokenizerState === static::RAWTEXT_STATE) { # Consume the next input character $char = $this->data->consume(); # U+003C LESS-THAN SIGN (<) if ($char === '<') { # Switch to the RAWTEXT less-than sign state. $this->tokenizerState = static::RAWTEXT_LESS_THAN_SIGN_STATE; } # EOF elseif ($char === '') { # Emit an end-of-file token. $this->emitToken(new EOFToken()); break; } # Anything else else { # Emit the current input character as a character token. // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. $this->emitToken(new CharacterToken($char.$this->data->consumeUntil('<'))); } } # 12.2.4.6 Script data state elseif ($this->tokenizerState === static::SCRIPT_DATA_STATE) { # Consume the next input character $char = $this->data->consume(); # U+003C LESS-THAN SIGN (<) if ($char === '<') { # Switch to the script data less-than sign state. $this->tokenizerState = static::SCRIPT_DATA_LESS_THAN_SIGN_STATE; } # EOF elseif ($char === '') { # Emit an end-of-file token. $this->emitToken(new EOFToken()); break; } # Anything else else { # Emit the current input character as a character token. // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. 
$this->emitToken(new CharacterToken($char.$this->data->consumeUntil('<'))); } } # 12.2.4.7 PLAINTEXT state elseif ($this->tokenizerState === static::PLAINTEXT_STATE) { # Consume the next input character $char = $this->data->consume(); # EOF if ($char === '') { # Emit an end-of-file token. $this->emitToken(new EOFToken()); break; } # Anything else else { # Emit the current input character as a character token. // OPTIMIZATION: Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back through here // every single time. $this->emitToken(new CharacterToken($char.$this->data->consumeUntil(''))); } } # 12.2.4.8 Tag open state elseif ($this->tokenizerState === static::TAG_OPEN_STATE) { # Consume the next input character $char = $this->data->consume(); # U+0021 EXCLAMATION MARK (!) if ($char === '!') { # Switch to the markup declaration open state. $this->tokenizerState = static::MARKUP_DECLARATION_OPEN_STATE; } # U+002F SOLIDUS (/) elseif ($char === '/') { # Switch to the end tag open state. $this->tokenizerState = static::END_TAG_OPEN_STATE; } # Uppercase ASCII letter # Lowercase ASCII letter elseif (ctype_alpha($char)) { # Uppercase: # Create a new start tag token, set its tag name to the lowercase version of the # current input character (add 0x0020 to the character's code point), then switch # to the tag name state. (Don't emit the token yet; further details will be filled # in before it is emitted.) # Lowercase: # Create a new start tag token, set its tag name to the current input character, # then switch to the tag name state. (Don't emit the token yet; further details # will be filled in before it is emitted.) // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having // to loop back through here every single time. 
$token = new StartTagToken(strtolower($char.$this->data->consumeWhile(static::CTYPE_ALPHA))); $this->tokenizerState = static::TAG_NAME_STATE; } # U+003F QUESTION MARK (?) elseif ($char === '?') { # Parse error. Switch to the bogus comment state. // Making errors more expressive. if ($char !== '') { ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $this->data, $char); } else { ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'tag name'); } $this->tokenizerState = static::BOGUS_COMMENT_STATE; } # Anything else else { # Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character # token. Reconsume the current input character. // Making errors more expressive. if ($char !== '') { ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $this->data, $char); } else { ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'tag name'); } $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } } # 8.2.4.9 End tag open state elseif ($this->tokenizerState === static::END_TAG_OPEN_STATE) { # Consume the next input character $char = $this->data->consume(); # Uppercase ASCII letter # Lowercase ASCII letter if (ctype_alpha($char)) { # Uppercase: # Create a new end tag token, set its tag name to the lowercase version of the # current input character (add 0x0020 to the character's code point), then switch # to the tag name state. (Don't emit the token yet; further details will be filled # in before it is emitted.) # Lowercase: # Create a new end tag token, set its tag name to the current input character, # then switch to the tag name state. (Don't emit the token yet; further details # will be filled in before it is emitted.) // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having // to loop back through here every single time. 
$token = new EndTagToken(strtolower($char.$this->data->consumeWhile(static::CTYPE_ALPHA))); $this->tokenizerState = static::TAG_NAME_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Switch to the data state. ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $this->data, $char); $this->tokenizerState = static::DATA_STATE; } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Emit a U+003C LESS-THAN SIGN character # token and a U+002F SOLIDUS character token. Reconsume the EOF character. // Making errors more expressive. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'tag name'); $this->tokenizerState = static::DATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } # Anything else else { # Parse error. Switch to the bogus comment state. ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $this->data, $char); $this->tokenizerState = static::BOGUS_COMMENT_STATE; } } # 8.2.4.10 Tag name state elseif ($this->tokenizerState === static::TAG_NAME_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Switch to the before attribute name state. $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; } # "/" (U+002F) elseif ($char === '/') { # Switch to the self-closing start tag state. $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE; } # ">" (U+003E) elseif ($char === '>') { # Switch to the data state. Emit the current tag token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # Uppercase ASCII letter elseif (ctype_upper($char)) { # Append the lowercase version of the current input character (add 0x0020 to the # character's code point) to the current tag token's tag name. // OPTIMIZATION: Consume all characters that are Uppercase ASCII characters to // prevent having to loop back through here every single time. 
$token->name = $token->name.strtolower($char.$this->data->consumeWhile(static::CTYPE_UPPER)); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Reconsume the EOF character. // Making errors more expressive. if ($char !== '') { ParseError::trigger(ParseError::TAG_NAME_EXPECTED, $this->data, $char); } else { ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'tag name'); } $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } # Anything else else { # Append the current input character to the current tag token's tag name. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $token->name = $token->name.$char.$this->data->consumeUntil("\t\n\x0c />".static::CTYPE_UPPER); } } # 8.2.4.11 RCDATA less-than sign state elseif ($this->tokenizerState === static::RCDATA_LESS_THAN_SIGN_STATE) { # Consume the next input character $char = $this->data->consume(); # "/" (U+002F) if ($char === '/') { # Set the temporary buffer to the empty string. Switch to the RCDATA end tag open # state. $temporaryBuffer = ''; $this->tokenizerState = static::RCDATA_END_TAG_OPEN_STATE; } # Anything else else { # Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token. # Reconsume the current input character. $this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('<')); $this->data->unconsume(); } } # 8.2.4.12 RCDATA end tag open state elseif ($this->tokenizerState === static::RCDATA_END_TAG_OPEN_STATE) { # Consume the next input character $char = $this->data->consume(); # Uppercase ASCII letter # Lowercase ASCII letter if (ctype_alpha($char)) { # Uppercase: # Create a new end tag token, and set its tag name to the lowercase version of the # current input character (add 0x0020 to the character's code point). Append the # current input character to the temporary buffer. Finally, switch to the RCDATA # end tag name state. 
(Don't emit the token yet; further details will be filled in # before it is emitted.) # Lowercase: # Create a new end tag token, and set its tag name to the current input character. # Append the current input character to the temporary buffer. Finally, switch to # the RCDATA end tag name state. (Don't emit the token yet; further details will # be filled in before it is emitted.) // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having // to loop back through here every single time. $token = new EndTagToken(strtolower($char)); $temporaryBuffer .= $char; $this->tokenizerState = static::RCDATA_END_TAG_NAME_STATE; } # Anything else else { # Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token and a # U+002F SOLIDUS character token. Reconsume the current input character. $this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # 8.2.4.13 RCDATA end tag name state elseif ($this->tokenizerState === static::RCDATA_END_TAG_NAME_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # If the current end tag token is an appropriate end tag token, then switch to the # before attribute name state. Otherwise, treat it as per the "anything else" # entry below. if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; } else { $this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # "/" (U+002F) elseif ($char === '/') { # If the current end tag token is an appropriate end tag token, then switch to the # self-closing start tag state. Otherwise, treat it as per the "anything else" # entry below. 
if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE; } else { $this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # ">" (U+003E) elseif ($char === '>') { # If the current end tag token is an appropriate end tag token, then switch to the # data state and emit the current tag token. Otherwise, treat it as per the # "anything else" entry below. if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } else { $this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # Uppercase ASCII letter # Lowercase ASCII letter elseif (ctype_alpha($char)) { # Uppercase: # Append the lowercase version of the current input character (add 0x0020 to the # character's code point) to the current tag token's tag name. Append the current # input character to the temporary buffer. # Lowercase: # Append the current input character to the current tag token's tag name. Append # the current input character to the temporary buffer. // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having // to loop back through here every single time. $token->name .= $token->name.strtolower($char.$this->data->consumeWhile(static::CTYPE_ALPHA)); $temporaryBuffer .= $char; } # Anything else else { # Switch to the RCDATA state. Emit a U+003C LESS-THAN SIGN character token, a # U+002F SOLIDUS character token, and a character token for each of the characters # in the temporary buffer (in the order they were added to the buffer). Reconsume # the current input character. 
$this->tokenizerState = static::RCDATA_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # 8.2.4.14 RAWTEXT less-than sign state elseif ($this->tokenizerState === static::RAWTEXT_LESS_THAN_SIGN_STATE) { # Consume the next input character $char = $this->data->consume(); # "/" (U+002F) if ($char === '/') { # Set the temporary buffer to the empty string. Switch to the RAWTEXT end tag open # state. $temporaryBuffer = ''; $this->tokenizerState = static::RAWTEXT_END_TAG_OPEN_STATE; } # Anything else else { # Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token. # Reconsume the current input character. $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('<')); $this->data->unconsume(); } } # 8.2.4.15 RAWTEXT end tag open state elseif ($this->tokenizerState === static::RAWTEXT_END_TAG_OPEN_STATE) { # Consume the next input character $char = $this->data->consume(); # Uppercase ASCII letter # Lowercase ASCII letter if (ctype_alpha($char)) { # Uppercase: # Create a new end tag token, and set its tag name to the lowercase version of the # current input character (add 0x0020 to the character's code point). Append the # current input character to the temporary buffer. Finally, switch to the RAWTEXT # end tag name state. (Don't emit the token yet; further details will be filled in # before it is emitted.) # Lowercase: # Create a new end tag token, and set its tag name to the current input character. # Append the current input character to the temporary buffer. Finally, switch to # the RAWTEXT end tag name state. (Don't emit the token yet; further details will # be filled in before it is emitted.) // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. $token = new EndTagToken(strtolower($char)); $temporaryBuffer .= $char; $this->tokenizerState = static::RAWTEXT_END_TAG_NAME_STATE; } # Anything else else { # Switch to the RAWTEXT state. 
Emit a U+003C LESS-THAN SIGN character token and a # U+002F SOLIDUS character token. Reconsume the current input character. $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # 8.2.4.16 RAWTEXT end tag name state elseif ($this->tokenizerState === static::RAWTEXT_END_TAG_NAME_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # If the current end tag token is an appropriate end tag token, then switch to the # before attribute name state. Otherwise, treat it as per the "anything else" # entry below. if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; } else { $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # "/" (U+002F) elseif ($char === '/') { # If the current end tag token is an appropriate end tag token, then switch to the # self-closing start tag state. Otherwise, treat it as per the "anything else" # entry below. if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE; } else { $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # ">" (U+003E) elseif ($char === '>') { # If the current end tag token is an appropriate end tag token, then switch to the # data state and emit the current tag token. Otherwise, treat it as per the # "anything else" entry below. 
if ($token->name === $this->stack->currentNode()->name) { $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } else { $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # Uppercase ASCII letter # Lowercase ASCII letter elseif (ctype_alpha($char)) { # Uppercase: # Append the lowercase version of the current input character (add 0x0020 to the # character's code point) to the current tag token's tag name. Append the current # input character to the temporary buffer. # Lowercase: # Append the current input character to the current tag token's tag name. Append # the current input character to the temporary buffer. // OPTIMIZATION: Will just check for alpha characters and strtolower the // characters. // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having // to loop back through here every single time. $token->name .= $token->name.strtolower($char.$this->data->consumeWhile(static::CTYPE_ALPHA)); $temporaryBuffer .= $char; } # Anything else else { # Switch to the RAWTEXT state. Emit a U+003C LESS-THAN SIGN character token, a # U+002F SOLIDUS character token, and a character token for each of the characters # in the temporary buffer (in the order they were added to the buffer). Reconsume # the current input character. $this->tokenizerState = static::RAWTEXT_STATE; $this->emitToken(new CharacterToken('data->unconsume(); } } # 8.2.4.17 Script data less-than sign state elseif ($this->tokenizerState === static::SCRIPT_DATA_LESS_THAN_SIGN_STATE) { # Consume the next input character $char = $this->data->consume(); # "/" (U+002F) if ($char === '/') { # Set the temporary buffer to the empty string. Switch to the script data end tag # open state. $temporaryBuffer = ''; $this->tokenizerState = static::SCRIPT_DATA_END_TAG_OPEN_STATE; } # "!" (U+0021) elseif ($char === '!') { # Switch to the script data escape start state. 
# Emit a U+003C LESS-THAN SIGN character token and a U+0021 EXCLAMATION MARK
# character token.
$this->tokenizerState = static::SCRIPT_DATA_ESCAPE_START_STATE;
$this->emitToken(new CharacterToken('<!'));
    }
    # Anything else
    else {
        # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token.
        # Reconsume the current input character.
        $this->tokenizerState = static::SCRIPT_DATA_STATE;
        $this->emitToken(new CharacterToken('<'));
        $this->data->unconsume();
    }
}

# 8.2.4.18 Script data end tag open state
elseif ($this->tokenizerState === static::SCRIPT_DATA_END_TAG_OPEN_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # Uppercase ASCII letter
    # Lowercase ASCII letter
    if (ctype_alpha($char)) {
        # Uppercase:
        # Create a new end tag token, and set its tag name to the lowercase version of the
        # current input character (add 0x0020 to the character's code point). Append the
        # current input character to the temporary buffer. Finally, switch to the script
        # data end tag name state. (Don't emit the token yet; further details will be
        # filled in before it is emitted.)
        # Lowercase:
        # Create a new end tag token, and set its tag name to the current input character.
        # Append the current input character to the temporary buffer. Finally, switch to
        # the script data end tag name state. (Don't emit the token yet; further details
        # will be filled in before it is emitted.)

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        $token = new EndTagToken(strtolower($char));
        $temporaryBuffer .= $char;
        $this->tokenizerState = static::SCRIPT_DATA_END_TAG_NAME_STATE;
    }
    # Anything else
    else {
        # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token
        # and a U+002F SOLIDUS character token. Reconsume the current input character.
$this->tokenizerState = static::SCRIPT_DATA_STATE;
$this->emitToken(new CharacterToken('</'));
$this->data->unconsume();
    }
}

# 8.2.4.19 Script data end tag name state
elseif ($this->tokenizerState === static::SCRIPT_DATA_END_TAG_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNode()->name) {
            $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE;
        } else {
            // "Anything else": give back "</" plus everything buffered so far as
            // character data and reprocess the current character as script data.
            $this->tokenizerState = static::SCRIPT_DATA_STATE;
            $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
            $this->data->unconsume();
        }
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNode()->name) {
            $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE;
        } else {
            $this->tokenizerState = static::SCRIPT_DATA_STATE;
            $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
            $this->data->unconsume();
        }
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
if ($token->name === $this->stack->currentNode()->name) {
    $this->tokenizerState = static::DATA_STATE;
    $this->emitToken($token);
} else {
    // "Anything else": give back "</" plus everything buffered so far as
    // character data and reprocess the current character as script data.
    $this->tokenizerState = static::SCRIPT_DATA_STATE;
    $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
    $this->data->unconsume();
}
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        // The whole consumed run is appended exactly once: lowercased to the tag
        // name, and verbatim to the temporary buffer so that it can be emitted
        // unchanged if this turns out not to be an appropriate end tag.
        $char .= $this->data->consumeWhile(static::CTYPE_ALPHA);
        $token->name .= strtolower($char);
        $temporaryBuffer .= $char;
    }
    # Anything else
    else {
        # Switch to the script data state. Emit a U+003C LESS-THAN SIGN character token, a
        # U+002F SOLIDUS character token, and a character token for each of the characters
        # in the temporary buffer (in the order they were added to the buffer). Reconsume
        # the current input character.
        $this->tokenizerState = static::SCRIPT_DATA_STATE;
        $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
        $this->data->unconsume();
    }
}

# 8.2.4.20 Script data escape start state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPE_START_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escape start dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPE_START_DASH_STATE;
        $this->emitToken(new CharacterToken('-'));
    }
    # Anything else
    else {
        # Switch to the script data state. Reconsume the current input character.
$this->tokenizerState = static::SCRIPT_DATA_STATE;
$this->data->unconsume();
    }
}

# 8.2.4.21 Script data escape start dash state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPE_START_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    if ($char !== '-') {
        # Anything else
        # Switch to the script data state. Reconsume the current input character.
        $this->tokenizerState = static::SCRIPT_DATA_STATE;
        $this->data->unconsume();
    } else {
        # "-" (U+002D)
        # Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
        $this->emitToken(new CharacterToken('-'));
    }
}

# 8.2.4.22 Script data escaped state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escaped dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_DASH_STATE;
        $this->emitToken(new CharacterToken('-'));
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->tokenizerState = static::DATA_STATE;
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'script data');
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Emit the current input character as a character token.

        // OPTIMIZATION: Everything up to the next "-" or "<" is taken in one go and
        // emitted together, so this branch is not re-entered for every character.
$this->emitToken(new CharacterToken($char.$this->data->consumeUntil('-<')));
    }
}

# 8.2.4.23 Script data escaped dash state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data escaped dash dash state. Emit a U+002D HYPHEN-MINUS
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
        $this->emitToken(new CharacterToken('-'));
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
        $this->tokenizerState = static::DATA_STATE;
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'script data');
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit the current input character as a
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
        $this->emitToken(new CharacterToken($char));
    }
}

# 8.2.4.24 Script data escaped dash dash state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Emit a U+002D HYPHEN-MINUS character token.
        $this->emitToken(new CharacterToken('-'));
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data escaped less-than sign state.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the script data state. Emit a U+003E GREATER-THAN SIGN character
        # token.
        $this->tokenizerState = static::SCRIPT_DATA_STATE;
        $this->emitToken(new CharacterToken('>'));
    }
    # EOF
    elseif ($char === '') {
        # Switch to the data state. Parse error. Reconsume the EOF character.
$this->tokenizerState = static::DATA_STATE;
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'script data');
$this->data->unconsume();
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit the current input character as a
        # character token.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
        $this->emitToken(new CharacterToken($char));
    }
}

# 8.2.4.25 Script data escaped less-than sign state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data escaped
        # end tag open state.

        // The buffer must be reset (assigned, not appended to); otherwise letters
        // left over from an earlier tag would be prefixed to this end tag's name
        // when the buffer is emitted again.
        $temporaryBuffer = '';
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Set the temporary buffer to the empty string. Append the lowercase version of
        # the current input character (add 0x0020 to the character's code point) to the
        # temporary buffer. Switch to the script data double escape start state. Emit a
        # U+003C LESS-THAN SIGN character token and the current input character as a
        # character token.
        # Lowercase:
        # Set the temporary buffer to the empty string. Append the current input character
        # to the temporary buffer. Switch to the script data double escape start state.
        # Emit a U+003C LESS-THAN SIGN character token and the current input character as
        # a character token.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        $temporaryBuffer = strtolower($char);
        $this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
        $this->emitToken(new CharacterToken('<'.$char));
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token. Reconsume the current input character.
$this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
// The pending "<" is what has to be emitted here. The current character is
// reconsumed right after, so emitting it instead would drop the "<" and
// output that character twice.
$this->emitToken(new CharacterToken('<'));
$this->data->unconsume();
    }
}

# 8.2.4.26 Script data escaped end tag open state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # Uppercase ASCII letter
    # Lowercase ASCII letter
    if (ctype_alpha($char)) {
        # Uppercase:
        # Create a new end tag token, and set its tag name to the lowercase version of the
        # current input character (add 0x0020 to the character's code point). Append the
        # current input character to the temporary buffer. Finally, switch to the script
        # data escaped end tag name state. (Don't emit the token yet; further details will
        # be filled in before it is emitted.)
        # Lowercase:
        # Create a new end tag token, and set its tag name to the current input character.
        # Append the current input character to the temporary buffer. Finally, switch to
        # the script data escaped end tag name state. (Don't emit the token yet; further
        # details will be filled in before it is emitted.)

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        $token = new EndTagToken(strtolower($char));
        $temporaryBuffer .= $char;
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token and a U+002F SOLIDUS character token. Reconsume the current input
        # character.
$this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
$this->emitToken(new CharacterToken('</'));
$this->data->unconsume();
    }
}

# 8.2.4.27 Script data escaped end tag name state
elseif ($this->tokenizerState === static::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # before attribute name state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNode()->name) {
            $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE;
        } else {
            // "Anything else": give back "</" plus everything buffered so far as
            // character data and reprocess the current character as escaped
            // script data.
            $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
            $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
            $this->data->unconsume();
        }
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # self-closing start tag state. Otherwise, treat it as per the "anything else"
        # entry below.
        if ($token->name === $this->stack->currentNode()->name) {
            $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE;
        } else {
            $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
            $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
            $this->data->unconsume();
        }
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # If the current end tag token is an appropriate end tag token, then switch to the
        # data state and emit the current tag token. Otherwise, treat it as per the
        # "anything else" entry below.
if ($token->name === $this->stack->currentNode()->name) {
    $this->tokenizerState = static::DATA_STATE;
    $this->emitToken($token);
} else {
    // "Anything else": give back "</" plus everything buffered so far as
    // character data and reprocess the current character as escaped script data.
    $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
    $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
    $this->data->unconsume();
}
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current tag token's tag name. Append the current
        # input character to the temporary buffer.
        # Lowercase:
        # Append the current input character to the current tag token's tag name. Append
        # the current input character to the temporary buffer.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        // The whole consumed run is appended exactly once: lowercased to the tag
        // name, and verbatim to the temporary buffer so that it can be emitted
        // unchanged if this turns out not to be an appropriate end tag.
        $char .= $this->data->consumeWhile(static::CTYPE_ALPHA);
        $token->name .= strtolower($char);
        $temporaryBuffer .= $char;
    }
    # Anything else
    else {
        # Switch to the script data escaped state. Emit a U+003C LESS-THAN SIGN character
        # token, a U+002F SOLIDUS character token, and a character token for each of the
        # characters in the temporary buffer (in the order they were added to the buffer).
        # Reconsume the current input character.
        $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
        $this->emitToken(new CharacterToken('</'.$temporaryBuffer));
        $this->data->unconsume();
    }
}

# 8.2.4.29 Script data double escaped state
elseif ($this->tokenizerState === static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the script data double escaped dash dash state. Emit a U+002D
        # HYPHEN-MINUS character token.
$this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
$this->emitToken(new CharacterToken('-'));
    }
    # "<" (U+003C)
    elseif ($char === '<') {
        # Switch to the script data double escaped less-than sign state. Emit a U+003C
        # LESS-THAN SIGN character token.

        // Uses the same constant the less-than sign handler below tests for, so
        // that the "<" is actually routed to that handler.
        $this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
        $this->emitToken(new CharacterToken('<'));
    }
    // NOTE(review): the "-" entry above and the ">" entry below read like the
    // spec's double escaped dash / dash dash states rather than the double
    // escaped state this handler is keyed on -- confirm the intended state
    // coverage before relying on this block.
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the script data state. Emit a U+003E GREATER-THAN SIGN character
        # token.
        $this->tokenizerState = static::SCRIPT_DATA_STATE;
        $this->emitToken(new CharacterToken('>'));
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'script data');
        $this->tokenizerState = static::DATA_STATE;
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Emit the current input character
        # as a character token.
        $this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        $this->emitToken(new CharacterToken($char));
    }
}

# 8.2.4.32 Script data double escaped less-than sign state
elseif ($this->tokenizerState === static::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "/" (U+002F)
    if ($char === '/') {
        # Set the temporary buffer to the empty string. Switch to the script data double
        # escape end state. Emit a U+002F SOLIDUS character token.
        $temporaryBuffer = '';
        // Assignment, not comparison: a bare "===" expression is a no-op and
        // would leave the tokenizer in this state.
        $this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
        $this->emitToken(new CharacterToken('/'));
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.
// Assignment, not comparison: a bare "===" expression is a no-op, so the state
// would never change and the reconsumed character would be fed back into the
// same handler indefinitely.
$this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
$this->data->unconsume();
    }
}

# 8.2.4.33 Script data double escape end state
elseif ($this->tokenizerState === static::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    # "/" (U+002F)
    # ">" (U+003E)
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>') {
        # If the temporary buffer is the string "script", then switch to the script data
        # escaped state. Otherwise, switch to the script data double escaped state. Emit
        # the current input character as a character token.
        if ($temporaryBuffer === 'script') {
            $this->tokenizerState = static::SCRIPT_DATA_ESCAPED_STATE;
        } else {
            $this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        }

        // The character is emitted whichever state was chosen; emitting it only
        // in the "else" branch would swallow the terminator after "</script".
        $this->emitToken(new CharacterToken($char));
    }
    # Uppercase ASCII letter
    # Lowercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Uppercase:
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the temporary buffer. Emit the current input
        # character as a character token.
        # Lowercase:
        # Append the current input character to the temporary buffer. Emit the current
        # input character as a character token.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
        // OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $char = $char.$this->data->consumeWhile(static::CTYPE_ALPHA);
        $temporaryBuffer .= strtolower($char);
        $this->emitToken(new CharacterToken($char));
    }
    # Anything else
    else {
        # Switch to the script data double escaped state. Reconsume the current input
        # character.
// "Anything else" in the double escape end state goes back to the *double*
// escaped state; the plain escaped state would end the double escape early.
$this->tokenizerState = static::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
$this->data->unconsume();
    }
}

# 8.2.4.34 Before attribute name state
elseif ($this->tokenizerState === static::BEFORE_ATTRIBUTE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        continue;
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # lowercase version of the current input character (add 0x0020 to the character's
        # code point), and its value to the empty string. Switch to the attribute name
        # state.

        // DEVIATION: Will use a buffer for the attribute name instead.
        $attributeName = strtolower($char);
        $attributeValue = '';
        $this->tokenizerState = static::ATTRIBUTE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute name');
        $this->tokenizerState = static::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # Anything else
    else {
        # Quotes, less than sign, equals:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # current input character, and its value to the empty string. Switch to the
        # attribute name state.
if (in_array($char, ['"', "'", '<', '='], true)) {
    ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute name');
}

// DEVIATION: The attribute name is collected in a buffer rather than on the token.
$attributeName = $char;
$attributeValue = '';
$this->tokenizerState = static::ATTRIBUTE_NAME_STATE;
    }
}

# 8.2.4.35 Attribute name state
elseif ($this->tokenizerState === static::ATTRIBUTE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    // Every character that ends the attribute name reports a duplicate name
    // first, before the state is changed.
    $endsName = in_array($char, ["\t", "\n", "\x0c", ' ', '/', '=', '>'], true);
    if ($endsName && $token->hasAttribute($attributeName)) {
        ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $this->data, $attributeName);
    }

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Switch to the after attribute name state.
        $this->tokenizerState = static::AFTER_ATTRIBUTE_NAME_STATE;
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE;
    }
    # "=" (U+003D)
    elseif ($char === '=') {
        # Switch to the before attribute value state.
        $this->tokenizerState = static::BEFORE_ATTRIBUTE_VALUE_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->tokenizerState = static::DATA_STATE;

        // The buffered attribute name and value are copied onto the token before it
        // is emitted, when there is a name to copy.
if ($attributeName) {
    $token->setAttribute($attributeName, $attributeValue);
}

$this->emitToken($token);
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current attribute's name.

        // OPTIMIZATION: Consume all characters that are uppercase ASCII letters to prevent
        // having to loop back through here every single time.
        // consumeWhile() is a method of the data stream; without the "->" this
        // line would subtract the result of an undefined global function from it.
        $attributeName .= strtolower($char.$this->data->consumeWhile(static::CTYPE_UPPER));
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute name');
        $this->tokenizerState = static::DATA_STATE;
        $this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # Anything else
    else {
        # Quotes, less than sign, equals:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's name.
        if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
            ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute name');
        }

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $attributeName .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".static::CTYPE_UPPER);
    }

    # When the user agent leaves the attribute name state (and before emitting the tag
    # token, if appropriate), the complete attribute's name must be compared to the
    # other attributes on the same token; if there is already an attribute on the
    # token with the exact same name, then this is a parse error and the new attribute
    # must be removed from the token.
// DEVIATION: Because this implementation uses a buffer to hold the attribute name
// it is only added if it is valid. The result is the same, though.
}

# 8.2.4.36 After attribute name state
elseif ($this->tokenizerState === static::AFTER_ATTRIBUTE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Ignore the character.
        continue;
    }
    # "/" (U+002F)
    elseif ($char === '/') {
        # Switch to the self-closing start tag state.
        $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE;
    }
    # "=" (U+003D)
    elseif ($char === '=') {
        # Switch to the before attribute value state.
        $this->tokenizerState = static::BEFORE_ATTRIBUTE_VALUE_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->tokenizerState = static::DATA_STATE;

        // The buffered attribute name and value are copied onto the token before it
        // is emitted, when there is a name to copy.
        if ($attributeName) {
            $token->setAttribute($attributeName, $attributeValue);
        }

        $this->emitToken($token);
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # lowercase version of the current input character (add 0x0020 to the character's
        # code point), and its value to the empty string. Switch to the attribute name
        # state.

        // DEVIATION: The attribute name is collected in a buffer rather than on the token.
        $attributeName = strtolower($char);
        $attributeValue = '';
        $this->tokenizerState = static::ATTRIBUTE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute name, attribute value, or tag end');
$this->tokenizerState = static::DATA_STATE;
$this->data->unconsume();
    }
    # U+0022 QUOTATION MARK (")
    # "'" (U+0027)
    # "<" (U+003C)
    # "=" (U+003D)
    # Anything else
    else {
        # Quotes, less than sign, equals:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Start a new attribute in the current tag token. Set that attribute's name to the
        # current input character, and its value to the empty string. Switch to the
        # attribute name state.
        if (in_array($char, ['"', "'", '<', '='], true)) {
            ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute name, attribute value, or tag end');
        }

        $attributeName = $char;
        $attributeValue = '';
        $this->tokenizerState = static::ATTRIBUTE_NAME_STATE;
    }
}

# 8.2.4.37 Before attribute value state
elseif ($this->tokenizerState === static::BEFORE_ATTRIBUTE_VALUE_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if (in_array($char, ["\t", "\n", "\x0c", ' '], true)) {
        # Ignore the character.
        continue;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Switch to the attribute value (double-quoted) state.
        $this->tokenizerState = static::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
    }
    # U+0026 AMPERSAND (&)
    elseif ($char === '&') {
        # Switch to the attribute value (unquoted) state. Reconsume the current input
        # character.
        $this->tokenizerState = static::ATTRIBUTE_VALUE_UNQUOTED_STATE;
        $this->data->unconsume();
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Switch to the attribute value (single-quoted) state.
        $this->tokenizerState = static::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Switch to the data state. Emit the current tag token.
ParseError::trigger(ParseError::UNEXPECTED_TAG_END, $this->data, 'attribute value');
$this->tokenizerState = static::DATA_STATE;

// The buffered attribute name and value are copied onto the token before it is
// emitted, when there is a name to copy.
if ($attributeName) {
    $token->setAttribute($attributeName, $attributeValue);
}

$this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute value');
        $this->tokenizerState = static::DATA_STATE;
        $this->data->unconsume();
    }
    # "<" (U+003C)
    # "=" (U+003D)
    # "`" (U+0060)
    # Anything else
    else {
        # less than sign, equals, tick:
        # Parse error. Treat it as per the "anything else" entry below.
        # Anything else:
        # Append the current input character to the current attribute's value. Switch to
        # the attribute value (unquoted) state.
        if (in_array($char, ['<', '=', '`'], true)) {
            ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute value');
        }

        $attributeValue .= $char;
        $this->tokenizerState = static::ATTRIBUTE_VALUE_UNQUOTED_STATE;
    }
}

# 8.2.4.38 Attribute value (double-quoted) state
elseif ($this->tokenizerState === static::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        $this->tokenizerState = static::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
    }
    # U+0026 AMPERSAND (&)
    elseif ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being U+0022 QUOTATION MARK (").

        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.
// DEVIATION: This implementation does the character reference consuming in a // function for which it is more suited for. $attributeValue .= $this->data->consumeCharacterReference('"', true); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute value'); $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } # Anything else else { # Append the current input character to the current attribute's value. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $attributeValue .= $char.$this->data->consumeUntil('"&'); } } # 8.2.4.39 Attribute value (single-quoted) state elseif ($this->tokenizerState === static::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { # Consume the next input character $char = $this->data->consume(); # "'" (U+0027) if ($char === "'") { $this->tokenizerState = static::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; } # U+0026 AMPERSAND (&) elseif ($char === '&') { # Switch to the character reference in attribute value state, with the additional # allowed character being "'" (U+0027). # 8.2.4.41 Character reference in attribute value state: # Attempt to consume a character reference. # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current # attribute's value. # Otherwise, append the returned character tokens to the current attribute's # value. # Finally, switch back to the attribute value state that switched into this state. # DEVIATION: This implementation does the character reference consuming in a # function for which it is more suited for. $attributeValue .= $this->data->consumeCharacterReference("'", true); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Reconsume the EOF character. 
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute value');
        $this->tokenizerState = static::DATA_STATE;
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append the current input character to the current attribute's value.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $attributeValue .= $char.$this->data->consumeUntil("'&");
    }
}

# 8.2.4.40 Attribute value (unquoted) state
elseif ($this->tokenizerState === static::ATTRIBUTE_VALUE_UNQUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before attribute name state.

        // Whitespace ends an unquoted value, so whatever follows is a new attribute
        // (or the tag end). Going back to the before attribute *value* state here
        // would glue the next attribute's name onto this value, e.g. <a href=foo bar>
        // would produce href="foobar".
        $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE;
    }
    # U+0026 AMPERSAND (&)
    elseif ($char === '&') {
        # Switch to the character reference in attribute value state, with the additional
        # allowed character being ">" (U+003E).

        # 8.2.4.41 Character reference in attribute value state:
        # Attempt to consume a character reference.
        # If nothing is returned, append a U+0026 AMPERSAND character (&) to the current
        # attribute's value.
        # Otherwise, append the returned character tokens to the current attribute's
        # value.
        # Finally, switch back to the attribute value state that switched into this state.

        // DEVIATION: This implementation does the character reference consuming in a
        // function for which it is more suited for.
        $attributeValue .= $this->data->consumeCharacterReference('>', true);
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current tag token.
        $this->tokenizerState = static::DATA_STATE;

        // Need to add the current attribute name and value to the token if necessary.
if ($attributeName) { $token->setAttribute($attributeName, $attributeValue); } $this->emitToken($token); } # Parse error. Switch to the data state. Reconsume the EOF character. elseif ($char === '') { ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute value'); $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } # U+0022 QUOTATION MARK (") # "'" (U+0027) # "<" (U+003C) # "=" (U+003D) # "`" (U+0060) # Anything else else { # Quotes, less than sign, equals, tick: # Parse error. Treat it as per the "anything else" entry below. # Anything else: # Append the current input character to the current attribute's value. if ($char === '"' || $char === "'" || $char === '<' || $char === '=' || $char === '`') { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute value'); } // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $attributeValue .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`"); } } # 8.2.4.42 After attribute value (quoted) state elseif ($this->tokenizerState === static::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Switch to the before attribute name state. $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; } # "/" (U+002F) elseif ($char === '/') { # Switch to the self-closing start tag state. $this->tokenizerState = static::SELF_CLOSING_START_TAG_STATE; } # ">" (U+003E) elseif ($char === '>') { # Switch to the data state. Emit the current tag token. $this->tokenizerState = static::DATA_STATE; // Need to add the current attribute name and value to the token if necessary. 
if ($attributeName) { $token->setAttribute($attributeName, $attributeValue); } $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'attribute name or tag end'); $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } # Anything else else { # Parse error. Switch to the before attribute name state. Reconsume the character. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'attribute name or tag end'); $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; $this->data->unconsume(); } } # 8.2.4.43 Self-closing start tag state elseif ($this->tokenizerState === static::SELF_CLOSING_START_TAG_STATE) { # Consume the next input character $char = $this->data->consume(); # ">" (U+003E) if ($char === '>') { # Set the self-closing flag of the current tag token. Switch to the data state. # Emit the current tag token. $token->selfClosing = true; $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'tag end'); $this->tokenizerState = static::DATA_STATE; $this->data->unconsume(); } # Anything else else { # Parse error. Switch to the before attribute name state. Reconsume the character. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'tag end'); $this->tokenizerState = static::BEFORE_ATTRIBUTE_NAME_STATE; $this->data->unconsume(); } } # 8.2.4.44 Bogus comment state elseif ($this->tokenizerState === static::BOGUS_COMMENT_STATE) { # Consume every character up to and including the first ">" (U+003E) character or # the end of the file (EOF), whichever comes first. 
Emit a comment token whose # data is the concatenation of all the characters starting from and including the # character that caused the state machine to switch into the bogus comment state, # up to and including the character immediately before the last consumed character # (i.e. up to the character just before the U+003E or EOF character), but with any # U+0000 NULL characters replaced by U+FFFD REPLACEMENT CHARACTER characters. (If # the comment was started by the end of the file (EOF), the token is empty. # Similarly, the token is empty if it was generated by the string "".) $char = $char.$this->data->consumeUntil('>'); $nextChar = $this->data->consume(); $this->emitToken(new CommentToken($char)); # Switch to the data state. $this->tokenizerState = static::DATA_STATE; # If the end of the file was reached, reconsume the EOF character. if ($nextChar === '') { $this->data->unconsume(); } } # 8.2.4.45 Markup declaration open state elseif ($this->tokenizerState === static::MARKUP_DECLARATION_OPEN_STATE) { # If the next two characters are both "-" (U+002D) characters, consume those two # characters, create a comment token whose data is the empty string, and switch to # the comment start state. if ($this->data->peek(2) === '--') { $this->data->consume(2); $token = new CommentToken(); $this->tokenizerState = static::COMMENT_START_STATE; } # Otherwise, if the next seven characters are an ASCII case-insensitive match for # the word "DOCTYPE", then consume those characters and switch to the DOCTYPE # state. 
elseif (strtolower($this->data->peek(7)) === 'doctype') { $this->data->consume(7); $this->tokenizerState = static::DOCTYPE_STATE; } # Otherwise, if there is an adjusted current node and it is not an element in the # HTML namespace and the next seven characters are a case-sensitive match for the # string "[CDATA[" (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE # BRACKET character before and after), then consume those characters and switch to # the CDATA section state. else { $adjustedCurrentNode = $this->stack->adjustedCurrentNode; if ($adjustedCurrentNode && $adjustedCurrentNode->namespace !== static::HTML_NAMESPACE && $this->data->peek(7) === '[CDATA[') { $this->data->consume(7); $this->tokenizerState = static::CDATA_SECTION_STATE; } # Otherwise, this is a parse error. Switch to the bogus comment state. The next # character that is consumed, if any, is the first character that will be in the # comment. else { $char = $this->data->consume(); if ($char !== '') { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'markup declaration'); } else { ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'markup declaration'); } $this->tokenizerState = static::BOGUS_COMMENT_STATE; } } } # 8.2.4.46 Comment start state elseif ($this->tokenizerState === static::COMMENT_START_STATE) { # Consume the next input character $char = $this->data->consume(); # "-" (U+002D) if ($char === '-') { # Switch to the comment start dash state. $this->tokenizerState = static::COMMENT_START_DASH_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Switch to the data state. Emit the comment token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'comment'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # character. 
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment');
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append the current input character to the comment token's data. Switch to the
        # comment state.
        $token->data .= $char;
        $this->tokenizerState = static::COMMENT_STATE;
    }
}

# 8.2.4.47 Comment start dash state
elseif ($this->tokenizerState === static::COMMENT_START_DASH_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the comment end state.
        // A second dash right after the opening "<!--" dash means "--" has been seen,
        // so the only thing left to look for is the closing ">".
        $this->tokenizerState = static::COMMENT_END_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Switch to the data state. Emit the comment token.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'comment');
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment');
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append a "-" (U+002D) character and the current input character to the comment
        # token's data. Switch to the comment state.
        // The dash that led into this state was not stored yet, so it is written out
        // here together with the current character.
        $token->data .= '-'.$char;
        $this->tokenizerState = static::COMMENT_STATE;
    }
}

# 8.2.4.48 Comment state
elseif ($this->tokenizerState === static::COMMENT_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "-" (U+002D)
    if ($char === '-') {
        # Switch to the comment end dash state
        $this->tokenizerState = static::COMMENT_END_DASH_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF
        # character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Append the current input character to the comment token's data. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $token->data .= $char.$this->data->consumeUntil('-'); } } # 8.2.4.49 Comment end dash state elseif ($this->tokenizerState === static::COMMENT_END_DASH_STATE) { # Consume the next input character $char = $this->data->consume(); # "-" (U+002D) if ($char === '-') { # Switch to the comment end state $this->tokenizerState = static::COMMENT_END_STATE; } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Append a "-" (U+002D) character and the current input character to the comment # token's data. Switch to the comment state. $token->data .= '-'.$char; $this->tokenizerState = static::COMMENT_STATE; } } # 8.2.4.50 Comment end state elseif ($this->tokenizerState === static::COMMENT_END_STATE) { # Consume the next input character $char = $this->data->consume(); # ">" (U+003E) if ($char === '>') { # Switch to the data state. Emit the comment token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # "!" (U+0021) elseif ($char === '!') { # Parse error. Switch to the comment end bang state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '!', 'comment end'); $this->tokenizerState = static::COMMENT_END_BANG_STATE; } # "-" (U+002D) elseif ($char === '-') { # Parse error. Append a "-" (U+002D) character to the comment token's data. 
// OPTIMIZATION: Consume all '-' characters to prevent having to loop back through // here every single time. $char .= $this->data->consumeWhile('-'); for ($i = 0; $i < strlen($char); $i++) { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '-', 'comment end'); } $token->data .= $char; } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment end'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Append two "-" (U+002D) characters and the current input character # to the comment token's data. Switch to the comment state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'comment end'); $token->data .= '--'.$char; $this->tokenizerState = static::COMMENT_STATE; } } # 8.2.4.51 Comment end bang state elseif ($this->tokenizerState === static::COMMENT_END_BANG_STATE) { # Consume the next input character $char = $this->data->consume(); # "-" (U+002D) if ($char === '-') { # Append two "-" (U+002D) characters and a "!" (U+0021) character to the comment # token's data. Switch to the comment end dash state. $token->data .= '--!'; $this->tokenizerState = static::COMMENT_END_DASH_STATE; } # ">" (U+003E) elseif ($char === '>') { # Switch to the data state. Emit the comment token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Emit the comment token. Reconsume the EOF # character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'comment end'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Append two "-" (U+002D) characters, a "!" (U+0021) character, and the current # input character to the comment token's data. 
# Switch to the comment state.
        $token->data .= '--!'.$char;
        $this->tokenizerState = static::COMMENT_STATE;
    }
}

# 8.2.4.52 DOCTYPE state
elseif ($this->tokenizerState === static::DOCTYPE_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE name state.

        // No token is created here: every non-whitespace branch of the before DOCTYPE
        // name state creates the DOCTYPE token itself. Going through that state (rather
        // than straight to the DOCTYPE name state) is what allows more than one
        // whitespace character between "DOCTYPE" and the name.
        $this->tokenizerState = static::BEFORE_DOCTYPE_NAME_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE');
        $this->tokenizerState = static::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Parse error. Switch to the before DOCTYPE name state. Reconsume the character.

        // The reconsumed character is handled by the before DOCTYPE name state, which
        // creates the DOCTYPE token; jumping to the DOCTYPE name state would operate on
        // a token that does not exist yet.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE');
        $this->tokenizerState = static::BEFORE_DOCTYPE_NAME_STATE;
        $this->data->unconsume();
    }
}

# 8.2.4.53 Before DOCTYPE name state
elseif ($this->tokenizerState === static::BEFORE_DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        continue;
    }
    # Uppercase ASCII letter
    elseif (ctype_upper($char)) {
        # Create a new DOCTYPE token. Set the token's name to the lowercase version of the
        # current input character (add 0x0020 to the character's code point). Switch to
        # the DOCTYPE name state.

        // The state lives on the tokenizer ($this), not on the token.
        $token = new DOCTYPEToken(strtolower($char));
        $this->tokenizerState = static::DOCTYPE_NAME_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Create a new DOCTYPE token. Set its force-quirks flag to on. Switch
        # to the data state. Emit the token.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE');
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Create a new DOCTYPE token. Set its
        # force-quirks flag to on. Emit the token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE');
        $this->tokenizerState = static::DATA_STATE;
        $token = new DOCTYPEToken();
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Create a new DOCTYPE token. Set the token's name to the current input character.
        # Switch to the DOCTYPE name state.

        // The state lives on the tokenizer ($this), not on the token.
        $token = new DOCTYPEToken($char);
        $this->tokenizerState = static::DOCTYPE_NAME_STATE;
    }
}

# 8.2.4.54 DOCTYPE name state
elseif ($this->tokenizerState === static::DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the after DOCTYPE name state.
        $this->tokenizerState = static::AFTER_DOCTYPE_NAME_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # Uppercase ASCII letter
    elseif (ctype_alpha($char)) {
        # Append the lowercase version of the current input character (add 0x0020 to the
        # character's code point) to the current DOCTYPE token's name.

        // OPTIMIZATION: Will just check for alpha characters and strtolower the
        // characters.
// OPTIMIZATION: Consume all characters that are ASCII characters to prevent having
        // to loop back through here every single time.
        $token->name .= strtolower($char.$this->data->consumeWhile(static::CTYPE_ALPHA));
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE');
        $this->tokenizerState = static::DATA_STATE;
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's name.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->name .= $char.$this->data->consumeUntil("\t\n\x0c >".static::CTYPE_ALPHA);
    }
}

# 8.2.4.55 After DOCTYPE name state
elseif ($this->tokenizerState === static::AFTER_DOCTYPE_NAME_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        // The tokenizer is already in the after DOCTYPE name state, so the state is
        // left unchanged and the next character is looked at.
        continue;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE name'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # If the six characters starting from the current input character are an ASCII # case-insensitive match for the word "PUBLIC", then consume those characters and # switch to the after DOCTYPE public keyword state. // Simpler to just consume and then unconsume if they're not needed. $char .= $this->data->consume(5); if (strtolower($char) === 'public') { $this->tokenizerState = static::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE; } # Otherwise, if the six characters starting from the current input character are # an ASCII case-insensitive match for the word "SYSTEM", then consume those # characters and switch to the after DOCTYPE system keyword state. elseif (strtolower($char) === 'system') { $this->tokenizerState = static::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE; } # Otherwise, this is a parse error. Set the DOCTYPE token's force-quirks flag to # on. Switch to the bogus DOCTYPE state. else { // Need to unconsume what was consumed earlier. $this->data->unconsume(5); ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char[0], 'DOCTYPE name'); $token->forceQuirks = true; $this->tokenizerState = static::BOGUS_DOCTYPE_STATE; } } } # 8.2.4.56 After DOCTYPE public keyword state elseif ($this->tokenizerState === static::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Switch to the before DOCTYPE public identifier state. $this->tokenizerState = static::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; } # U+0022 QUOTATION MARK (") elseif ($char === '"') { # Parse error. 
Set the DOCTYPE token's public identifier to the empty string (not # missing), then switch to the DOCTYPE public identifier (double-quoted) state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '"', 'DOCTYPE public keyword'); $token->public = ''; $this->tokenizerState = static::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; } # "'" (U+0027) elseif ($char === "'") { # Parse error. Set the DOCTYPE token's public identifier to the empty string (not # missing), then switch to the DOCTYPE public identifier (single-quoted) state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, "'", 'DOCTYPE public keyword'); $token->public = ''; $this->tokenizerState = static::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE public keyword'); $token->forceQuirks = true; $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public keyword'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # bogus DOCTYPE state. 
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE public keyword'); $token->forceQuirks = true; $this->tokenizerState = static::BOGUS_DOCTYPE_STATE; } } # 8.2.4.57 Before DOCTYPE public identifier state elseif ($this->tokenizerState === static::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Ignore the character. continue; } # U+0022 QUOTATION MARK (") elseif ($char === '"') { # Set the DOCTYPE token's public identifier to the empty string (not missing), # then switch to the DOCTYPE public identifier (double-quoted) state. $token->public = ''; $this->tokenizerState = static::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; } # "'" (U+0027) elseif ($char === "'") { # Set the DOCTYPE token's public identifier to the empty string (not missing), # then switch to the DOCTYPE public identifier (single-quoted) state. $token->public = ''; $this->tokenizerState = static::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE public identifier'); $token->forceQuirks = true; $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public identifier'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Set the DOCTYPE token's force-quirks flag to on. 
# Switch to the bogus DOCTYPE state.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->tokenizerState = static::BOGUS_DOCTYPE_STATE;
    }
}

# 8.2.4.58 DOCTYPE public identifier (double-quoted) state
elseif ($this->tokenizerState === static::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # U+0022 QUOTATION MARK (")
    if ($char === '"') {
        # Switch to the after DOCTYPE public identifier state.
        $this->tokenizerState = static::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.

        // The identifier was cut off by the tag end, so the DOCTYPE must be flagged as
        // force-quirks before it is emitted.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public identifier');
        $this->tokenizerState = static::DATA_STATE;
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's public identifier.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->public .= $char.$this->data->consumeUntil('">');
    }
}

# 8.2.4.59 DOCTYPE public identifier (single-quoted) state
elseif ($this->tokenizerState === static::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "'" (U+0027)
    if ($char === "'") {
        # Switch to the after DOCTYPE public identifier state.
        $this->tokenizerState = static::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data
        # state. Emit that DOCTYPE token.

        // The identifier was cut off by the tag end, so the DOCTYPE must be flagged as
        // force-quirks before it is emitted.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public identifier');
        $this->tokenizerState = static::DATA_STATE;
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Append the current input character to the current DOCTYPE token's public identifier.

        // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
        // to loop back through here every single time.
        $token->public .= $char.$this->data->consumeUntil("'>");
    }
}

# 8.2.4.60 After DOCTYPE public identifier state
elseif ($this->tokenizerState === static::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the between DOCTYPE public and system identifiers state.
        $this->tokenizerState = static::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.
$token->system = '';
        $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.

        // The identifier belongs to the DOCTYPE token, not to the tokenizer; it has to
        // be initialised on $token so an empty identifier is distinguishable from a
        // missing one.
        $token->system = '';
        $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public identifier');
        $this->tokenizerState = static::DATA_STATE;
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->tokenizerState = static::BOGUS_DOCTYPE_STATE;
    }
}

# 8.2.4.61 Between DOCTYPE public and system identifiers state
elseif ($this->tokenizerState === static::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Ignore the character.
        continue;
    }
    # ">" (U+003E)
    elseif ($char === '>') {
        # Switch to the data state. Emit the current DOCTYPE token.
        $this->tokenizerState = static::DATA_STATE;
        $this->emitToken($token);
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (double-quoted) state.

        // The identifier belongs to the DOCTYPE token, not to the tokenizer.
        $token->system = '';
        $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
    }
    # "'" (U+0027)
    elseif ($char === "'") {
        # Set the DOCTYPE token's system identifier to the empty string (not missing),
        # then switch to the DOCTYPE system identifier (single-quoted) state.

        // The identifier belongs to the DOCTYPE token, not to the tokenizer.
        $token->system = '';
        $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
    }
    # EOF
    elseif ($char === '') {
        # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag
        # to on. Emit that DOCTYPE token. Reconsume the EOF character.
        ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE public identifier');
        $this->tokenizerState = static::DATA_STATE;
        $token->forceQuirks = true;
        $this->emitToken($token);
        $this->data->unconsume();
    }
    # Anything else
    else {
        # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the
        # bogus DOCTYPE state.
        ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE public identifier');
        $token->forceQuirks = true;
        $this->tokenizerState = static::BOGUS_DOCTYPE_STATE;
    }
}

# 8.2.4.62 After DOCTYPE system keyword state
elseif ($this->tokenizerState === static::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) {
    # Consume the next input character
    $char = $this->data->consume();

    # "tab" (U+0009)
    # "LF" (U+000A)
    # "FF" (U+000C)
    # U+0020 SPACE
    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        # Switch to the before DOCTYPE system identifier state.
        $this->tokenizerState = static::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
    }
    # U+0022 QUOTATION MARK (")
    elseif ($char === '"') {
        # Parse error. Set the DOCTYPE token's system identifier to the empty string (not
        # missing), then switch to the DOCTYPE system identifier (double-quoted) state.
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '"', 'DOCTYPE system keyword'); $token->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; } # "'" (U+0027) elseif ($char === "'") { # Parse error. Set the DOCTYPE token's system identifier to the empty string (not # missing), then switch to the DOCTYPE system identifier (single-quoted) state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, "'", 'DOCTYPE system keyword'); $token->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE system keyword'); $token->forceQuirks = true; $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE system keyword'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # bogus DOCTYPE state. 
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE system keyword'); $token->forceQuirks = true; $this->tokenizerState = static::BOGUS_DOCTYPE_STATE; } } # 8.2.4.63 Before DOCTYPE system identifier state elseif ($this->tokenizerState === static::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Ignore the character. continue; } # U+0022 QUOTATION MARK (") elseif ($char === '"') { # Set the DOCTYPE token's system identifier to the empty string (not missing), # then switch to the DOCTYPE system identifier (double-quoted) state. $token->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; } # "'" (U+0027) elseif ($char === "'") { # Set the DOCTYPE token's system identifier to the empty string (not missing), # then switch to the DOCTYPE system identifier (single-quoted) state. $token->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE system identifier'); $token->forceQuirks = true; $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Set the DOCTYPE token's force-quirks flag to on. 
Switch to the # bogus DOCTYPE state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE system identifier'); $token->forceQuirks = true; $this->tokenizerState = static::BOGUS_DOCTYPE_STATE; } } # 8.2.4.64 DOCTYPE system identifier (double-quoted) state elseif ($this->tokenizerState === static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) { # Consume the next input character $char = $this->data->consume(); # U+0022 QUOTATION MARK (") if ($char === '"') { # Switch to the after DOCTYPE system identifier state. $this->tokenizerState = static::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Append the current input character to the current DOCTYPE token's system identifier. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $token->system .= $char.$this->data->consumeUntil('">'); } } # 8.2.4.65 DOCTYPE system identifier (single-quoted) state elseif ($this->tokenizerState === static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) { # Consume the next input character $char = $this->data->consume(); # "'" (U+0027) if ($char === "'") { # Switch to the after DOCTYPE system identifier state. 
$this->tokenizerState = static::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; } # ">" (U+003E) elseif ($char === '>') { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the data # state. Emit that DOCTYPE token. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, '>', 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Append the current input character to the current DOCTYPE token's system identifier. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. $token->system .= $char.$this->data->consumeUntil("'>"); } } # 8.2.4.66 After DOCTYPE system identifier state elseif ($this->tokenizerState === static::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { # Consume the next input character $char = $this->data->consume(); # "tab" (U+0009) # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Switch to the between DOCTYPE system and system identifiers state. $this->tokenizerState = static::BETWEEN_DOCTYPE_SYSTEM_AND_SYSTEM_IDENTIFIERS_STATE; } # ">" (U+003E) elseif ($char === '>') { # Switch to the data state. Emit the current DOCTYPE token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # U+0022 QUOTATION MARK (") elseif ($char === '"') { # Set the DOCTYPE token's system identifier to the empty string (not missing), # then switch to the DOCTYPE system identifier (double-quoted) state. 
$this->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; } # "'" (U+0027) elseif ($char === "'") { # Set the DOCTYPE token's system identifier to the empty string (not missing), # then switch to the DOCTYPE system identifier (single-quoted) state. $this->system = ''; $this->tokenizerState = static::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; } # EOF elseif ($char === '') { # Parse error. Switch to the data state. Set the DOCTYPE token's force-quirks flag # to on. Emit that DOCTYPE token. Reconsume the EOF character. ParseError::trigger(ParseError::UNEXPECTED_EOF, $this->data, 'DOCTYPE system identifier'); $this->tokenizerState = static::DATA_STATE; $token->forceQuirks = true; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Parse error. Set the DOCTYPE token's force-quirks flag to on. Switch to the # bogus DOCTYPE state. ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $this->data, $char, 'DOCTYPE system identifier'); $token->forceQuirks = true; $this->tokenizerState = static::BOGUS_DOCTYPE_STATE; } } # 8.2.4.67 Bogus DOCTYPE state elseif ($this->tokenizerState === static::BOGUS_DOCTYPE_STATE) { # Consume the next input character $char = $this->data->consume(); # ">" (U+003E) if ($char === '>') { # Switch to the data state. Emit the DOCTYPE token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); } # EOF elseif ($char === '') { # Switch to the data state. Emit the DOCTYPE token. $this->tokenizerState = static::DATA_STATE; $this->emitToken($token); $this->data->unconsume(); } # Anything else else { # Ignore the character. continue; } } # 8.2.4.68 CDATA section state elseif ($this->tokenizerState === static::CDATA_SECTION_STATE) { # Switch to the data state. 
$this->tokenizerState = static::DATA_STATE;

                # Consume every character up to the next occurrence of the three character
                # sequence U+005D RIGHT SQUARE BRACKET U+005D RIGHT SQUARE BRACKET U+003E
                # GREATER-THAN SIGN (]]>), or the end of the file (EOF), whichever comes first.
                # Emit a series of character tokens consisting of all the characters consumed
                # except the matching three character sequence at the end (if one was found before
                # the end of the file).
                $char = '';
                while (true) {
                    $char .= $this->data->consumeUntil(']');
                    $peek = $this->data->peek(3);
                    $peeklen = strlen($peek);

                    if ($peek === ']]>') {
                        $this->data->consume(3);
                        $this->emitToken(new CharacterToken($char));
                        break;
                    } elseif ($peek === '') {
                        $this->emitToken(new CharacterToken($char));
                        # If the end of the file was reached, reconsume the EOF character.
                        $this->data->unconsume();
                        break;
                    } elseif ($peeklen < 3) {
                        $char .= $this->data->consume($peeklen);
                        $this->emitToken(new CharacterToken($char));
                        # If the end of the file was reached, reconsume the EOF character.
                        $this->data->unconsume();
                        break;
                    } else {
                        $char .= $this->data->consume();
                    }
                }
            }
        }
    }

    // Hands a token produced by the tokenizer to the tree construction stage.
    // Exactly one rule set is applied per call: the HTML content rules when the
    // conditions quoted from the spec below hold, the foreign content rules
    // otherwise.
    protected function emitToken(Token $token) {
        # 8.2.5 Tree construction
        #
        # As each token is emitted from the tokenizer, the user agent must follow the
        # appropriate steps from the following list:
        $currentNode = $this->stack->currentNode;

        # If there is no current node
        if (is_null($currentNode)) {
            $useHTMLRules = true;
        } else {
            $currentNodeName = $this->stack->currentNodeName;
            $currentNodeNamespace = $this->stack->currentNodeNamespace;

            $useHTMLRules = (
                # If the current node is an element in the HTML namespace
                $currentNodeNamespace === static::HTML_NAMESPACE
                || (
                    # If the current node is a MathML text integration point and the token is a
                    # start tag whose tag name is neither "mglyph" nor "malignmark"
                    # If the current node is a MathML text integration point and the token is a
                    # character token
                    DOM::isMathMLTextIntegrationPoint($currentNode) && (
                        (
                            $token instanceof StartTagToken
                            && $token->name !== 'mglyph'
                            && $token->name !== 'malignmark'
                        )
                        || $token instanceof CharacterToken
                    )
                )
                || (
                    # If the current node is an annotation-xml element in the MathML namespace and
                    # the token is a start tag whose tag name is "svg"
                    $currentNodeNamespace === static::MATHML_NAMESPACE
                    && $currentNodeName === 'annotation-xml'
                    && $token instanceof StartTagToken
                    && $token->name === 'svg'
                )
                || (
                    # If the current node is an HTML integration point and the token is a start tag
                    # If the current node is an HTML integration point and the token is a character
                    # token
                    DOM::isHTMLIntegrationPoint($currentNode) && (
                        $token instanceof StartTagToken || $token instanceof CharacterToken
                    )
                )
                # If the token is an end-of-file token
                || $token instanceof EOFToken
            );
        }

        if ($useHTMLRules) {
            # Process the token according to the rules given in the section corresponding to
            # the current insertion mode in HTML content.
            $this->parseTokenInHTMLContent($token);
        }
        # Otherwise
        else {
            # Process the token according to the rules given in the section for parsing
            # tokens in foreign content.
            // The foreign content rules dereference the current node, so they must only
            // run when the HTML rules were not selected above.
            $this->parseTokenInForeignContent($token);
        }

        // TEMPORARY: debugging scaffolding kept from the original. It dumps every
        // token to standard output and mirrors start/end tags onto the stack of open
        // elements; remove once tree construction is implemented.
        var_export($token);
        echo "\n\n";

        if ($token instanceof StartTagToken && !$token->selfClosing) {
            $this->stack[] = $token;
        } elseif ($token instanceof EndTagToken) {
            $this->stack->pop();
        }
    }

    // Processes a token using the rules for the given (or current) insertion mode
    // in HTML content. Not implemented yet: the body is intentionally empty, so
    // calling it currently has no effect.
    protected function parseTokenInHTMLContent(Token $token, $insertionMode = null) {
    }

    // Processes a token using the rules for parsing tokens in foreign content
    // (elements in the SVG and MathML namespaces). Reads the current node from the
    // stack of open elements, so the stack must not be empty when this is called.
    //
    // NOTE(review): every ParseError::trigger() call in the tokenizer passes
    // $this->data as its second argument, while the calls in this method do not.
    // The signature of trigger() is not visible from here -- confirm whether the
    // data stream is required and add it if so.
    protected function parseTokenInForeignContent(Token $token) {
        // Lowercased tag names that take the HTML-breakout path below.
        static $htmlBreakoutTags = [
            'b', 'big', 'blockquote', 'body', 'br', 'center', 'code', 'dd', 'div', 'dl',
            'dt', 'em', 'embed', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'i',
            'img', 'li', 'listing', 'menu', 'meta', 'nobr', 'ol', 'p', 'pre', 'ruby', 's',
            'small', 'span', 'strong', 'strike', 'sub', 'sup', 'table', 'tt', 'u', 'ul',
            'var',
        ];

        // Lowercased SVG element names => their mixed-case spelling.
        static $svgTagNames = [
            'altglyph'            => 'altGlyph',
            'altglyphdef'         => 'altGlyphDef',
            'altglyphitem'        => 'altGlyphItem',
            'animatecolor'        => 'animateColor',
            'animatemotion'       => 'animateMotion',
            'animatetransform'    => 'animateTransform',
            'clippath'            => 'clipPath',
            'feblend'             => 'feBlend',
            'fecolormatrix'       => 'feColorMatrix',
            'fecomponenttransfer' => 'feComponentTransfer',
            'fecomposite'         => 'feComposite',
            'feconvolvematrix'    => 'feConvolveMatrix',
            'fediffuselighting'   => 'feDiffuseLighting',
            'fedisplacementmap'   => 'feDisplacementMap',
            'fedistantlight'      => 'feDistantLight',
            'feflood'             => 'feFlood',
            'fefunca'             => 'feFuncA',
            'fefuncb'             => 'feFuncB',
            'fefuncg'             => 'feFuncG',
            'fefuncr'             => 'feFuncR',
            'fegaussianblur'      => 'feGaussianBlur',
            'feimage'             => 'feImage',
            'femerge'             => 'feMerge',
            'femergenode'         => 'feMergeNode',
            'femorphology'        => 'feMorphology',
            'feoffset'            => 'feOffset',
            'fepointlight'        => 'fePointLight',
            'fespecularlighting'  => 'feSpecularLighting',
            'fespotlight'         => 'feSpotLight',
            'fetile'              => 'feTile',
            'feturbulence'        => 'feTurbulence',
            'foreignobject'       => 'foreignObject',
            'glyphref'            => 'glyphRef',
            'lineargradient'      => 'linearGradient',
            'radialgradient'      => 'radialGradient',
            'textpath'            => 'textPath',
        ];

        // Lowercased SVG attribute names => their mixed-case spelling.
        static $svgAttributeNames = [
            'attributename'             => 'attributeName',
            'attributetype'             => 'attributeType',
            'basefrequency'             => 'baseFrequency',
            'baseprofile'               => 'baseProfile',
            'calcmode'                  => 'calcMode',
            'clippathunits'             => 'clipPathUnits',
            'contentscripttype'         => 'contentScriptType',
            'contentstyletype'          => 'contentStyleType',
            'diffuseconstant'           => 'diffuseConstant',
            'edgemode'                  => 'edgeMode',
            'externalresourcesrequired' => 'externalResourcesRequired',
            'filterres'                 => 'filterRes',
            'filterunits'               => 'filterUnits',
            'glyphref'                  => 'glyphRef',
            'gradienttransform'         => 'gradientTransform',
            'gradientunits'             => 'gradientUnits',
            'kernelmatrix'              => 'kernelMatrix',
            'kernelunitlength'          => 'kernelUnitLength',
            'keypoints'                 => 'keyPoints',
            'keysplines'                => 'keySplines',
            'keytimes'                  => 'keyTimes',
            'lengthadjust'              => 'lengthAdjust',
            'limitingconeangle'         => 'limitingConeAngle',
            'markerheight'              => 'markerHeight',
            'markerunits'               => 'markerUnits',
            'markerwidth'               => 'markerWidth',
            'maskcontentunits'          => 'maskContentUnits',
            'maskunits'                 => 'maskUnits',
            'numoctaves'                => 'numOctaves',
            'pathlength'                => 'pathLength',
            'patterncontentunits'       => 'patternContentUnits',
            'patterntransform'          => 'patternTransform',
            'patternunits'              => 'patternUnits',
            'pointsatx'                 => 'pointsAtX',
            'pointsaty'                 => 'pointsAtY',
            'pointsatz'                 => 'pointsAtZ',
            'preservealpha'             => 'preserveAlpha',
            'preserveaspectratio'       => 'preserveAspectRatio',
            'primitiveunits'            => 'primitiveUnits',
            'refx'                      => 'refX',
            'refy'                      => 'refY',
            'repeatcount'               => 'repeatCount',
            'repeatdur'                 => 'repeatDur',
            'requiredextensions'        => 'requiredExtensions',
            'requiredfeatures'          => 'requiredFeatures',
            'specularconstant'          => 'specularConstant',
            'specularexponent'          => 'specularExponent',
            'spreadmethod'              => 'spreadMethod',
            'startoffset'               => 'startOffset',
            'stddeviation'              => 'stdDeviation',
            'stitchtiles'               => 'stitchTiles',
            'surfacescale'              => 'surfaceScale',
            'systemlanguage'            => 'systemLanguage',
            'tablevalues'               => 'tableValues',
            'targetx'                   => 'targetX',
            'targety'                   => 'targetY',
            'textlength'                => 'textLength',
            'viewbox'                   => 'viewBox',
            'viewtarget'                => 'viewTarget',
            'xchannelselector'          => 'xChannelSelector',
            'ychannelselector'          => 'yChannelSelector',
            'zoomandpan'                => 'zoomAndPan',
        ];

        $currentNode = $this->stack->currentNode;
        $currentNodeName = $this->stack->currentNodeName;
        $currentNodeNamespace = $this->stack->currentNodeNamespace;

        # 8.2.5.5 The rules for parsing tokens in foreign content
        #
        # When the user agent is to apply the rules for parsing tokens in foreign
        # content, the user agent must handle the token as follows:
        #
        if ($token instanceof CharacterToken) {
            # A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A),
            # "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE
            # Any other character token
            // OPTIMIZATION: Will check for multiple space characters at once as character
            // tokens can contain more than one character.
            if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
                # Set the frameset-ok flag to "not ok".
                $this->framesetOk = false;
            }

            # Insert the token's character into the current node.
            $currentNode->appendChild($this->DOM->createTextNode($token->data));
        }
        # A comment token
        elseif ($token instanceof CommentToken) {
            # Append a Comment node to the current node with the data attribute set to the
            # data given in the comment token.
            $currentNode->appendChild($this->DOM->createComment($token->data));
        }
        # A DOCTYPE token
        elseif ($token instanceof DOCTYPEToken) {
            # Parse error. Ignore the token.
            ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'Character, Comment, Start Tag, or End Tag');
        }
        elseif ($token instanceof StartTagToken) {
            # A start tag whose tag name is one of: "b", "big", "blockquote", "body", "br",
            # "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3",
            # "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta",
            # "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strong", "strike",
            # "sub", "sup", "table", "tt", "u", "ul", "var"
            # A start tag whose tag name is "font", if the token has any attributes named
            # "color", "face", or "size"
            $isHTMLBreakout = in_array($token->name, $htmlBreakoutTags, true) || (
                $token->name === 'font' && (
                    $token->hasAttribute('color')
                    || $token->hasAttribute('face')
                    || $token->hasAttribute('size')
                )
            );

            if ($isHTMLBreakout) {
                # Parse error.
                ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML start tag');

                # Pop an element from the stack of open elements, and then keep popping more
                # elements from the stack of open elements until the current node is a MathML
                # text integration point, an HTML integration point, or an element in the HTML
                # namespace.
                do {
                    $popped = $this->stack->pop();
                    $newCurrent = $this->stack->currentNode;
                    // Stop as soon as the stack runs dry so a null current node is never
                    // inspected.
                } while (
                    !is_null($popped)
                    && !is_null($newCurrent)
                    && !DOM::isMathMLTextIntegrationPoint($newCurrent)
                    && !DOM::isHTMLIntegrationPoint($newCurrent)
                    && $newCurrent->namespaceURI !== static::HTML_NAMESPACE
                );

                # Then, reprocess the token.
                $this->emitToken($token);
            }
            # Any other start tag
            else {
                $tokenName = $token->name;

                # If the current node is an element in the SVG namespace, and the token's tag
                # name is one of the ones in the first column of the following table, change the
                # tag name to the name given in the corresponding cell in the second column.
                # (This fixes the case of SVG elements that are not all lowercase.)
                if ($currentNode->namespaceURI === static::SVG_NAMESPACE) {
                    $tokenName = $svgTagNames[$tokenName] ?? $tokenName;
                }

                $node = $this->DOM->createElementNS($currentNodeNamespace, $tokenName);

                foreach ($token->attributes as $name => $value) {
                    // PHP turns numeric-string array keys into integers; cast back so the
                    // comparisons below are string comparisons only.
                    $name = (string) $name;

                    # If the current node is an element in the MathML namespace, adjust MathML
                    # attributes for the token. (This fixes the case of MathML attributes that are
                    # not all lowercase.)
                    if ($currentNodeNamespace === static::MATHML_NAMESPACE && $name === 'definitionurl') {
                        $name = 'definitionURL';
                    }
                    # If the current node is an element in the SVG namespace, adjust SVG attributes
                    # for the token. (This fixes the case of SVG attributes that are not all
                    # lowercase.)
                    elseif ($currentNodeNamespace === static::SVG_NAMESPACE) {
                        $name = $svgAttributeNames[$name] ?? $name;
                    }

                    # Adjust foreign attributes for the token. (This fixes the use of namespaced
                    # attributes, in particular XLink in SVG.)
                    # When the steps below require the user agent to adjust foreign attributes for a
                    # token, then, if any of the attributes on the token match the strings given in
                    # the first column of the following table, let the attribute be a namespaced
                    # attribute, with the prefix being the string given in the corresponding cell in
                    # the second column, the local name being the string given in the corresponding
                    # cell in the third column, and the namespace being the namespace given in the
                    # corresponding cell in the fourth column. (This fixes the use of namespaced
                    # attributes, in particular lang attributes in the XML namespace.)
                    switch ($name) {
                        case 'xlink:actuate':
                        case 'xlink:arcrole':
                        case 'xlink:href':
                        case 'xlink:role':
                        case 'xlink:show':
                        case 'xlink:title':
                        case 'xlink:type':
                            $node->setAttributeNS(static::XLINK_NAMESPACE, $name, $value);
                            break;
                        case 'xml:base':
                        case 'xml:lang':
                        case 'xml:space':
                            $node->setAttributeNS(static::XML_NAMESPACE, $name, $value);
                            break;
                        case 'xmlns':
                            $node->setAttributeNS(static::XMLNS_NAMESPACE, $name, $value);

                            # If the newly created element has an xmlns attribute in the XMLNS namespace
                            # whose value is not exactly the same as the element's namespace, that is a
                            # parse error.
                            if ($value !== $node->namespaceURI) {
                                ParseError::trigger(ParseError::INVALID_XMLNS_ATTRIBUTE_VALUE, $node->namespaceURI);
                            }
                            break;
                        case 'xmlns:xlink':
                            $node->setAttributeNS(static::XMLNS_NAMESPACE, $name, $value);

                            # Similarly, if the newly created element has an xmlns:xlink attribute in the
                            # XMLNS namespace whose value is not the XLink Namespace, that is a parse error.
                            if ($value !== static::XLINK_NAMESPACE) {
                                ParseError::trigger(ParseError::INVALID_XMLNS_ATTRIBUTE_VALUE, static::XLINK_NAMESPACE);
                            }
                            break;
                        default:
                            $node->setAttribute($name, $value);
                    }
                }

                # Insert a foreign element for the token, in the same namespace as the current
                # node.
                # When the steps below require the UA to insert a foreign element for a token,
                # the UA must first create an element for the token in the given namespace, and
                # then append this node to the current node, and push it onto the stack of open
                # elements so that it is the new current node.
                $currentNode->appendChild($node);

                # If the token has its self-closing flag set, pop the current node off the stack
                # of open elements and acknowledge the token's self-closing flag.
                // OPTIMIZATION: Not adding it to the stack unless it's not self-closing.
                if (!$token->selfClosing) {
                    $this->stack[] = $node;
                }
            }
        }
        # An end tag whose tag name is "script", if the current node is a script element
        # in the SVG namespace
        // DEVIATION: This implementation does not support scripting, so script elements
        // aren't processed differently.
        # Any other end tag
        elseif ($token instanceof EndTagToken) {
            # Run these steps:
            #
            # 1. Initialize node to be the current node (the bottommost node of the stack).
            $node = $currentNode;
            $nodeName = $currentNodeName;

            # 2. If node is not an element with the same tag name as the token, then this is
            # a parse error.
            if ($nodeName !== $token->name) {
                ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, $nodeName);
            }

            # 3. Loop: If node's tag name, converted to ASCII lowercase, is the same as the
            # tag name of the token, pop elements from the stack of open elements until node
            # has been popped from the stack, and then abort these steps.
            $index = $this->stack->length - 1;
            while (true) {
                if (strtolower($nodeName) === $token->name) {
                    do {
                        $popped = $this->stack->pop();
                    } while ($popped !== $node && !is_null($popped));

                    break;
                }

                // Node is already the topmost entry: there is no previous entry to move to,
                // so stop rather than index the stack out of bounds.
                if ($index <= 0) {
                    break;
                }

                # 4. Set node to the previous entry in the stack of open elements.
                $node = $this->stack[--$index];
                $nodeName = $node->nodeName;

                # 5. If node is not an element in the HTML namespace, return to the step labeled
                # loop.
                if ($node->namespaceURI !== static::HTML_NAMESPACE) {
                    continue;
                }

                # 6. Otherwise, process the token according to the rules given in the section
                # corresponding to the current insertion mode in HTML content.
                $this->parseTokenInHTMLContent($token, $this->insertionMode);
                break;
            }
        }
    }
}