HTML-Parser/lib/TreeBuilder.php
J. King 3c7a76bce1 Retrofit tree builder for new error emitter
Also fixed a number of undefined variable errors and erroneous
non-root namespace references
2019-12-12 18:10:01 -05:00

2353 lines
135 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare(strict_types=1);
namespace dW\HTML5;
class TreeBuilder {
use ParseErrorEmitter;
// The list of active formatting elements, used when elements are improperly nested
protected $activeFormattingElementsList;
// The DOMDocument that is assembled by this class
protected $DOM;
// The form element pointer points to the last form element that was opened and
// whose end tag has not yet been seen. It is used to make form controls associate
// with forms in the face of dramatically bad markup, for historical reasons. It is
// ignored inside template elements
protected $formElement;
// Flag for determining whether to use the foster parenting (badly nested table
// elements) algorithm.
protected $fosterParenting = false;
// Flag that shows whether the content that's being parsed is a fragment or not
protected $fragmentCase;
// Context element for fragments
protected $fragmentContext;
// Flag used to determine whether elements are okay to be used in framesets or not
protected $framesetOk = true;
// Once a head element has been parsed (whether implicitly or explicitly) the head
// element pointer gets set to point to this node
protected $headElement;
// Treebuilder insertion mode
protected $insertionMode;
// When the insertion mode is switched to "text" or "in table text", the
// original insertion mode is also set. This is the insertion mode to which the
// tree construction stage will return.
protected $originalInsertionMode;
// The stack of open elements, uses Stack
protected $stack;
// Instance of the Tokenizer class used for creating tokens
protected $tokenizer;
// Used to check if the document is in quirks mode
protected $quirksMode;
// Used to store the template insertion modes
protected $templateInsertionModes;
// Used for debugging to print out information as the tree is built.
public static $debug = false;
// Instance used with the static token insertion methods.
protected static $instance;
// Constants used for insertion modes
const INITIAL_MODE = 0;
const BEFORE_HTML_MODE = 1;
const BEFORE_HEAD_MODE = 2;
const IN_HEAD_MODE = 3;
const IN_HEAD_NOSCRIPT_MODE = 4;
const AFTER_HEAD_MODE = 5;
const IN_BODY_MODE = 6;
const TEXT_MODE = 7;
const IN_TABLE_MODE = 8;
const IN_TABLE_TEXT_MODE = 9;
const IN_CAPTION_MODE = 10;
const IN_COLUMN_GROUP_MODE = 11;
const IN_TABLE_BODY_MODE = 12;
const IN_ROW_MODE = 13;
const IN_CELL_MODE = 14;
const IN_SELECT_MODE = 15;
const IN_SELECT_IN_TABLE_MODE = 16;
const IN_TEMPLATE_MODE = 17;
const AFTER_BODY_MODE = 18;
const IN_FRAMESET_MODE = 19;
const AFTER_FRAMESET_MODE = 20;
const AFTER_AFTER_BODY_MODE = 21;
const AFTER_AFTER_FRAMESET_MODE = 22;
// Quirks mode constants
const QUIRKS_MODE_OFF = 0;
const QUIRKS_MODE_ON = 1;
const QUIRKS_MODE_LIMITED = 2;
public function __construct(Document $dom, $formElement, bool $fragmentCase = false, $fragmentContext = null, OpenElementsStack $stack, Stack $templateInsertionModes, Tokenizer $tokenizer, ParseError $errorHandler, Data $data) {
// If the form element isn't an instance of DOMElement that has a node name of
// "form" or null then there's a problem.
if (!is_null($formElement) && !($formElement instanceof \DOMElement && $formElement->nodeName === 'form')) {
throw new Exception(Exception::TREEBUILDER_FORMELEMENT_EXPECTED, gettype($formElement));
}
// If the fragment context is not null and is not a document fragment, document,
// or element then we have a problem. Additionally, if the parser is created for
// parsing a fragment and the fragment context is null then we have a problem,
// too.
if ((!is_null($fragmentContext) && !$fragmentContext instanceof \DOMDocumentFragment && !$fragmentContext instanceof \DOMDocument && !$fragmentContext instanceof \DOMElement) ||
(is_null($fragmentContext) && $fragmentCase)) {
throw new Exception(Exception::TREEBUILDER_DOCUMENTFRAG_ELEMENT_DOCUMENT_DOCUMENTFRAG_EXPECTED, gettype($fragmentContext));
}
$this->DOM = $dom;
$this->formElement = $formElement;
$this->fragmentCase = $fragmentCase;
$this->fragmentContext = $fragmentContext;
$this->stack = $stack;
$this->templateInsertionModes = $templateInsertionModes;
$this->tokenizer = $tokenizer;
$this->data = $data;
$this->errorHandler = $errorHandler;
// Initialize the list of active formatting elements.
$this->activeFormattingElementsList = new ActiveFormattingElementsList($stack);
$this->insertionMode = self::INITIAL_MODE;
$this->quirksMode = self::QUIRKS_MODE_OFF;
static::$instance = $this;
}
public function __destruct() {
static::$instance = null;
}
public function emitToken(Token $token) {
// Loop used for reprocessing.
while (true) {
$adjustedCurrentNode = $this->stack->adjustedCurrentNode;
$adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName;
$adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace;
if (self::$debug) {
echo "Node: $adjustedCurrentNodeName\n";
echo "\nToken: \n";
var_export($token);
echo "\n\n";
}
# 8.2.5 Tree construction
#
# As each token is emitted from the tokenizer, the user agent must follow the
# appropriate steps from the following list, known as the tree construction dispatcher:
#
# If the stack of open elements is empty
if ($this->stack->length === 0 ||
# If the adjusted current node is an element in the HTML namespace
// PHP's DOM returns null when the namespace isn't specified... eg. HTML.
is_null($adjustedCurrentNodeNamespace) || (
# If the adjusted current node is a MathML text integration point and the token is a
# start tag whose tag name is neither "mglyph" nor "malignmark"
# If the adjusted current node is a MathML text integration point and the token is a
# character token
$adjustedCurrentNode->isMathMLTextIntegrationPoint() && ((
$token instanceof StartTagToken && (
$token->name !== 'mglyph' && $token->name !== 'malignmark'
) ||
$token instanceof CharacterToken
)
)
) || (
# If the adjusted current node is an annotation-xml element in the MathML namespace and
# the token is a start tag whose tag name is "svg"
$adjustedCurrentNodeNamespace === Parser::MATHML_NAMESPACE &&
$adjustedCurrentNodeName === 'annotation-xml' &&
$token instanceof StartTagToken &&
$token->name === 'svg'
) || (
# If the adjusted current node is an HTML integration point and the token is a start tag
# If the adjusted current node is an HTML integration point and the token is a character
# token
$adjustedCurrentNode->isHTMLIntegrationPoint() && (
$token instanceof StartTagToken || $token instanceof CharacterToken
)
) ||
# If the token is an end-of-file token
$token instanceof EOFToken) {
# Process the token according to the rules given in the section corresponding to
# the current insertion mode in HTML content.
$this->parseTokenInHTMLContent($token);
}
# Otherwise
else {
# Process the token according to the rules given in the section for parsing
# tokens in foreign content.
// Returns false when needing to reprocess.
if ($this->parseTokenInForeignContent($token) === false) {
continue;
}
}
break;
}
}
protected function parseTokenInHTMLContent(Token $token, int $insertionMode = null) {
$insertionMode = (is_null($insertionMode)) ? $this->insertionMode : $insertionMode;
// Loop used when processing the token under different rules; always breaks.
while (true) {
if (self::$debug) {
switch ($insertionMode) {
case self::INITIAL_MODE: $mode = "Initial";
break;
case self::BEFORE_HTML_MODE: $mode = "Before html";
break;
case self::BEFORE_HEAD_MODE: $mode = "Before head";
break;
case self::IN_HEAD_MODE: $mode = "In head";
break;
case self::IN_HEAD_NOSCRIPT_MODE: $mode = "In head noscript";
break;
case self::AFTER_HEAD_MODE: $mode = "After head";
break;
case self::IN_BODY_MODE: $mode = "In body";
break;
case self::TEXT_MODE: $mode = "Text";
break;
case self::IN_TABLE_MODE: $mode = "In table";
break;
case self::IN_TABLE_TEXT_MODE: $mode = "In table text";
break;
case self::IN_CAPTION_MODE: $mode = "In caption";
break;
case self::IN_COLUMN_GROUP_MODE: $mode = "In column group";
break;
case self::IN_TABLE_BODY_MODE: $mode = "In table body";
break;
case self::IN_ROW_MODE: $mode = "In row";
break;
case self::IN_CELL_MODE: $mode = "In cell";
break;
case self::IN_SELECT_MODE: $mode = "In select";
break;
case self::IN_SELECT_IN_TABLE_MODE: $mode = "In select in table";
break;
case self::IN_TEMPLATE_MODE: $mode = "In template mode";
break;
case self::AFTER_BODY_MODE: $mode = "After body";
break;
case self::IN_FRAMESET_MODE: $mode = "In frameset";
break;
case self::AFTER_FRAMESET_MODE: $mode = "After frameset";
break;
case self::AFTER_AFTER_BODY_MODE: $mode = "After after body";
break;
case self::AFTER_AFTER_FRAMESET_MODE: $mode = "After after frameset";
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
echo "Mode: $mode\n";
}
# 8.2.5.4. The rules for parsing tokens in HTML content
switch ($insertionMode) {
# 8.2.5.4.1. The "initial" insertion mode
case self::INITIAL_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment as the last child of the Document object.
// DEVIATION: PHP's DOM cannot have comments before the DOCTYPE, so just going
// to ignore them instead.
//$this->insertCommentToken($token, $this->DOM);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# If the DOCTYPE tokens name is not a case-sensitive match for the string
# "html", or the tokens public identifier is not missing, or the tokens system
# identifier is neither missing nor a case-sensitive match for the string
# "about:legacy-compat", then there is a parse error.
if ($token->name !== 'html' || $token->public !== '' || ($token->system !== '' && $token->system !== 'about:legacy-compat')) {
$this->error(ParseError::INVALID_DOCTYPE);
}
# Append a DocumentType node to the Document node, with the name attribute set
# to the name given in the DOCTYPE token, or the empty string if the name was
# missing; the publicId attribute set to the public identifier given in the
# DOCTYPE token, or the empty string if the public identifier was missing; the
# systemId attribute set to the system identifier given in the DOCTYPE token, or
# the empty string if the system identifier was missing; and the other
# attributes specific to DocumentType objects set to null and empty lists as
# appropriate. Associate the DocumentType node with the Document object so that
# it is returned as the value of the doctype attribute of the Document object.
$this->DOM->appendChild($this->DOM->implementation->createDocumentType((!is_null($token->name)) ? $token->name : 'html', $token->public, $token->system));
$public = strtolower($token->public);
# Then, if the document is not an iframe srcdoc document, and the DOCTYPE token
# matches one of the conditions in the following list, then set the Document to
# quirks mode:
// DEVIATION: This implementation does not render, so there is no nested
// browsing contexts to consider.
if ($token->forceQuirks === true || $token->name !== 'html' ||
$public === '-//w3o//dtd w3 html strict 3.0//en//' ||
$public === '-/w3c/dtd html 4.0 transitional/en' ||
$public === 'html' ||
strtolower($token->system) === 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' ||
strpos($public, '+//silmaril//dtd html pro v0r11 19970101//') === 0 ||
strpos($public, '-//as//dtd html 3.0 aswedit + extensions//') === 0 ||
strpos($public, '+//silmaril//dtd html pro v0r11 19970101//') === 0 ||
strpos($public, '-//as//dtd html 3.0 aswedit + extensions//') === 0 ||
strpos($public, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0 level 1//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0 level 2//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0 strict level 1//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0 strict level 2//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0 strict//') === 0 ||
strpos($public, '-//ietf//dtd html 2.0//') === 0 ||
strpos($public, '-//ietf//dtd html 2.1e//') === 0 ||
strpos($public, '-//ietf//dtd html 3.0//') === 0 ||
strpos($public, '-//ietf//dtd html 3.2 final//') === 0 ||
strpos($public, '-//ietf//dtd html 3.2//') === 0 ||
strpos($public, '-//ietf//dtd html 3//') === 0 ||
strpos($public, '-//ietf//dtd html level 0//') === 0 ||
strpos($public, '-//ietf//dtd html level 1//') === 0 ||
strpos($public, '-//ietf//dtd html level 2//') === 0 ||
strpos($public, '-//ietf//dtd html level 3//') === 0 ||
strpos($public, '-//ietf//dtd html strict level 0//') === 0 ||
strpos($public, '-//ietf//dtd html strict level 1//') === 0 ||
strpos($public, '-//ietf//dtd html strict level 2//') === 0 ||
strpos($public, '-//ietf//dtd html strict level 3//') === 0 ||
strpos($public, '-//ietf//dtd html strict//') === 0 ||
strpos($public, '-//ietf//dtd html//') === 0 ||
strpos($public, '-//metrius//dtd metrius presentational//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 2.0 html strict//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 2.0 html//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 2.0 tables//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 3.0 html strict//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 3.0 html//') === 0 ||
strpos($public, '-//microsoft//dtd internet explorer 3.0 tables//') === 0 ||
strpos($public, '-//netscape comm. corp.//dtd html//') === 0 ||
strpos($public, '-//netscape comm. corp.//dtd strict html//') === 0 ||
strpos($public, '-//o\'reilly and associates//dtd html 2.0//') === 0 ||
strpos($public, '-//o\'reilly and associates//dtd html extended 1.0//') === 0 ||
strpos($public, '-//o\'reilly and associates//dtd html extended relaxed 1.0//') === 0 ||
strpos($public, '-//sq//dtd html 2.0 hotmetal + extensions//') === 0 ||
strpos($public, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//') === 0 ||
strpos($public, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//') === 0 ||
strpos($public, '-//spyglass//dtd html 2.0 extended//') === 0 ||
strpos($public, '-//sun microsystems corp.//dtd hotjava html//') === 0 ||
strpos($public, '-//sun microsystems corp.//dtd hotjava strict html//') === 0 ||
strpos($public, '-//w3c//dtd html 3 1995-03-24//') === 0 ||
strpos($public, '-//w3c//dtd html 3.2 draft//') === 0 ||
strpos($public, '-//w3c//dtd html 3.2 final//') === 0 ||
strpos($public, '-//w3c//dtd html 3.2//') === 0 ||
strpos($public, '-//w3c//dtd html 3.2s draft//') === 0 ||
strpos($public, '-//w3c//dtd html 4.0 frameset//') === 0 ||
strpos($public, '-//w3c//dtd html 4.0 transitional//') === 0 ||
strpos($public, '-//w3c//dtd html experimental 19960712//') === 0 ||
strpos($public, '-//w3c//dtd html experimental 970421//') === 0 ||
strpos($public, '-//w3c//dtd w3 html//') === 0 ||
strpos($public, '-//w3o//dtd w3 html 3.0//') === 0 ||
strpos($public, '-//webtechs//dtd mozilla html 2.0//') === 0 ||
strpos($public, '-//webtechs//dtd mozilla html//') === 0 ||
(is_null($token->system) &&
(strpos($public, '-//w3c//dtd html 4.01 frameset//') === 0 ||
strpos($public, '-//w3c//dtd html 4.01 transitional//') === 0))) {
$this->quirksMode = self::QUIRKS_MODE_ON;
}
# Otherwise, if the document is not an iframe srcdoc document, and the DOCTYPE
# token matches one of the conditions in the following list, then set the
# Document to limited-quirks mode:
// DEVIATION: There is no iframe srcdoc document because there are no nested
// browsing contexts in this implementation.
else {
if (strpos($public, '-//w3c//dtd xhtml 1.0 frameset//') === 0 ||
strpos($public, '-//w3c//dtd xhtml 1.0 transitional//') === 0 ||
(!is_null($token->system) &&
(strpos($public, '-//w3c//dtd html 4.01 frameset//') === 0 ||
strpos($public, '-//w3c//dtd html 4.01 transitional//') === 0))) {
$this->quirksMode = self::QUIRKS_MODE_LIMITED;
}
}
# The system identifier and public identifier strings must be compared to the
# values given in the lists above in an ASCII case-insensitive manner. A system
# identifier whose value is the empty string is not considered missing for the
# purposes of the conditions above.
# Then, switch the insertion mode to "before html".
$this->insertionMode = self::BEFORE_HTML_MODE;
}
# Anything else
else {
# If the document is not an iframe srcdoc document, then this is a parse error;
# set the Document to quirks mode.
// DEVIATION: There is no iframe srcdoc document because there are no nested
// browsing contexts in this implementation.
switch (get_class($token)) {
case 'StartTagToken': $this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
break;
case 'EndTagToken': $this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
break;
case 'EOFToken': $this->error(ParseError::UNEXPECTED_EOF);
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
$this->quirksMode = self::QUIRKS_MODE_ON;
# In any case, switch the insertion mode to "before html", then reprocess the
# token.
$this->insertionMode = self::BEFORE_HTML_MODE;
$insertionMode = self::BEFORE_HTML_MODE;
continue 2;
}
break;
# 8.2.5.4.2. The "before html" insertion mode
case self::BEFORE_HTML_MODE:
# A DOCTYPE token
if ($token instanceof DOCTYPEToken) {
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment as the last child of the Document object.
$this->insertCommentToken($token, $this->DOM);
}
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A start tag whose tag name is "html"
elseif ($token instanceof StartTagToken && $token->name === 'html') {
# Create an element for the token in the HTML namespace, with the Document as
# the intended parent. Append it to the Document object. Put this element in the
# stack of open elements.
$element = static::insertStartTagToken($token, $this->DOM);
# Switch the insertion mode to "before head".
$this->insertionMode = self::BEFORE_HEAD_MODE;
}
# Any other end tag
elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name !== 'br') {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# An end tag whose tag name is one of: "head", "body", "html", "br"
# Anything else
else {
# Create an html element whose node document is the Document object. Append it
# to the Document object. Put this element in the stack of open elements.
$element = $this->DOM->createElement('html');
$this->DOM->appendChild($element);
$this->stack[] = $element;
# Switch the insertion mode to "before head", then reprocess the token.
$this->insertionMode = self::BEFORE_HEAD_MODE;
$insertionMode = self::BEFORE_HEAD_MODE;
continue 2;
}
# The document element can end up being removed from the Document object, e.g.,
# by scripts; nothing in particular happens in such cases, content continues
# being appended to the nodes as described in the next section.
// Good to know. There's no scripting in this implementation, though.
break;
# 8.2.5.4.3. The "before head" insertion mode
case self::BEFORE_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A comment token
elseif ($token instanceof CommentToken) {
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Insert an HTML element for the token.
$element = static::insertStartTagToken($token);
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
}
}
# Any other end tag
elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name === 'br') {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# An end tag whose tag name is one of: "head", "body", "html", "br"
# Anything else
else {
# Insert an HTML element for a "head" start tag token with no attributes.
$element = static::insertStartTagToken(new StartTagToken('head'));
# Set the head element pointer to the newly created head element.
$this->headElement = $element;
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the current token.
continue 2;
}
break;
# 8.2.5.4.4. The "in head" insertion mode
case self::IN_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character.
$this->insertCharacterToken($token);
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is one of: "base", "basefont", "bgsound", "link"
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link') {
# Insert an HTML element for the token. Immediately pop the current node off the
# stack of open elements.
static::insertStartTagToken($token);
$this->stack->pop();
# Acknowledge the tokens *self-closing flag*, if it is set.
// Acknowledged.
}
# A start tag whose tag name is "meta"
elseif ($token->name === 'meta') {
# Insert an HTML element for the token. Immediately pop the current node off the
# stack of open elements.
static::insertStartTagToken($token);
$this->stack->pop();
# Acknowledge the tokens *self-closing flag*, if it is set.
// Acknowledged.
# If the element has a charset attribute, and getting an encoding from its value
# results in an encoding, and the confidence is currently tentative, then change
# the encoding to the resulting encoding.
#
# Otherwise, if the element has an http-equiv attribute whose value is an ASCII
# case-insensitive match for the string "Content-Type", and the element has a
# content attribute, and applying the algorithm for extracting a character
# encoding from a meta element to that attributes value returns an encoding,
# and the confidence is currently tentative, then change the encoding to the
# extracted encoding.
// DEVIATION: FIXME: This implementation currently only supports UTF-8.
}
# A start tag whose tag name is "title"
elseif ($token->name === 'title') {
# Follow the generic RCDATA element parsing algorithm.
$this->parseGenericRCDATA($token);
}
# A start tag whose tag name is "noscript", if the scripting flag is enabled
# A start tag whose tag name is one of: "noframes", "style"
// DEVIATION: There is no scripting in this implementation, so the scripting
// flag is always disabled.
elseif ($token->name === 'noframes' || $token->name === 'style') {
# Follow the generic raw text element parsing algorithm.
$this->parseGenericRawText($token);
}
# A start tag whose tag name is "noscript", if the scripting flag is disabled
// DEVIATION: There is no scripting in this implementation, so the scripting
// flag is always disabled.
elseif ($token->name === 'noscript') {
# Insert an HTML element for the token.
static::insertStartTagToken($token);
# Switch the insertion mode to "in head noscript".
$this->insertionMode = self::IN_HEAD_NOSCRIPT_MODE;
}
# A start tag whose tag name is "script"
elseif ($token->name === 'script') {
# Run these steps:
# 1. Let the adjusted insertion location be the appropriate place for inserting
# a node.
# 2. Create an element for the token in the HTML namespace, with the intended
# parent being the element in which the adjusted insertion location finds
# itself.
// DEVIATION: Because there is no scripting in this implementation, there is no
// need to get the adjusted insertion location as the intended parent as the
// intended parent isn't used when determining anything;
// Parser::createAndInsertElement will get the adjusted insertion location
// anyway.
static::insertStartTagToken($token);
# 3. Mark the element as being "parser-inserted" and unset the elements
# "non-blocking" flag.
# 4. Mark the element as being "parser-inserted" and unset the elements
# "non-blocking" flag.
// DEVIATION: No scripting.
# 5. Insert the newly created element at the adjusted insertion location.
// Done.
# 6. Push the element onto the stack of open elements so that it is the new
# current node.
// The element insertion algorithm has it do this already...
# 7. Switch the tokenizer to the script data state.
$this->tokenizer->state = Tokenizer::SCRIPT_DATA_STATE;
# 8. Let the original insertion mode be the current insertion mode.
$this->originalInsertionMode = $this->currentInsertionMode;
# 9. Switch the insertion mode to "text".
$this->insertionMode = self::TEXT_MODE;
}
# A start tag whose tag name is "template"
elseif ($token->name === 'template') {
# Insert an HTML element for the token.
static::insertStartTagToken($token);
# Insert a marker at the end of the list of active formatting elements.
$this->activeFormattingElementsList->insertMarker();
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# Switch the insertion mode to "in template".
$this->insertionMode = self::IN_TEMPLATE_MODE;
# Push "in template" onto the stack of template insertion modes so that it is
# the new current template insertion mode.
$this->templateInsertionModes = self::IN_TEMPLATE_MODE;
}
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, 'head');
}
# Anything else
else {
# Act as described in the "anything else" entry below.
#
# Pop the current node (which will be the head element) off the stack of open
# elements.
$this->stack->pop();
# Switch the insertion mode to "after head".
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
}
}
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "head"
if ($token->name === 'head') {
# Pop the current node (which will be the head element) off the stack of open
# elements.
$this->stack->pop();
# Switch the insertion mode to "after head".
$this->insertionMode = self::AFTER_HEAD_MODE;
}
# An end tag whose tag name is one of: "body", "html", "br"
elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') {
# Act as described in the "anything else" entry below.
#
# Pop the current node (which will be the head element) off the stack of open
# elements.
$this->stack->pop();
# Switch the insertion mode to "after head".
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
}
# An end tag whose tag name is "template"
elseif ($token->name === 'template') {
# If there is no template element on the stack of open elements, then this is a
# parse error; ignore the token.
if ($this->stack->search('template') === -1) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'template');
}
# Otherwise, run these steps:
else {
# 1. Generate all implied end tags thoroughly.
$this->stack->generateImpliedEndTags();
# 2. If the current node is not a template element, then this is a parse error.
if ($this->stack->currentNodeName !== 'template') {
$this->error(ParseError::UNEXPECTED_END_TAG, 'template');
}
# 3. Pop elements from the stack of open elements until a template element has been popped from the stack.
$this->stack->popUntil('template');
# 4. Clear the list of active formatting elements up to the last marker.
$this->activeFormattingElementsList->clearToTheLastMarker();
# 5. Pop the current template insertion mode off the stack of template insertion modes.
$this->templateInsertionModes->pop();
# 6. Reset the insertion mode appropriately.
$this->resetInsertionMode();
}
}
# Any other end tag
else {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
}
# Anything else
else {
# Pop the current node (which will be the head element) off the stack of open
# elements.
$this->stack->pop();
# Switch the insertion mode to "after head".
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
}
break;
# 8.2.5.4.5. The "in head noscript" insertion mode
case self::IN_HEAD_NOSCRIPT_MODE:
# DOCTYPE token
if ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is one of: "basefont", "bgsound", "link", "meta",
# "noframes", "style"
elseif ($token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'style'){
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# A start tag whose tag name is one of: "head", "noscript"
elseif ($token->name === 'head' || $token->name === 'noscript') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
}
# Anything else
else {
# Act as described in the "anything else" entry below.
#
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
# Pop the current node (which will be a noscript element) from the stack of open
# elements; the new current node will be a head element.
$this->stack->pop();
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
}
}
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "noscript"
if ($token->name === 'noscript') {
# Pop the current node (which will be a noscript element) from the stack of open
# elements; the new current node will be a head element.
$this->stack->pop();
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
}
# An end tag whose tag name is "br"
elseif ($token->name === 'br') {
# Act as described in the "anything else" entry below.
#
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, 'br');
# Pop the current node (which will be a noscript element) from the stack of open
# elements; the new current node will be a head element.
$this->stack->pop();
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
}
# Any other end tag
else {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, 'br');
}
}
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
# A comment token
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif (($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) ||
$token instanceof CommentToken) {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# Anything else
else {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, 'br');
# Pop the current node (which will be a noscript element) from the stack of open
# elements; the new current node will be a head element.
$this->stack->pop();
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
}
break;
# 8.2.5.4.6. The "after head" insertion mode
case self::AFTER_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character.
$this->insertCharacterToken($token);
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
}
# A start tag whose tag name is "body"
elseif ($token->name === 'body') {
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
}
# A start tag whose tag name is "frameset"
elseif ($token->name === 'frameset') {
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
# Switch the insertion mode to "in frameset".
$this->insertionMode = self::IN_FRAMESET_MODE;
}
# A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
# "meta", "noframes", "script", "style", "template", "title"
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
# Push the node pointed to by the head element pointer onto the stack of open elements.
$this->stack[] = $this->headElement;
# Process the token using the rules for the "in head" insertion mode.
$this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE);
# Remove the node pointed to by the head element pointer from the stack of open
# elements. (It might not be the current node at this point.)
$key = $this->stack->search($this->headElement);
if ($key !== -1) {
unset($this->stack[$key]);
}
}
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, 'head');
}
# Any other start tag
else {
# Act as described in the "anything else" entry below.
#
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
}
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "template"
if ($token->name === 'template') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# An end tag whose tag name is one of: "body", "html", "br"
elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') {
# Act as described in the "anything else" entry below.
#
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
# Any other end tag
else {
# Parse error.
$this->error(ParseError::UNEXPECTED_END_TAG, 'head');
}
}
# Anything else
else {
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
break;
# 8.2.5.4.7. The "in body" insertion mode
case self::IN_BODY_MODE:
if ($token instanceof CharacterToken) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
#
# Any other character token
// Space characters and any other characters are exactly the same except any
// other characters sets the frameset-ok flag to "not ok".
# Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
# Insert the tokens character.
$this->insertCharacterToken($token);
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
if ($token->name === 'html') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, 'html');
# If there is a template element on the stack of open elements, then ignore the
# token.
if ($this->stack->search('template') === -1) {
# Otherwise, for each attribute on the token, check to see if the attribute is
# already present on the top element of the stack of open elements. If it is
# not, add the attribute and its corresponding value to that element.
$top = $this->stack[0];
foreach ($token->attributes as $a) {
if (!$top->hasAttribute($a->name)) {
$top->setAttribute($a->name, $a->value);
}
}
}
}
# A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
# "meta", "noframes", "script", "style", "template", "title"
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# A start tag whose tag name is "body"
elseif ($token->name === 'body') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, 'body');
# If the second element on the stack of open elements is not a body element, if
# the stack of open elements has only one node on it, or if there is a template
# element on the stack of open elements, then ignore the token. (fragment case)
if (!($this->stack[1]->name !== 'body' || $this->stack->length === 1 || $this->stack->search('template') !== -1)) {
# Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on
# the token, check to see if the attribute is already present on the body
# element (the second element) on the stack of open elements, and if it is not,
# add the attribute and its corresponding value to that element.
$this->framesetOk = false;
$body = $this->stack[1];
foreach ($token->attributes as $a) {
if (!$body->hasAttribute($a->name)) {
$body->setAttribute($a->name, $a->value);
}
}
}
}
# A start tag whose tag name is "frameset"
elseif ($token->name === 'frameset') {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, 'frameset');
# If the stack of open elements has only one node on it, or if the second
# element on the stack of open elements is not a body element, then ignore the
# token. (fragment case)
# If the frameset-ok flag is set to "not ok", ignore the token.
if (!($this->stack->length === 1 || $this->stack[1]->name !== 'body' || $this->framesetOk === false)) {
# Otherwise, run the following steps:
#
# 1. Remove the second element on the stack of open elements from its parent
# node, if it has one.
$second = $this->stack[1];
if ($second->parentNode) {
$second->parentNode->removeChild($second);
}
# 2. Pop all the nodes from the bottom of the stack of open elements, from the
# current node up to, but not including, the root html element.
for ($i = $this->stack->length - 1; $i > 0; $i--) {
$this->stack->pop();
}
# 3. Insert an HTML element for the token.
$this->insertStartTagToken($token);
# 4. Switch the insertion mode to "in frameset".
$this->insertionMode = self::IN_FRAMESET_MODE;
}
}
# A start tag whose tag name is one of: "address", "article", "aside",
# "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset",
# "figcaption", "figure", "footer", "header", "main", "nav", "ol", "p",
# "section", "summary", "ul"
elseif ($token->name === 'address' || $token->name === 'article' || $token->name === 'aside' || $token->name === 'blockquote' || $token->name === 'center' || $token->name === 'details' || $token->name === 'dialog' || $token->name === 'dir' || $token->name === 'div' || $token->name === 'dl' || $token->name === 'fieldset' || $token->name === 'figcaption' || $token->name === 'figure' || $token->name === 'footer' || $token->name === 'header' || $token->name === 'main' || $token->name === 'nav' || $token->name === 'ol' || $token->name === 'p' || $token->name === 'section' || $token->name === 'summary' || $token->name === 'ul') {
# If the stack of open elements has a p element in button scope, then close a p
# element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
}
# A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
elseif ($token->name === 'h1' || $token->name === 'h2' || $token->name === 'h3' || $token->name === 'h4' || $token->name === 'h5' || $token->name === 'h6') {
# If the stack of open elements has a p element in button scope, then close a p
# element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# If the current node is an HTML element whose tag name is one of "h1", "h2",
# "h3", "h4", "h5", or "h6", then this is a parse error; pop the current node
# off the stack of open elements.
$currentNodeName = $this->stack->currentNodeName;
$currentNodeNamespace = $this->stack->currentNodeNamespace;
if ($currentNodeNamespace === '' && ($currentNodeName === 'h1' || $currentNodeName === 'h2' || $currentNodeName === 'h3' || $currentNodeName === 'h4' || $currentNodeName === 'h5' || $currentNodeName === 'h6')) {
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
$this->stack->pop();
}
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
}
# A start tag whose tag name is one of: "pre", "listing"
elseif ($token->name === 'pre' || $token->name === 'listing') {
# If the stack of open elements has a p element in button scope, then close a p
# element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# If the next token is a U+000A LINE FEED (LF) character token, then ignore that
# token and move on to the next one. (Newlines at the start of pre blocks are
# ignored as an authoring convenience.)
$nextToken = $this->tokenizer->createToken();
if ($nextToken instanceof CharacterToken) {
// Character tokens in this implementation can have more than one character in
// them.
if (strlen($nextToken->data) === 1 && $nextToken->data === "\n") {
return true;
} elseif (strpos($nextToken->data, "\n") === 0) {
$nextToken->data = substr($nextToken->data, 1);
}
}
// Process the next token
$token = $nextToken;
continue 2;
}
# A start tag whose tag name is "form"
elseif ($token->name === 'form') {
# If the form element pointer is not null, and there is no template element on
# the stack of open elements, then this is a parse error; ignore the token.
$templateInStack = ($this->stack->search('template') !== -1);
if (!is_null($this->formElement) && !$templateInStack) {
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
}
# Otherwise:
else {
# If the stack of open elements has a p element in button scope, then close a p
# element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# Insert an HTML element for the token, and, if there is no template element on
# the stack of open elements, set the form element pointer to point to the
# element created.
$form = $this->insertStartTagToken($token);
if ($templateInStack) {
$this->formElement = $form;
}
}
}
# A start tag whose tag name is "li"
elseif ($token->name === 'li') {
# 1. Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# 2. Initialize node to be the current node (the bottommost node of the stack).
# 3. Loop: If node is an li element, then run these substeps:
for ($i = $this->stack->length - 1; $i >= 0; $i--) {
$node = $this->stack[$i];
$nodeName = $node->nodeName;
if ($nodeName === 'li') {
# 1. Generate implied end tags, except for li elements.
$this->stack->generateImpliedEndTags('li');
# 2. If the current node is not an li element, then this is a parse error.
if ($this->stack->currentNodeName !== 'li') {
$this->error(ParseError::UNEXPECTED_START_TAG, $nodeName);
}
# 3. Pop elements from the stack of open elements until an li element has been
# popped from the stack.
$this->stack->popUntil('li');
# 4. Jump to the step labeled Done below.
break;
}
# 4. If node is in the special category, but is not an address, div, or p
# element, then jump to the step labeled Done below.
if ($nodeName !== 'address' && $nodeName !== 'div' && $nodeName !== 'p' && $this->isElementSpecial($node)) {
break;
}
# 5. Otherwise, set node to the previous entry in the stack of open elements and
# return to the step labeled Loop.
// The loop handles that.
}
# 6. Done: If the stack of open elements has a p element in button scope, then
# close a p element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# 7. Finally, insert an HTML element for the token.
$this->insertStartTagToken($token);
}
# A start tag whose tag name is one of: "dd", "dt"
elseif ($token->name === 'dd' || $token->name === 'dt') {
# 1. Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# 2. Initialize node to be the current node (the bottommost node of the stack).
for ($i = $this->stack->length - 1; $i >= 0; $i--) {
$node = $this->stack[$i];
$nodeName = $node->nodeName;
// Combining these two sets of instructions as they're identical except for the
// element name.
# 3. Loop: If node is a dd element, then run these substeps:
# 4. If node is a dt element, then run these substeps:
if ($nodeName === 'dd' || $nodeName === 'dt') {
# 1. Generate implied end tags, except for dd or dt elements.
$this->stack->generateImpliedEndTags(['dd', 'dt']);
# 2. If the current node is not a dd or dt element, then this is a parse error.
if ($this->stack->currentNodeName !== $nodeName) {
$this->error(ParseError::UNEXPECTED_START_TAG, $nodeName);
}
# 3. Pop elements from the stack of open elements until a dd or dt element has been
# popped from the stack.
$this->stack->popUntil(['dd', 'dt']);
# 4. Jump to the step labeled Done below.
break;
}
# 5. If node is in the special category, but is not an address, div, or p
# element, then jump to the step labeled Done below.
if ($nodeName !== 'address' && $nodeName !== 'div' && $nodeName !== 'p' && $this->isElementSpecial($node)) {
break;
}
# 6. Otherwise, set node to the previous entry in the stack of open elements and
# return to the step labeled Loop.
// The loop handles that.
}
# 7. Done: If the stack of open elements has a p element in button scope, then
# close a p element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# 8. Finally, insert an HTML element for the token.
$this->insertStartTagToken($token);
}
# A start tag whose tag name is "plaintext"
elseif ($token->name === 'plaintext') {
# If the stack of open elements has a p element in button scope, then close a p
# element.
if ($this->stack->hasElementInButtonScope('p')) {
$this->closePElement();
}
# Insert an HTML element for the token.
$this->insertStartTagToken($token);
# Switch the tokenizer to the §8.2.4.5 PLAINTEXT state.
$this->tokenizer->state = Tokenizer::PLAINTEXT_STATE;
}
# A start tag whose tag name is "button"
elseif ($token->name === 'button') {
# 1. If the stack of open elements has a button element in scope, then run these
# substeps:
if ($this->stack->hasElementInScope('button')) {
# 1. Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
# 2. Generate implied end tags.
$this->stack->generateImpliedEndTags();
# 3. Pop elements from the stack of open elements until a button element has
# been popped from the stack.
$this->stack->popUntil('button');
}
# 2. Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
# 3. Insert an HTML element for the token.
$this->insertStartTagToken($token);
# 4. Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
}
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "template"
if ($token->name === 'template') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# An end tag whose tag name is "body"
# An end tag whose tag name is "html"
elseif ($token->name === 'body' || $token->name === 'html') {
# If the stack of open elements does not have a body element in scope, this is a
# parse error; ignore the token.
if (!$this->stack->hasElementInScope('body')) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'body');
}
# Otherwise, if there is a node in the stack of open elements that is not either
# a dd element, a dt element, an li element, an optgroup element, an option
# element, a p element, an rb element, an rp element, an rt element, an rtc
# element, a tbody element, a td element, a tfoot element, a th element, a thead
# element, a tr element, the body element, or the html element, then this is a
# parse error.
else {
if ($this->stack->search(function($node) {
$n = $node->nodeName;
if ($n !== 'dd' && $n !== 'dt' && $n !== 'li' && $n !== 'optgroup' && $n !== 'option' && $n !== 'p' && $n !== 'rb' && $n !== 'rp' && $n !== 'rt' && $n !== 'rtc' && $n !== 'tbody' && $n !== 'td' && $n !== 'tfoot' && $n !== 'th' && $n !== 'thead' && $n !== 'tr' && $n !== 'body' && $n !== 'html') {
return true;
}
return false;
}) !== -1) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'body');
break;
}
# Switch the insertion mode to "after body".
self::$insertionMode = self::AFTER_BODY_MODE;
// The only thing different between body and html here is that when processing
// an html end tag the token is reprocessed.
if ($token->name === 'html') {
# Reprocess the token.
continue 2;
}
}
}
# An end tag whose tag name is one of: "address", "article", "aside",
# "blockquote", "button", "center", "details", "dialog", "dir", "div", "dl",
# "fieldset", "figcaption", "figure", "footer", "header", "listing", "main",
# "nav", "ol", "pre", "section", "summary", "ul"
elseif ($token->name === 'address' || $token->name === 'article' || $token->name === 'aside' || $token->name === 'blockquote' || $token->name === 'button' || $token->name === 'center' || $token->name === 'details' || $token->name === 'dialog' || $token->name === 'dir' || $token->name === 'div' || $token->name === 'dl' || $token->name === 'fieldset' || $token->name === 'figcaption' || $token->name === 'figure' || $token->name === 'footer' || $token->name === 'header' || $token->name === 'listing' || $token->name === 'main' || $token->name === 'nav' || $token->name === 'ol' || $token->name === 'pre' || $token->name === 'section' || $token->name === 'summary' || $token->name === 'ul') {
# If the stack of open elements does not have an element in scope that is an
# HTML element with the same tag name as that of the token, then this is a parse
# error; ignore the token.
if (!$this->stack->hasElementInScope($token->name)) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# Otherwise, run these steps:
else {
# 1. Generate implied end tags.
$this->stack->generateImpliedEndTags();
# 2. If the current node is not an HTML element with the same tag name as that
# of the token, then this is a parse error.
if ($this->stack->currentNodeName !== $token->name) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# 3. Pop elements from the stack of open elements until an HTML element with the
# same tag name as the token has been popped from the stack.
$this->stack->popUntil($token->name);
}
}
# An end tag whose tag name is "form"
elseif ($token->name === 'form') {
# If there is no template element on the stack of open elements, then run these
# substeps:
if ($this->stack->search('template') === -1) {
# 1. Let node be the element that the form element pointer is set to, or null if it
# is not set to an element.
$node = $this->formElement;
# 2. Set the form element pointer to null.
$this->formElement = null;
# 3. If node is null or if the stack of open elements does not have node in
# scope, then this is a parse error; return and ignore the token.
if (is_null($node) || !$this->stack->hasElementInScope($node)) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
return true;
}
# 4. Generate implied end tags.
$this->stack->generateImpliedEndTags();
# 5. If the current node is not node, then this is a parse error.
if (!$this->stack->currentNode->isSameNode($node)) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# 6. Remove node from the stack of open elements
$this->stack->remove($node);
}
# If there is a template element on the stack of open elements, then run these
# substeps instead:
else {
# 1. If the stack of open elements does not have a form element in scope, then
# this is a parse error; return and ignore the token.
if ($this->stack->hasElementInScope('form')) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
return true;
}
# 2. Generate implied end tags.
$this->stack->generateImpliedEndTags();
# 3. If the current node is not a form element, then this is a parse error.
if (!$this->stack->currentNodeName !== 'form') {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# 4. Pop elements from the stack of open elements until a form element has been
# popped from the stack.
$this->stack->popUntil('form');
}
}
}
# An end-of-file token
elseif ($token instanceof EOFToken) {
# If the stack of template insertion modes is not empty, then process the token using the rules for the "in template" insertion mode.
if ($this->templateInsertionModes->length !== 0) {
$insertionMode = self::IN_TEMPLATE_MODE;
continue 2;
}
# Otherwise, follow these steps:
# 1. If there is a node in the stack of open elements that is not either a dd
# element, a dt element, an li element, an optgroup element, an option element,
# a p element, an rb element, an rp element, an rt element, an rtc element, a
# tbody element, a td element, a tfoot element, a th element, a thead element, a
# tr element, the body element, or the html element, then this is a parse error.
if ($this->stack->search(function($node) {
$n = $node->nodeName;
if ($n !== 'dd' && $n !== 'dt' && $n !== 'li' && $n !== 'optgroup' && $n !== 'option' && $n !== 'p' && $n !== 'rb' && $n !== 'rp' && $n !== 'rt' && $n !== 'rtc' && $n !== 'tbody' && $n !== 'td' && $n !== 'tfoot' && $n !== 'th' && $n !== 'thead' && $n !== 'tr' && $n !== 'body' && $n !== 'html') {
return true;
}
return false;
}) !== -1) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'body');
break;
}
# 2. Stop parsing.
// Abort!
}
break;
}
break;
}
return true;
}
protected function parseTokenInForeignContent(Token $token): bool {
if (self::$debug) {
echo "Foreign Content\n";
}
$currentNode = $this->stack->currentNode;
$currentNodeName = $this->stack->currentNodeName;
$currentNodeNamespace = $this->stack->currentNodeNamespace;
# 8.2.5.5 The rules for parsing tokens in foreign content
#
# When the user agent is to apply the rules for parsing tokens in foreign
# content, the user agent must handle the token as follows:
#
if ($token instanceof CharacterToken) {
# A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A),
# "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE
# Any other character token
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
# Insert the token's character.
$this->insertCharacterToken($token);
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
$this->error(ParseError::UNEXPECTED_DOCTYPE);
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is one of: "b", "big", "blockquote", "body", "br",
# "center", "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3",
# "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta",
# "nobr", "ol", "p", "pre", "ruby", "s", "small", "span", "strong", "strike",
# "sub", "sup", "table", "tt", "u", "ul", "var"
# A start tag whose tag name is "font", if the token has any attributes named
# "color", "face", or "size"
if ($token->name === 'b' || $token->name === 'big' || $token->name === 'blockquote' || $token->name === 'body' || $token->name === 'br' || $token->name === 'center' || $token->name === 'code' || $token->name === 'dd' || $token->name === 'div' || $token->name === 'dl' || $token->name === 'dt' || $token->name === 'em' || $token->name === 'embed' || $token->name === 'h1' || $token->name === 'h2' || $token->name === 'h3' || $token->name === 'h4' || $token->name === 'h5' || $token->name === 'h6' || $token->name === 'head' || $token->name === 'hr' || $token->name === 'i' || $token->name === 'img' || $token->name === 'li' || $token->name === 'listing' || $token->name === 'menu' || $token->name === 'meta' || $token->name === 'nobr' || $token->name === 'ol' || $token->name === 'p' || $token->name === 'pre' || $token->name === 'ruby' || $token->name === 's' || $token->name === 'small' || $token->name === 'span' || $token->name === 'strong' || $token->name === 'strike' || $token->name === 'sub' || $token->name === 'sup' || $token->name === 'table' || $token->name === 'tt' || $token->name === 'u' || $token->name === 'var' || (
$token->name === 'font' && (
$token->hasAttribute('color') || $token->hasAttribute('face') || $token->hasAttribute('size')
)
)
) {
# Parse error.
$this->error(ParseError::UNEXPECTED_START_TAG, $token->name);
# If the parser was originally created for the HTML fragment parsing algorithm,
# then act as described in the "any other start tag" entry below. (fragment
# case)
if ($this->fragmentCase === true) {
// ¡TEMPORARY!
goto foreignContentAnyOtherStartTag;
}
# Otherwise:
#
# Pop an element from the stack of open elements, and then keep popping more
# elements from the stack of open elements until the current node is a MathML
# text integration point, an HTML integration point, or an element in the HTML
# namespace.
do {
$popped = $this->stack->pop();
$n = $this->stack->currentNode;
$nns = $currentNode->namespaceURI;
} while (!is_null($popped) && !(
$n->isMathMLTextIntegrationPoint() ||
$n->isHTMLIntegrationPoint() ||
// PHP's DOM returns null when the namespace isn't specified... eg. HTML.
is_null($nns)
)
);
# Then, reprocess the token.
return false;
}
# Any other start tag
else {
// ¡TEMPORARY!
foreignContentAnyOtherStartTag:
# If the adjusted current node is an element in the SVG namespace, and the
# tokens tag name is one of the ones in the first column of the following
# table, change the tag name to the name given in the corresponding cell in the
# second column. (This fixes the case of SVG elements that are not all
# lowercase.)
if ($this->stack->adjustedCurrentNodeNamespace === Parser::SVG_NAMESPACE) {
switch ($token->name) {
case 'altglyph': $token->name = 'altGlyph';
break;
case 'altglyphdef': $token->name = 'altGlyphDef';
break;
case 'altglyphitem': $token->name = 'altGlyphItem';
break;
case 'animatecolor': $token->name = 'animateColor';
break;
case 'animatemotion': $token->name = 'animateMotion';
break;
case 'animatetransform': $token->name = 'animateTransform';
break;
case 'clippath': $token->name = 'clipPath';
break;
case 'feblend': $token->name = 'feBlend';
break;
case 'fecolormatrix': $token->name = 'feColorMatrix';
break;
case 'fecomponenttransfer': $token->name = 'feComponentTransfer';
break;
case 'fecomposite': $token->name = 'feComposite';
break;
case 'feconvolvematrix': $token->name = 'feConvolveMatrix';
break;
case 'fediffuselighting': $token->name = 'feDiffuseLighting';
break;
case 'fedisplacementmap': $token->name = 'feDisplacementMap';
break;
case 'fedistantlight': $token->name = 'feDistantLight';
break;
case 'feflood': $token->name = 'feFlood';
break;
case 'fefunca': $token->name = 'feFuncA';
break;
case 'fefuncb': $token->name = 'feFuncB';
break;
case 'fefuncg': $token->name = 'feFuncG';
break;
case 'fefuncr': $token->name = 'feFuncR';
break;
case 'fegaussianblur': $token->name = 'feGaussianBlur';
break;
case 'feimage': $token->name = 'feImage';
break;
case 'femerge': $token->name = 'feMerge';
break;
case 'femergenode': $token->name = 'feMergeNode';
break;
case 'femorphology': $token->name = 'feMorphology';
break;
case 'feoffset': $token->name = 'feOffset';
break;
case 'fepointlight': $token->name = 'fePointLight';
break;
case 'fespecularlighting': $token->name = 'feSpecularLighting';
break;
case 'fespotlight': $token->name = 'feSpotLight';
break;
case 'fetile': $token->name = 'feTile';
break;
case 'feturbulence': $token->name = 'feTurbulence';
break;
case 'foreignobject': $token->name = 'foreignObject';
break;
case 'glyphref': $token->name = 'glyphRef';
break;
case 'lineargradient': $token->name = 'linearGradient';
break;
case 'radialgradient': $token->name = 'radialGradient';
break;
case 'textpath': $token->name = 'textPath';
}
}
foreach ($token->attributes as &$a) {
# If the current node is an element in the MathML namespace, adjust MathML
# attributes for the token. (This fixes the case of MathML attributes that are
# not all lowercase.)
if ($currentNodeNamespace === Parser::MATHML_NAMESPACE && $a->name === 'definitionurl') {
$a->name = 'definitionURL';
}
# If the current node is an element in the SVG namespace, adjust SVG attributes
# for the token. (This fixes the case of SVG attributes that are not all
# lowercase.)
elseif ($currentNodeNamespace === Parser::SVG_NAMESPACE) {
switch ($a->name) {
case 'attributename': $a->name = 'attributeName';
break;
case 'attributetype': $a->name = 'attributeType';
break;
case 'basefrequency': $a->name = 'baseFrequency';
break;
case 'baseprofile': $a->name = 'baseProfile';
break;
case 'calcmode': $a->name = 'calcMode';
break;
case 'clippathunits': $a->name = 'clipPathUnits';
break;
case 'contentscripttype': $a->name = 'contentScriptType';
break;
case 'contentstyletype': $a->name = 'contentStyleType';
break;
case 'diffuseconstant': $a->name = 'diffuseConstant';
break;
case 'edgemode': $a->name = 'edgeMode';
break;
case 'externalresourcesrequired': $a->name = 'externalResourcesRequired';
break;
case 'filterres': $a->name = 'filterRes';
break;
case 'filterunits': $a->name = 'filterUnits';
break;
case 'glyphref': $a->name = 'glyphRef';
break;
case 'gradienttransform': $a->name = 'gradientTransform';
break;
case 'gradientunits': $a->name = 'gradientUnits';
break;
case 'kernelmatrix': $a->name = 'kernelMatrix';
break;
case 'kernelunitlength': $a->name = 'kernelUnitLength';
break;
case 'keypoints': $a->name = 'keyPoints';
break;
case 'keysplines': $a->name = 'keySplines';
break;
case 'keytimes': $a->name = 'keyTimes';
break;
case 'lengthadjust': $a->name = 'lengthAdjust';
break;
case 'limitingconeangle': $a->name = 'limitingConeAngle';
break;
case 'markerheight': $a->name = 'markerHeight';
break;
case 'markerunits': $a->name = 'markerUnits';
break;
case 'markerwidth': $a->name = 'markerWidth';
break;
case 'maskcontentunits': $a->name = 'maskContentUnits';
break;
case 'maskunits': $a->name = 'maskUnits';
break;
case 'numoctaves': $a->name = 'numOctaves';
break;
case 'pathlength': $a->name = 'pathLength';
break;
case 'patterncontentunits': $a->name = 'patternContentUnits';
break;
case 'patterntransform': $a->name = 'patternTransform';
break;
case 'patternunits': $a->name = 'patternUnits';
break;
case 'pointsatx': $a->name = 'pointsAtX';
break;
case 'pointsaty': $a->name = 'pointsAtY';
break;
case 'pointsatz': $a->name = 'pointsAtZ';
break;
case 'preservealpha': $a->name = 'preserveAlpha';
break;
case 'preserveaspectratio': $a->name = 'preserveAspectRatio';
break;
case 'primitiveunits': $a->name = 'primitiveUnits';
break;
case 'refx': $a->name = 'refX';
break;
case 'refy': $a->name = 'refY';
break;
case 'repeatcount': $a->name = 'repeatCount';
break;
case 'repeatdur': $a->name = 'repeatDur';
break;
case 'requiredextensions': $a->name = 'requiredExtensions';
break;
case 'requiredfeatures': $a->name = 'requiredFeatures';
break;
case 'specularconstant': $a->name = 'specularConstant';
break;
case 'specularexponent': $a->name = 'specularExponent';
break;
case 'spreadmethod': $a->name = 'spreadMethod';
break;
case 'startoffset': $a->name = 'startOffset';
break;
case 'stddeviation': $a->name = 'stdDeviation';
break;
case 'stitchtiles': $a->name = 'stitchTiles';
break;
case 'surfacescale': $a->name = 'surfaceScale';
break;
case 'systemlanguage': $a->name = 'systemLanguage';
break;
case 'tablevalues': $a->name = 'tableValues';
break;
case 'targetx': $a->name = 'targetX';
break;
case 'targety': $a->name = 'targetY';
break;
case 'textlength': $a->name = 'textLength';
break;
case 'viewbox': $a->name = 'viewBox';
break;
case 'viewtarget': $a->name = 'viewTarget';
break;
case 'xchannelselector': $a->name = 'xChannelSelector';
break;
case 'ychannelselector': $a->name = 'yChannelSelector';
break;
case 'zoomandpan': $a->name = 'zoomAndPan';
}
}
# Adjust foreign attributes for the token. (This fixes the use of namespaced
# attributes, in particular XLink in SVG.)
# When the steps below require the user agent to adjust foreign attributes for a
# token, then, if any of the attributes on the token match the strings given in
# the first column of the following table, let the attribute be a namespaced
# attribute, with the prefix being the string given in the corresponding cell in
# the second column, the local name being the string given in the corresponding
# cell in the third column, and the namespace being the namespace given in the
# corresponding cell in the fourth column. (This fixes the use of namespaced
# attributes, in particular lang attributes in the XML namespace.)
// DOMElement::setAttributeNS requires the prefix and local name be in one
// string, so there is no need to separate the prefix and the local name here.
switch($a->name) {
case 'xlink:actuate':
case 'xlink:arcrole':
case 'xlink:href':
case 'xlink:role':
case 'xlink:show':
case 'xlink:title':
case 'xlink:type': $a->namespace = Parser::XLINK_NAMESPACE;
break;
case 'xml:base':
case 'xml:lang':
case 'xml:space': $a->namespace = Parser::XML_NAMESPACE;
break;
case 'xmlns': $a->namespace = Parser::XMLNS_NAMESPACE;
break;
case 'xmlns:xlink': $a->namespace = Parser::XLINK_NAMESPACE;
break;
}
}
# Insert a foreign element for the token, in the same namespace as the adjusted
# current node.
static::insertStartTagToken($token, null, $this->stack->adjustedCurrentNode->namespaceURI);
# If the token has its self-closing flag set, then run the appropriate steps
# from the following list:
#
# If the tokens tag name is "script", and the new current node is in the SVG
# namespace
# Acknowledge the tokens *self-closing flag*, and then act as described in the
# steps for a "script" end tag below.
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# Otherwise
# Pop the current node off the stack of open elements and acknowledge the
# tokens *self-closing flag*.
$this->stack->pop();
// Acknowledged.
}
}
# An end tag whose tag name is "script", if the current node is a script element
# in the SVG namespace
// DEVIATION: This implementation does not support scripting, so script elements
// aren't processed differently.
# Any other end tag
elseif ($token instanceof EndTagToken) {
# Run these steps:
#
# 1. Initialize node to be the current node (the bottommost node of the stack).
$node = $currentNode;
$nodeName = $currentNodeName;
# 2. If node is not an element with the same tag name as the token, then this is
# a parse error.
if ($nodeName !== $token->name) {
$this->error(ParseError::UNEXPECTED_END_TAG, $token->name);
}
# 3. Loop: If node's tag name, converted to ASCII lowercase, is the same as the
# tag name of the token, pop elements from the stack of open elements until node
# has been popped from the stack, and then abort these steps.
$count = $this->stack->length - 1;
while (true) {
if (strtolower($nodeName) === $token->name) {
$this->stack->popUntil($node);
break;
}
# 4. Set node to the previous entry in the stack of open elements.
$node = $this->stack[--$count];
$nodeName = $node->nodeName;
# 5. If node is not an element in the HTML namespace, return to the step labeled
# loop.
// PHP DOM returns null if the namespace isn't specified... eg. HTML.
if (!is_null($node->namespaceURI)) {
continue;
}
# 6. Otherwise, process the token according to the rules given in the section
# corresponding to the current insertion mode in HTML content.
$this->parseTokenInHTMLContent($token, $this->insertionMode);
break;
}
}
return true;
}
protected function appropriatePlaceForInsertingNode(\DOMNode $overrideTarget = null): array {
$insertBefore = false;
# 8.2.5.1. Creating and inserting nodes
#
# While the parser is processing a token, it can enable or disable foster
# parenting. This affects the following algorithm.
#
# The appropriate place for inserting a node, optionally using a particular
# override target, is the position in an element returned by running the
# following steps:
# 1. If there was an override target specified, then let target be the override
# target.
$target = (!is_null($overrideTarget)) ? $overrideTarget : $this->stack->currentNode;
# 2. Determine the adjusted insertion location using the first matching steps
# from the following list: If foster parenting is enabled and target is a table,
# tbody, tfoot, thead, or tr element
$targetNodeName = $target->nodeName;
if ($this->fosterParenting && ($targetNodeName === 'table' || $targetNodeName === 'tbody' || $targetNodeName === 'tfoot' || $targetNodeName === 'thead' || $targetNodeName === 'tr')) {
# Run these substeps:
#
# 1. Let last template be the last template element in the stack of open
# elements, if any.
$lastTemplateKey = $this->stack->search('template');
$lastTemplate = ($lastTemplateKey !== -1 ) ? $this->stack[$lastTemplateKey] : null;
# 2. Let last table be the last table element in the stack of open elements, if
# any.
$lastTableKey = $this->stack->search('table');
$lastTable = ($lastTableKey !== -1 ) ? $this->stack[$lastTableKey] : null;
# 3. If there is a last template and either there is no last table, or there is
# one, but last template is lower (more recently added) than last table in the
# stack of open elements, then: let adjusted insertion location be inside last
# templates template contents, after its last child (if any), and abort these
# substeps.
if ($lastTemplate && (!$lastTable || $lastTable && $lastTemplateKey > $lastTableKey)) {
$insertionLocation = $lastTemplate->content;
// Abort!
}
# 4. If there is no last table, then let adjusted insertion location be inside
# the first element in the stack of open elements (the html element), after its
# last child (if any), and abort these substeps. (fragment case)
elseif (!$lastTable) {
$insertionLocation = $this->stack[0];
// Abort!
}
# 5. If last table has a parent node, then let adjusted insertion location be
# inside last tables parent node, immediately before last table, and abort
# these substeps.
elseif ($lastTable->parentNode) {
$insertionLocation = $lastTable;
$insertBefore = true;
// Abort!
}
else {
# 6. Let previous element be the element immediately above last table in the
# stack of open elements.
$previousElement = $this->stack[$lastTableKey - 1];
# 7. Let adjusted insertion location be inside previous element, after its last
# child (if any).
$insertionLocation = $previousElement;
}
}
# Otherwise let adjusted insertion location be inside target, after its last
# child (if any).
else {
$insertionLocation = $target;
}
# 3. If the adjusted insertion location is inside a template element, let it
# instead be inside the template elements template contents, after its last
# child (if any).
if ($insertionLocation instanceof Element && $insertionLocation->nodeName === 'template') {
$insertionLocation = $insertionLocation->content;
}
# 4. Return the adjusted insertion location.
return [
'node' => $insertionLocation,
'insert before' => $insertBefore
];
}
public static function insertCharacterToken(CharacterToken $token) {
# 1. Let data be the characters passed to the algorithm, or, if no characters
# were explicitly specified, the character of the character token being
# processed.
// Already provided through the token object.
# 2. Let the adjusted insertion location be the appropriate place for inserting
# a node.
$location = static::$instance->appropriatePlaceForInsertingNode();
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
# 3. If the adjusted insertion location is in a Document node, then abort these
# steps.
if ((($insertBefore === false) ? $adjustedInsertionLocation : $adjustedInsertionLocation->parentNode) instanceof \DOMDocument) {
return;
}
# 4. If there is a Text node immediately before the adjusted insertion location,
# then append data to that Text nodes data.
$previousSibling = ($insertBefore === false) ? $adjustedInsertionLocation->lastChild : $adjustedInsertionLocation->previousSibling;
if ($previousSibling instanceof \DOMText) {
$previousSibling->data .= $token->data;
return;
}
# Otherwise, create a new Text node whose data is data and whose node document
# is the same as that of the element in which the adjusted insertion location
# finds itself, and insert the newly created node at the adjusted insertion
# location.
$textNode = $adjustedInsertionLocation->ownerDocument->createTextNode($token->data);
if ($insertBefore === false) {
$adjustedInsertionLocation->appendChild($textNode);
} else {
$adjustedInsertionLocation->parentNode->insertBefore($textNode, $adjustedInsertionLocation);
}
}
public static function insertCommentToken(CommentToken $token, \DOMNode $position = null) {
# When the steps below require the user agent to insert a comment while
# processing a comment token, optionally with an explicitly insertion position
# position, the user agent must run the following steps:
# 1. Let data be the data given in the comment token being processed.
// Already provided through the token object.
# 2. If position was specified, then let the adjusted insertion location be
# position. Otherwise, let adjusted insertion location be the appropriate place
# for inserting a node.
if (!is_null($position)) {
$adjustedInsertionLocation = $position;
$insertBefore = false;
} else {
$location = static::$instance->appropriatePlaceForInsertingNode();
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
}
# 3. Create a Comment node whose data attribute is set to data and whose node
# document is the same as that of the node in which the adjusted insertion
# location finds itself.
$commentNode = $adjustedInsertionLocation->ownerDocument->createComment($token->data);
# 4. Insert the newly created node at the adjusted insertion location.
if ($insertBefore === false) {
$adjustedInsertionLocation->appendChild($commentNode);
} else {
$adjustedInsertionLocation->parentNode->insertBefore($commentNode, $adjustedInsertionLocation);
}
}
public static function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element {
if (!is_null($namespace)) {
$namespace = $token->namespace;
}
# When the steps below require the UA to create an element for a token in a
# particular given namespace and with a particular intended parent, the UA must
# run the following steps:
# 1. Let document be intended parents node document.
// DEVIATION: Unnecessary because there aren't any nested contexts to consider.
// The document will always be $this->DOM.
# 2. Let local name be the tag name of the token.
// Nope. Don't need it because when creating elements with
// DOMElement::createElementNS the prefix and local name are combined.
// DEVIATION: Steps three through six are unnecessary because there is no
// scripting in this implementation.
# 7. Let element be the result of creating an element given document, local
# name, given namespace, null, and is. If will execute script is true, set the
# synchronous custom elements flag; otherwise, leave it unset.
// DEVIATION: There is no point to setting the synchronous custom elements flag
// and custom element definition; there is no scripting in this implementation.
if ($namespace === Parser::HTML_NAMESPACE) {
$element = static::$instance->DOM->createElement($token->name);
} else {
$element = static::$instance->DOM->createElementNS($namespace, $token->name);
}
# 8. Append each attribute in the given token to element.
foreach ($token->attributes as $a) {
if ($namespace === Parser::HTML_NAMESPACE) {
$element->setAttribute($a->name, $a->value);
} else {
$element->setAttributeNS($namespace, $a->name, $a->value);
}
}
# 9. If will execute script is true, then:
# - 1. Let queue be the result of popping the current element queue from the
# custom element reactions stack. (This will be the same element queue as was
# pushed above.)
# - 2. Invoke custom element reactions in queue.
# - 3. Decrement documents throw-on-dynamic-markup-insertion counter.
// DEVIATION: These steps are unnecessary because there is no scripting in this
// implementation.
# 10. If element has an xmlns attribute *in the XMLNS namespace* whose value is
# not exactly the same as the elements namespace, that is a parse error.
# Similarly, if element has an xmlns:xlink attribute in the XMLNS namespace
# whose value is not the XLink namespace, that is a parse error.
$xmlns = $element->getAttributeNS(Parser::XMLNS_NAMESPACE, 'xmlns');
if ($xmlns !== '' && $xmlns !== $element->namespaceURI) {
$this->error(ParseError::UNEXPECTED_XMLNS_ATTRIBUTE_VALUE, $element->namespaceURI);
}
$xlink = $element->getAttributeNS(Parser::XMLNS_NAMESPACE, 'xlink');
if ($xlink !== '' && $xlink !== Parser::XLINK_NAMESPACE) {
$this->error(ParseError::UNEXPECTED_XMLNS_ATTRIBUTE_VALUE, Parser::XLINK_NAMESPACE);
}
# 11. If element is a resettable element, invoke its reset algorithm. (This
# initializes the elements value and checkedness based on the elements
# attributes.)
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# 12. If element is a form-associated element, and the form element pointer is
# not null, and there is no template element on the stack of open elements, and
# element is either not listed or doesnt have a form attribute, and the
# intended parent is in the same tree as the element pointed to by the form
# element pointer, associate element with the form element pointed to by the
# form element pointer, and suppress the running of the reset the form owner
# algorithm when the parser subsequently attempts to insert the element.
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# 13. Return element.
// Nope. Going straight into element insertion.
# When the steps below require the user agent to insert an HTML element for a
# token, the user agent must insert a foreign element for the token, in the HTML
# namespace.
# When the steps below require the user agent to insert a foreign element for a
# token in a given namespace, the user agent must run these steps:
// Doing both foreign and HTML elements here because the only difference between
// the two is that foreign elements are inserted with a namespace and HTML
// elements are not.
# 1. Let the adjusted insertion location be the appropriate place for inserting
# a node.
$location = static::$instance->appropriatePlaceForInsertingNode($intendedParent);
$adjustedInsertionLocation = $location['node'];
$insertBefore = $location['insert before'];
# 2. Let element be the result of creating an element for the token in the given
# namespace, with the intended parent being the element in which the adjusted
# insertion location finds itself.
// Element is supplied.
// Have that, too.
# 3. If it is possible to insert element at the adjusted insertion location,
# then:
# - 1. Push a new element queue onto the custom element reactions stack.
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# - 2. Insert element at the adjusted insertion location.
if ($insertBefore === false) {
$adjustedInsertionLocation->appendChild($element);
} else {
$adjustedInsertionLocation->parentNode->insertBefore($element, $adjustedInsertionLocation);
}
# - 3. Pop the element queue from the custom element reactions stack, and
# invoke custom element reactions in that queue.
// DEVIATION: Unnecessary because there is no scripting in this implementation.
# 4. Push element onto the stack of open elements so that it is the new current node.
static::$instance->stack[] = $element;
# Return element.
return $element;
}
protected function parseGenericText(StartTagToken $token, bool $RAWTEXT = true) {
# The generic raw text element parsing algorithm and the generic RCDATA element
# parsing algorithm consist of the following steps. These algorithms are always
# invoked in response to a start tag token.
# 1. Insert an HTML element for the token.
static::insertStartTagToken($token);
# 2. If the algorithm that was invoked is the generic raw text element parsing
# algorithm, switch the tokenizer to the RAWTEXT state; otherwise the algorithm
# invoked was the generic RCDATA element parsing algorithm, switch the tokenizer
# to the RCDATA state.
$this->tokenizer->state = ($RAWTEXT === true) ? Tokenizer::RAWTEXT_STATE : Tokenizer::RCDATA_STATE;
# 3. Let the original insertion mode be the current insertion mode.
$this->originalInsertionMode = $this->insertionMode;
# 4. Then, switch the insertion mode to "text".
$this->insertionMode = self::TEXT_MODE;
}
protected function parseGenericRawText(StartTagToken $token) {
$this->parseGenericText($token, true);
}
protected function parseGenericRCDATA(StartTagToken $token) {
$this->parseGenericText($token, false);
}
protected function resetInsertionMode() {
# When the steps below require the UA to reset the insertion mode appropriately,
# it means the UA must follow these steps:
# 1. Let last be false.
$last = false;
# 2. Let node be the last node in the stack of open elements.
$node = $this->stack->currentNode;
$nodeName = $this->stack->currentNodeName;
// Keeping up with the position, too.
$position = $this->stack->length - 1;
# 3. Loop: If node is the first node in the stack of open elements, then set
# last to true, and, if the parser was originally created as part of the HTML
# fragment parsing algorithm (fragment case), set node to the context element
# passed to that algorithm.
while (true) {
if ($node->isSameNode($this->stack[0])) {
$last = true;
if ($this->fragmentCase === true) {
$node = $this->fragmentContext;
}
}
# 4. If node is a select element, run these substeps:
if ($nodeName === 'select') {
# 1. If last is true, jump to the step below labeled Done.
if ($last === false) {
# 2. Let ancestor be node.
$ancestor = $node;
$position2 = $position;
# 3. Loop: If ancestor is the first node in the stack of open elements, jump to
# the step below labeled Done.
while (!$ancestor->isSameNode($this->stack[0])) {
# 4. Let ancestor be the node before ancestor in the stack of open elements.
$ancestor = $this->stack[--$position2];
# 5. If ancestor is a template node, jump to the step below labeled Done.
if ($ancestor->nodeName === 'template') {
break;
}
# 6. If ancestor is a table node, switch the insertion mode to "in select in
# table" and abort these steps.
if ($ancestor->nodeName === 'table') {
$this->insertionMode = self::IN_SELECT_IN_TABLE_MODE;
return;
}
# 7. Jump back to the step labeled Loop.
}
}
# 8. Done: Switch the insertion mode to "in select" and abort these steps.
$this->insertionMode = self::IN_SELECT_MODE;
}
# 5. If node is a td or th element and last is false, then switch the insertion
# mode to "in cell" and abort these steps.
elseif (($nodeName === 'td' || $nodeName === 'th') && $last === false) {
$this->insertionMode = self::IN_CELL_MODE;
return;
}
# 6. If node is a tr element, then switch the insertion mode to "in row" and
# abort these steps.
elseif ($nodeName === 'tr') {
$this->insertionMode = self::IN_ROW_MODE;
return;
}
# 7. If node is a tbody, thead, or tfoot element, then switch the insertion mode
# to "in table body" and abort these steps.
elseif ($nodeName === 'tbody' || $nodeName === 'thead' || $nodeName === 'tfoot') {
$this->insertionMode = self::IN_TABLE_BODY_MODE;
return;
}
# 8. If node is a caption element, then switch the insertion mode to "in
# caption" and abort these steps.
elseif ($nodeName === 'caption') {
$this->insertionMode = self::IN_CAPTION_MODE;
return;
}
# 9. If node is a colgroup element, then switch the insertion mode to "in column
# group" and abort these steps.
elseif ($nodeName === 'colgroup') {
$this->insertionMode = self::IN_COLUMN_GROUP_MODE;
return;
}
# 10. If node is a table element, then switch the insertion mode to "in table"
# and abort these steps.
elseif ($nodeName === 'table') {
$this->insertionMode = self::IN_TABLE_MODE;
return;
}
# 11. If node is a template element, then switch the insertion mode to the
# current template insertion mode and abort these steps.
elseif ($nodeName === 'template') {
$this->insertionMode = $this->templateInsertionModes->currentMode;
return;
}
# 12. If node is a head element and last is false, then switch the insertion
# mode to "in head" and abort these steps.
elseif ($nodeName === 'head' && $last === false) {
$this->insertionMode = self::IN_HEAD_MODE;
return;
}
# 13. If node is a body element, then switch the insertion mode to "in body" and
# abort these steps.
elseif ($nodeName === 'body') {
$this->insertionMode = self::IN_BODY_MODE;
return;
}
# 14. If node is a frameset element, then switch the insertion mode to "in
# frameset" and abort these steps. (fragment case)
elseif ($nodeName === 'frameset') {
$this->insertionMode = self::IN_FRAMESET_MODE;
return;
}
# 15. If node is an html element, run these substeps:
elseif ($nodeName === 'html') {
# 1. If the head element pointer is null, switch the insertion mode to "before
# head" and abort these steps. (fragment case)
if (is_null($this->headElement)) {
$this->insertionMode = self::BEFORE_HEAD_MODE;
return;
}
# 2. Otherwise, the head element pointer is not null, switch the insertion mode
# to "after head" and abort these steps.
$this->insertionMode = self::AFTER_HEAD_MODE;
return;
}
# 16. If last is true, then switch the insertion mode to "in body" and abort
# these steps. (fragment case)
if ($last === true) {
$this->insertionMode = self::IN_BODY_MODE;
}
# 17. Let node now be the node before node in the stack of open elements.
$node = $this->stack[--$position];
# 18. Return to the step labeled Loop.
}
}
protected function closePElement() {
# When the steps above say the UA is to close a p element, it means that the UA
# must run the following steps:
# 1. Generate implied end tags, except for p elements.
$this->stack->generateImpliedEndTags('p');
# 2. If the current node is not a p element, then this is a parse error.
$currentNodeName = $this->stack->currentNodeName;
if ($currentNodeName !== 'p') {
$this->error(ParseError::UNEXPECTED_END_TAG, $currentNodeName);
}
# 3. Pop elements from the stack of open elements until a p element has been
# popped from the stack.
$this->stack->popUntil('p');
}
protected function isElementSpecial(Element $element): bool {
$name = $element->nodeName;
$ns = $element->namespaceURI;
# The following elements have varying levels of special parsing rules: HTMLs
# address, applet, area, article, aside, base, basefont, bgsound, blockquote,
# body, br, button, caption, center, col, colgroup, dd, details, dir, div, dl,
# dt, embed, fieldset, figcaption, figure, footer, form, frame, frameset, h1,
# h2, h3, h4, h5, h6, head, header, hr, html, iframe, img, input, li, link,
# listing, main, marquee, meta, nav, noembed, noframes, noscript, object, ol, p,
# param, plaintext, pre, script, section, select, source, style, summary, table,
# tbody, td, template, textarea, tfoot, th, thead, title, tr, track, ul, wbr,
# xmp; MathML mi, MathML mo, MathML mn, MathML ms, MathML mtext, and MathML
# annotation-xml; and SVG foreignObject, SVG desc, and SVG title.
return (($ns === '' && ($name === 'address' || $name === 'applet' || $name === 'area' || $name === 'article' || $name === 'aside' || $name === 'base' || $name === 'basefont' || $name === 'bgsound' || $name === 'blockquote' || $name === 'body' || $name === 'br' || $name === 'button' || $name === 'caption' || $name === 'center' || $name === 'col' || $name === 'colgroup' || $name === 'dd' || $name === 'details' || $name === 'dir' || $name === 'div' || $name === 'dl' || $name === 'dt' || $name === 'embed' || $name === 'fieldset' || $name === 'figcaption' || $name === 'figure' || $name === 'footer' || $name === 'form' || $name === 'frame' || $name === 'frameset' || $name === 'h1' || $name === 'h2' || $name === 'h3' || $name === 'h4' || $name === 'h5' || $name === 'h6' || $name === 'head' || $name === 'header' || $name === 'hr' || $name === 'html' || $name === 'iframe' || $name === 'img' || $name === 'input' || $name === 'li' || $name === 'link' || $name === 'listing' || $name === 'main' || $name === 'marquee' || $name === 'meta' || $name === 'nav' || $name === 'noembed' || $name === 'noframes' || $name === 'noscript' || $name === 'object' || $name === 'ol' || $name === 'p' || $name === 'param' || $name === 'plaintext' || $name === 'pre' || $name === 'script' || $name === 'section' || $name === 'select' || $name === 'source' || $name === 'style' || $name === 'summary' || $name === 'table' || $name === 'tbody' || $name === 'td' || $name === 'template' || $name === 'textarea' || $name === 'tfoot' || $name === 'th' || $name === 'thead' || $name === 'title' || $name === 'tr' || $name === 'track' || $name === 'ul' || $name === 'wbr' || $name === 'xmp')) || ($ns === Parser::MATHML_NAMESPACE && ($name === 'mi' || $name === 'mo' || $name === 'mn' || $name === 'ms' || $name === 'mtext' || $name === 'annotation-xml')) || ($ns === Parser::SVG_NAMESPACE && ($name === 'foreignObject' || $name === 'desc' || $name === 'title')));
}
}