diff --git a/lib/ActiveFormattingElementsList.php b/lib/ActiveFormattingElementsList.php index a7e9345..7b156fd 100644 --- a/lib/ActiveFormattingElementsList.php +++ b/lib/ActiveFormattingElementsList.php @@ -158,7 +158,7 @@ class ActiveFormattingElementsList implements \ArrayAccess { # 6. If entry is neither a marker nor an element that is also in the stack of # open elements, go to the step labeled Rewind. - if (!$entry instanceof ActiveFormattingElementMarker && !in_array($entry['element'], $this->stack)) { + if (!$entry instanceof ActiveFormattingElementMarker && $this->stack->search($entry['element']) === -1) { goto rewind; } diff --git a/lib/ParseError.php b/lib/ParseError.php index 90b63bc..76240a1 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -9,7 +9,7 @@ class ParseError { const UNEXPECTED_EOF = 1; const UNEXPECTED_CHARACTER = 2; const ATTRIBUTE_EXISTS = 3; - const UNEXPECTED_TAG_END = 4; + const UNEXPECTED_END_OF_TAG = 4; const UNEXPECTED_START_TAG = 5; const UNEXPECTED_END_TAG = 6; const UNEXPECTED_DOCTYPE = 7; @@ -25,9 +25,9 @@ class ParseError { 'Unexpected end-of-file; %s expected', 'Unexpected "%s" character; %s expected', '%s attribute already exists; discarding', - 'Unexpected tag end; %s expected', - 'Unexpected %s start tag; %s expected', - 'Unexpected %s end tag; %s expected', + 'Unexpected end-of-tag; %s expected', + 'Unexpected %s start tag; %s start tag expected', + 'Unexpected %s end tag; %s end tag expected', 'Unexpected DOCTYPE; %s expected', 'Invalid DOCTYPE', 'Invalid Control or Non-character; removing', diff --git a/lib/Stack.php b/lib/Stack.php index 26b14fc..0d6e818 100644 --- a/lib/Stack.php +++ b/lib/Stack.php @@ -16,7 +16,7 @@ class Stack implements \ArrayAccess { (is_null($fragmentContext) && $fragmentCase)) { throw new Exception(Exception::STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED, gettype($fragmentContext)); } - + $this->fragmentCase = $fragmentCase; $this->fragmentContext = 
$fragmentContext; } @@ -59,7 +59,7 @@ class Stack implements \ArrayAccess { public function search(mixed $needle): int { if (!$needle) { - return false; + return -1; } if ($needle instanceof DOMElement) { @@ -76,7 +76,15 @@ class Stack implements \ArrayAccess { } } - return false; + return -1; + } + + public function generateImpliedEndTags() { + // Pop the current node for as long as it is an element with an implied end tag; stop once the stack is empty. + $implied = ['caption', 'colgroup', 'dd', 'dt', 'li', 'optgroup', 'option', 'p', 'rb', 'rp', 'rt', 'rtc', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']; + while (($current = end($this->_storage)) !== false && in_array($current->nodeName, $implied, true)) { + $this->pop(); + } } public function __get($property) { @@ -107,4 +115,18 @@ class Stack implements \ArrayAccess { default: return null; } } + + // Used when listing expected elements when returning parse errors + public function __toString(): string { + if (count($this->_storage) > 1) { + // Don't output the name of the root element. + for ($i = 1, $temp = []; $i < count($this->_storage) - 1; $i++) { + $temp[] = $this->_storage[$i]->nodeName; + } + + return implode(', ', array_unique($temp)); + } else { + return ''; + } + } } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index a201c1e..4715215 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -1721,7 +1721,7 @@ class Tokenizer { # ">" (U+003E) elseif ($char === '>') { # Parse error. Switch to the data state. Emit the current tag token. 
- ParseError::trigger(ParseError::UNEXPECTED_TAG_END, 'attribute value'); + ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG, 'attribute value'); $this->state = self::DATA_STATE; // Need to add the current attribute name and value to the token if necessary. diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index ea366f3..4433da6 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -15,6 +15,10 @@ class TreeBuilder { // Flag for determining whether to use the foster parenting (badly nested table // elements) algorithm. protected $fosterParenting = false; + // Flag that shows whether the content that's being parsed is a fragment or not + protected $fragmentCase; + // Context element for fragments + protected $fragmentContext; // Flag used to determine whether elements are okay to be used in framesets or not protected $framesetOk = true; // Once a head element has been parsed (whether implicitly or explicitly) the head @@ -333,7 +337,7 @@ class TreeBuilder { # In any case, switch the insertion mode to "before html", then reprocess the # token. $this->insertionMode = self::BEFORE_HTML_MODE; - return false; + continue; } break; @@ -369,7 +373,7 @@ class TreeBuilder { # Any other end tag elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name !== 'br') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, or br tag'); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br'); } # An end tag whose tag name is one of: "head", "body", "html", "br" # Anything else @@ -382,7 +386,7 @@ class TreeBuilder { # Switch the insertion mode to "before head", then reprocess the token. 
$this->insertionMode = self::BEFORE_HEAD_MODE; - return false; + continue; } # The document element can end up being removed from the Document object, e.g., @@ -429,7 +433,7 @@ class TreeBuilder { # Any other end tag - elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name === 'br') { + elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name !== 'br') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, or br tag'); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br'); } # An end tag whose tag name is one of: "head", "body", "html", "br" # Anything else @@ -443,7 +447,7 @@ class TreeBuilder { $this->insertionMode = self::IN_HEAD_MODE; # Reprocess the current token. - return false; + continue; } break; @@ -583,6 +587,50 @@ class TreeBuilder { # Switch the insertion mode to "after head". $this->insertionMode = self::AFTER_HEAD_MODE; } + # An end tag whose tag name is one of: "body", "html", "br" + elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') { + # Act as described in the "anything else" entry below. + # + # Pop the current node (which will be the head element) off the stack of open + # elements. + $this->stack->pop(); + # Switch the insertion mode to "after head". + $this->insertionMode = self::AFTER_HEAD_MODE; + # Reprocess the token. + continue; + } + # An end tag whose tag name is "template" + elseif ($token->name === 'template') { + # If there is no template element on the stack of open elements, then this is a + # parse error; ignore the token. + if ($this->stack->search('template') === -1) { + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack); + } + # Otherwise, run these steps: + else { + # 1. Generate all implied end tags thoroughly. + $this->stack->generateImpliedEndTags(); + + # 2. If the current node is not a template element, then this is a parse error. 
+ if ($this->stack->currentNodeName !== 'template') { + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack); + } + + # 3. Pop elements from the stack of open elements until a template element has been popped from the stack. + do { + $poppedNodeName = $this->stack->pop()->nodeName; + } while ($poppedNodeName !== 'template'); + + # 4. Clear the list of active formatting elements up to the last marker. + $this->activeFormattingElementsList->clearToTheLastMarker(); + + # 5. Pop the current template insertion mode off the stack of template insertion modes. + // DEVIATION: No scripting. + + # 6. Reset the insertion mode appropriately. + $this->resetInsertionMode(); + } + } // ¡STOPPED HERE! } break; @@ -666,7 +714,7 @@ class TreeBuilder { ); # Then, reprocess the token. - return false; + continue; } # Any other start tag else { @@ -1019,18 +1067,22 @@ class TreeBuilder { # 1. Let last template be the last template element in the stack of open # elements, if any. $lastTemplateKey = $this->stack->search('template'); - $lastTemplate = $this->stack[$lastTemplateKey]; + $lastTemplate = ($lastTemplateKey !== -1 ) ? $this->stack[$lastTemplateKey] : null; # 2. Let last table be the last table element in the stack of open elements, if # any. $lastTableKey = $this->stack->search('table'); - $lastTable = $this->stack[$lastTableKey]; + $lastTable = ($lastTableKey !== -1 ) ? $this->stack[$lastTableKey] : null; # 3. If there is a last template and either there is no last table, or there is # one, but last template is lower (more recently added) than last table in the # stack of open elements, then: let adjusted insertion location be inside last # template’s template contents, after its last child (if any), and abort these # substeps. 
+ // DEVIATION: PHP's DOM does not have a special element for template and + // therefore no API for putting the template's contents into a + // DOMDocumentFragment in a property of the element, so the contents are just + // going to be children of the template element instead. if ($lastTemplate && (!$lastTable || $lastTable && $lastTemplateKey > $lastTableKey)) { $insertionLocation = $lastTemplate; // Abort! @@ -1071,9 +1123,11 @@ class TreeBuilder { # 3. If the adjusted insertion location is inside a template element, let it # instead be inside the template element’s template contents, after its last # child (if any). - if ($insertionLocation->nodeName === 'template') { - $insertionLocation = $insertionLocation->contents; - } + // DEVIATION: PHP's DOM does not have a special element for template and + // therefore no API for putting the template's contents into a + // DOMDocumentFragment in a property of the element, so the contents are just + // going to be children of the template element instead, so there's nothing to + // do. # 4. Return the adjusted insertion location. return [ @@ -1307,4 +1361,154 @@ class TreeBuilder { protected function parseGenericRCDATA(StartTagToken $token) { $this->parseGenericText($token, false); } + + protected function resetInsertionMode() { + # When the steps below require the UA to reset the insertion mode appropriately, + # it means the UA must follow these steps: + + # 1. Let last be false. + $last = false; + + # 2. Let node be the last node in the stack of open elements. + $node = $this->stack->currentNode; + // Keeping up with the position, too. + $position = $this->stack->length - 1; + + # 3. Loop: If node is the first node in the stack of open elements, then set + # last to true, and, if the parser was originally created as part of the HTML + # fragment parsing algorithm (fragment case), set node to the context element + # passed to that algorithm. 
+ while (true) { + if ($node->isSameNode($this->stack[0])) { + $last = true; + + if ($this->fragmentCase === true) { + $node = $this->fragmentContext; + } + } + // Re-read the name on every pass: node changes in step 17 and in the fragment case above. + $nodeName = $node->nodeName; + + # 4. If node is a select element, run these substeps: + if ($nodeName === 'select') { + # 1. If last is true, jump to the step below labeled Done. + if ($last === false) { + # 2. Let ancestor be node. + $ancestor = $node; + $position2 = $position; + + # 3. Loop: If ancestor is the first node in the stack of open elements, jump to + # the step below labeled Done. + while (!$ancestor->isSameNode($this->stack[0])) { + # 4. Let ancestor be the node before ancestor in the stack of open elements. + $ancestor = $this->stack[--$position2]; + + # 5. If ancestor is a template node, jump to the step below labeled Done. + if ($ancestor->nodeName === 'template') { + break; + } + + # 6. If ancestor is a table node, switch the insertion mode to "in select in + # table" and abort these steps. + if ($ancestor->nodeName === 'table') { + $this->insertionMode = self::IN_SELECT_IN_TABLE_MODE; + return; + } + + # 7. Jump back to the step labeled Loop. + } + } + + # 8. Done: Switch the insertion mode to "in select" and abort these steps. + $this->insertionMode = self::IN_SELECT_MODE; + return; + } + # 5. If node is a td or th element and last is false, then switch the insertion + # mode to "in cell" and abort these steps. + elseif (($nodeName === 'td' || $nodeName === 'th') && $last === false) { + $this->insertionMode = self::IN_CELL_MODE; + return; + } + # 6. If node is a tr element, then switch the insertion mode to "in row" and + # abort these steps. + elseif ($nodeName === 'tr') { + $this->insertionMode = self::IN_ROW_MODE; + return; + } + # 7. If node is a tbody, thead, or tfoot element, then switch the insertion mode + # to "in table body" and abort these steps. + elseif ($nodeName === 'tbody' || $nodeName === 'thead' || $nodeName === 'tfoot') { + $this->insertionMode = self::IN_TABLE_BODY_MODE; + return; + } + # 8. 
If node is a caption element, then switch the insertion mode to "in + # caption" and abort these steps. + elseif ($nodeName === 'caption') { + $this->insertionMode = self::IN_CAPTION_MODE; + return; + } + # 9. If node is a colgroup element, then switch the insertion mode to "in column + # group" and abort these steps. + elseif ($nodeName === 'colgroup') { + $this->insertionMode = self::IN_COLUMN_GROUP_MODE; + return; + } + # 10. If node is a table element, then switch the insertion mode to "in table" + # and abort these steps. + elseif ($nodeName === 'table') { + $this->insertionMode = self::IN_TABLE_MODE; + return; + } + # 11. If node is a template element, then switch the insertion mode to the + # current template insertion mode and abort these steps. + elseif ($nodeName === 'template') { + // FIXME: NOT SURE WHAT TO DO HERE YET. + return; + } + # 12. If node is a head element and last is false, then switch the insertion + # mode to "in head" and abort these steps. + elseif ($nodeName === 'head' && $last === false) { + $this->insertionMode = self::IN_HEAD_MODE; + return; + } + # 13. If node is a body element, then switch the insertion mode to "in body" and + # abort these steps. + elseif ($nodeName === 'body') { + $this->insertionMode = self::IN_BODY_MODE; + return; + } + # 14. If node is a frameset element, then switch the insertion mode to "in + # frameset" and abort these steps. (fragment case) + elseif ($nodeName === 'frameset') { + $this->insertionMode = self::IN_FRAMESET_MODE; + return; + } + # 15. If node is an html element, run these substeps: + elseif ($nodeName === 'html') { + # 1. If the head element pointer is null, switch the insertion mode to "before + # head" and abort these steps. (fragment case) + if (is_null($this->headElement)) { + $this->insertionMode = self::BEFORE_HEAD_MODE; + return; + } + + # 2. Otherwise, the head element pointer is not null, switch the insertion mode + # to "after head" and abort these steps. 
+ $this->insertionMode = self::AFTER_HEAD_MODE; + return; + } + + # 16. If last is true, then switch the insertion mode to "in body" and abort + # these steps. (fragment case) + if ($last === true) { + $this->insertionMode = self::IN_BODY_MODE; + return; + } + + # 17. Let node now be the node before node in the stack of open elements. + $node = $this->stack[--$position]; + + # 18. Return to the step labeled Loop. + } + } } \ No newline at end of file