diff --git a/lib/DOM.php b/lib/DOM.php index 680ae57..caa0db6 100644 --- a/lib/DOM.php +++ b/lib/DOM.php @@ -3,6 +3,27 @@ declare(strict_types=1); namespace dW\HTML5; class DOM { + public $document = null; + public $implementation = null; + + // Instance used to pass around the implementation and the document. PHP's DOM + // cannot append a DOCTYPE to a DOMDocument, so the document must be created + // when the DOCTYPE is. This creates a problem where the Parser sometimes needs + // an implementation before the TreeBuilder is initiated, so one is always set. + public function __construct($document = null) { + $this->implementation = new \DOMImplementation(); + if (is_null($document)) { + return; + } + + if (!$document instanceof \DOMDocument) { + throw new Exception(Exception::DOM_DOMDOCUMENT_EXPECTED, gettype($document)); + } + + $this->document = $document; + } + + public static function getAncestor(mixed $needle, \DOMElement $context): \DOMElement { return static::ancestor($needle, $context, true); } diff --git a/lib/Exception.php b/lib/Exception.php index 5ec69c9..711b558 100644 --- a/lib/Exception.php +++ b/lib/Exception.php @@ -20,7 +20,8 @@ class Exception extends \Exception { const DATASTREAM_NODATA = 10401; const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10402; - const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10501; + const DOM_DOMDOCUMENT_EXPECTED = 10501; + const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10502; const TOKENIZER_INVALID_STATE = 10601; @@ -44,7 +45,8 @@ class Exception extends \Exception { 10401 => 'Data string expected; found %s', 10402 => '%s is an invalid data consumption length; a value of 1 or above is expected', - 10501 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s', + 10501 => 'The first argument must be an instance of \DOMDocument or null; found %s', + 10502 => 'The first argument must either be an instance of \DOMElement, a string, or a closure; found %s', 10601 => 'The Tokenizer has entered an 
invalid state', diff --git a/lib/ParseError.php b/lib/ParseError.php index 76240a1..8f99dab 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -26,8 +26,8 @@ class ParseError { 'Unexpected "%s" character; %s expected', '%s attribute already exists; discarding', 'Unexpected end-of-tag; %s expected', - 'Unexpected %s start tag; %s start tag expected', - 'Unexpected %s end tag; %s end tag expected', + 'Unexpected %s start tag; %s expected', + 'Unexpected %s end tag; %s expected', 'Unexpected DOCTYPE; %s expected', 'Invalid DOCTYPE', 'Invalid Control or Non-character; removing', diff --git a/lib/Parser.php b/lib/Parser.php index 66ac1d6..99b0d5d 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -65,10 +65,8 @@ class Parser { static::$instance = new $c; } - // Create the document if it doesn't already exist. Will be overwritten if there is a DOCTYPE. if (is_null(static::$instance->DOM)) { - $imp = new \DOMImplementation; - static::$instance->DOM = $imp->createDocument(); + static::$instance->DOM = new DOM(); } // Process the input stream. @@ -94,7 +92,7 @@ class Parser { } while (!$token instanceof EOFToken); // The Parser instance has no need to exist when finished. 
- $dom = static::$instance->DOM; + $dom = static::$instance->DOM->document; static::$instance->__destruct(); return DOM::fixIdAttributes($dom); @@ -106,13 +104,13 @@ class Parser { static::$instance = new $c; if (!is_null($context)) { - static::$instance->DOM = $context->ownerDocument; + static::$instance->DOM = new DOM($context->ownerDocument); } else { - $imp = new \DOMImplementation; - static::$instance->DOM = $imp->createDocument(); + static::$instance->DOM = new DOM(); + static::$instance->DOM->document = static::$instance->DOM->implementation->createDocument(); } - static::$instance->DOMFragment = static::$instance->DOM->createDocumentFragment(); + static::$instance->DOMFragment = static::$instance->DOM->document->createDocumentFragment(); // DEVIATION: The spec says to let the document be in quirks mode if the // DOMDocument is in quirks mode. Cannot check whether the context element is in diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index 06824f1..93ec950 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -73,7 +73,7 @@ class TreeBuilder { const QUIRKS_MODE_LIMITED = 2; - public function __construct(\DOMDocument $dom, $formElement, bool $fragmentCase = false, $fragmentContext = null, Stack $stack, Tokenizer $tokenizer) { + public function __construct(DOM $dom, $formElement, bool $fragmentCase = false, $fragmentContext = null, Stack $stack, Tokenizer $tokenizer) { // If the form element isn't an instance of DOMElement that has a node name of // "form" or null then there's a problem. if (!is_null($formElement) && !($formElement instanceof DOMElement && $formElement->nodeName === 'form')) { @@ -173,6 +173,7 @@ class TreeBuilder { } # TEMPORARY + echo "\n"; var_export($token); echo "\n\n"; @@ -278,10 +279,9 @@ class TreeBuilder { # it is returned as the value of the doctype attribute of the Document object. // PHP's DOM cannot just append a DOCTYPE node to the document, so a document is // created with the specified DOCTYPE instead. 
- $imp = new \DOMImplementation(); // DEVIATION: PHP's DOMImplementation::createDocumentType() method cannot accept // an empty name, so if it is missing it is replaced with 'html' instead. - $this->DOM = $imp->createDocument('', '', $imp->createDocumentType((!is_null($token->name)) ? $token->name : 'html', $token->public, $token->system)); + $this->DOM->document = $this->DOM->implementation->createDocument('', '', $this->DOM->implementation->createDocumentType((!is_null($token->name)) ? $token->name : 'html', $token->public, $token->system)); $public = strtolower($token->public); @@ -386,6 +386,17 @@ class TreeBuilder { # set the Document to quirks mode. // DEVIATION: There is no iframe srcdoc document because there are no nested // browsing contexts in this implementation. + switch (true) { + case $token instanceof StartTagToken: $errorType = ParseError::UNEXPECTED_START_TAG; + break; + case $token instanceof EndTagToken: $errorType = ParseError::UNEXPECTED_END_TAG; + break; + case $token instanceof EOFToken: $errorType = ParseError::UNEXPECTED_EOF; + break; + default: throw new Exception(Exception::UNKNOWN_ERROR); + } + + ParseError::trigger($errorType, 'doctype'); $this->quirksMode = self::QUIRKS_MODE_ON; # In any case, switch the insertion mode to "before html", then reprocess the @@ -400,7 +411,7 @@ class TreeBuilder { case self::BEFORE_HTML_MODE: # A DOCTYPE token if ($token instanceof DOCTYPEToken) { - ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, ''); + ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'html start tag, comment'); } # A comment token elseif ($token instanceof CommentToken) { @@ -419,7 +430,7 @@ class TreeBuilder { # Create an element for the token in the HTML namespace, with the Document as # the intended parent. Append it to the Document object. Put this element in the # stack of open elements. 
- $element = static::insertStartTagToken($token, $this->DOM); + $element = static::insertStartTagToken($token, $this->DOM->document); # Switch the insertion mode to "before head". $this->insertionMode = self::BEFORE_HEAD_MODE; @@ -427,15 +438,15 @@ class TreeBuilder { # Any other end tag elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name !== 'br') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br'); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br end tag'); } # An end tag whose tag name is one of: "head", "body", "html", "br" # Anything else else { # Create an html element whose node document is the Document object. Append it # to the Document object. Put this element in the stack of open elements. - $element = $this->DOM->createElement('html'); - $this->DOM->appendChild($element); + $element = $this->DOM->document->createElement('html'); + $this->DOM->document->appendChild($element); $this->stack[] = $element; # Switch the insertion mode to "before head", then reprocess the token. @@ -487,7 +498,7 @@ class TreeBuilder { # Any other end tag elseif ($token instanceof EndTagToken && $token->name !== 'head' && $token->name !== 'body' && $token->name !== 'html' && $token->name === 'br') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br'); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'head, body, html, br end tag'); } # An end tag whose tag name is one of: "head", "body", "html", "br" # Anything else @@ -635,7 +646,7 @@ class TreeBuilder { # A start tag whose tag name is "head" elseif ($token->name === 'head') { # Parse error. 
- ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'base, basefont, bgsound, link, meta, title, noframes, style, noscript, script, template'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'base, basefont, bgsound, link, meta, title, noframes, style, noscript, script, template start tag'); } # Anything else else { @@ -678,7 +689,7 @@ class TreeBuilder { # If there is no template element on the stack of open elements, then this is a # parse error; ignore the token. if ($this->stack->search('template') === -1) { - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack.' end tag'); } # Otherwise, run these steps: else { @@ -687,7 +698,7 @@ class TreeBuilder { # 2. If the current node is not a template element, then this is a parse error. if ($this->stack->currentNodeName !== 'template') { - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'template', (string)$this->stack.' end tag'); } # 3. Pop elements from the stack of open elements until a template element has been popped from the stack. @@ -708,7 +719,7 @@ class TreeBuilder { # Any other end tag else { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, (string)$this->stack.' end tag'); } } # Anything else @@ -748,14 +759,14 @@ class TreeBuilder { # A start tag whose tag name is one of: "head", "noscript" elseif ($token->name === 'head' || $token->name === 'noscript') { # Parse error. 
- ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'basefont, bgsound, link, meta, noframes, style'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'basefont, bgsound, link, meta, noframes, style start tag'); } # Anything else else { # Act as described in the "anything else" entry below. # # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'basefont, bgsound, link, meta, noframes, style'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'basefont, bgsound, link, meta, noframes, style start tag'); # Pop the current node (which will be a noscript element) from the stack of open # elements; the new current node will be a head element. $this->stack->pop(); @@ -780,7 +791,7 @@ class TreeBuilder { # Act as described in the "anything else" entry below. # # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'br', (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'br', (string)$this->stack.' end tag'); # Pop the current node (which will be a noscript element) from the stack of open # elements; the new current node will be a head element. $this->stack->pop(); @@ -793,7 +804,7 @@ class TreeBuilder { # Any other end tag else { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'br', (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, (string)$this->stack.' end tag'); } } # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED @@ -808,7 +819,7 @@ class TreeBuilder { # Anything else else { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'br', (string)$this->stack); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'br', (string)$this->stack.' end tag'); # Pop the current node (which will be a noscript element) from the stack of open # elements; the new current node will be a head element. 
$this->stack->pop(); @@ -865,7 +876,7 @@ class TreeBuilder { # "meta", "noframes", "script", "style", "template", "title" elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'body, frameset'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'body, frameset start tag'); # Push the node pointed to by the head element pointer onto the stack of open elements. $this->stack[] = $this->headElement; # Process the token using the rules for the "in head" insertion mode. @@ -881,7 +892,7 @@ class TreeBuilder { # A start tag whose tag name is "head" elseif ($token->name === 'head') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'body, frameset'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'body, frameset start tag'); } # Any other start tag else { @@ -918,7 +929,7 @@ class TreeBuilder { # Any other end tag else { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'head', 'body, frameset'); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'body, frameset end tag'); } } # Anything else @@ -967,7 +978,7 @@ class TreeBuilder { # A start tag whose tag name is "html" if ($token->name === 'html') { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'html', 'any body content'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'html', 'any body content start tag'); # If there is a template element on the stack of open elements, then ignore the # token. if ($this->stack->search('template') === -1) { @@ -1059,7 +1070,7 @@ class TreeBuilder { ) ) { # Parse error. 
- ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML start tag'); # If the parser was originally created for the HTML fragment parsing algorithm, # then act as described in the "any other start tag" entry below. (fragment @@ -1381,7 +1392,7 @@ class TreeBuilder { # 2. If node is not an element with the same tag name as the token, then this is # a parse error. if ($nodeName !== $token->name) { - ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, $nodeName); + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, "$nodeName end tag"); } # 3. Loop: If node's tag name, converted to ASCII lowercase, is the same as the # tag name of the token, pop elements from the stack of open elements until node @@ -1595,7 +1606,7 @@ class TreeBuilder { # 1. Let document be intended parent’s node document. // DEVIATION: Unnecessary because there aren't any nested contexts to consider. - // The document will always be $this->DOM. + // The document will always be $this->DOM->document. # 2. Let local name be the tag name of the token. // Nope. Don't need it because when creating elements with @@ -1610,9 +1621,9 @@ class TreeBuilder { // DEVIATION: There is no point to setting the synchronous custom elements flag // and custom element definition; there is no scripting in this implementation. if ($namespace === Parser::HTML_NAMESPACE) { - $element = static::$instance->DOM->createElement($token->name); + $element = static::$instance->DOM->document->createElement($token->name); } else { - $element = static::$instance->DOM->createElementNS($namespace, $token->name); + $element = static::$instance->DOM->document->createElementNS($namespace, $token->name); } # 8. Append each attribute in the given token to element. @@ -1677,6 +1688,7 @@ class TreeBuilder { # 1. Let the adjusted insertion location be the appropriate place for inserting # a node. 
$location = static::$instance->appropriatePlaceForInsertingNode($intendedParent); + $adjustedInsertionLocation = $location['node']; $insertBefore = $location['insert before'];