diff --git a/lib/OpenElementsStack.php b/lib/OpenElementsStack.php index 087af87..690ef13 100644 --- a/lib/OpenElementsStack.php +++ b/lib/OpenElementsStack.php @@ -78,15 +78,9 @@ class OpenElementsStack extends Stack { $this->_storage = array_values($this->_storage); } - public function generateImpliedEndTags($exclude = []) { + public function generateImpliedEndTags(array $exclude = []) { $tags = ['caption', 'colgroup', 'dd', 'dt', 'li', 'optgroup', 'option', 'p', 'rb', 'rp', 'rt', 'rtc', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']; - if (is_string($exclude)) { - $exclude = [$exclude]; - } - - assert(is_array($exclude), new Exception(Exception::STACK_STRING_ARRAY_EXPECTED)); - if (count($exclude) > 0) { $modified = false; foreach ($exclude as $e) { diff --git a/lib/ParseError.php b/lib/ParseError.php index da8b18b..6110855 100644 --- a/lib/ParseError.php +++ b/lib/ParseError.php @@ -5,6 +5,7 @@ namespace dW\HTML5; class ParseError { protected $data; + // tokenization parse errors; these have been standardized const ENCODING_ERROR = 100; const UNEXPECTED_NULL_CHARACTER = 101; const UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME = 102; @@ -53,9 +54,19 @@ class ParseError { const CONTROL_CHARACTER_REFERENCE = 145; const SURROGATE_IN_INPUT_STREAM = 146; const NONCHARACTER_IN_INPUT_STREAM = 147; - const CONTROL_CHARACTER_IN_INPUT_STREAM = 148; + const CONTROL_CHARACTER_IN_INPUT_STREAM = 148; + // tree construction parse errors; these have not been standardized, but html5lib's error names are likely to become standard in future + const EXPECTED_DOCTYPE_BUT_GOT_START_TAG = 200; + const EXPECTED_DOCTYPE_BUT_GOT_END_TAG = 201; + const EXPECTED_DOCTYPE_BUT_GOT_CHARS = 202; + const UNEXPECTED_END_TAG = 203; // html5lib also uses 'adoption-agency-1.2' and 'adoption-agency-1.3' for this const MESSAGES = [ + self::EXPECTED_DOCTYPE_BUT_GOT_START_TAG => 'Expected DOCTYPE but got start tag', + self::EXPECTED_DOCTYPE_BUT_GOT_END_TAG => 'Expected DOCTYPE but got end tag', + self::EXPECTED_DOCTYPE_BUT_GOT_CHARS => 'Expected DOCTYPE but got characters', + self::UNEXPECTED_END_TAG => 'Unexpected end tag', + self::ENCODING_ERROR => 'Corrupt encoding near byte position %s', self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character', self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 'Unexpected "?" character instead of tag name', @@ -108,55 +119,26 @@ class ParseError { ]; const REPORT_OFFSETS = [ - self::ENCODING_ERROR => 0, self::UNEXPECTED_NULL_CHARACTER => -1, - self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 0, - self::EOF_BEFORE_TAG_NAME => 0, - self::INVALID_FIRST_CHARACTER_OF_TAG_NAME => 0, self::MISSING_END_TAG_NAME => -1, - self::EOF_IN_TAG => 0, - self::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT => 0, self::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME => -1, self::DUPLICATE_ATTRIBUTE => -1, self::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME => -1, self::MISSING_ATTRIBUTE_VALUE => -1, self::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE => -1, - self::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES => 0, - self::UNEXPECTED_SOLIDUS_IN_TAG => 0, self::CDATA_IN_HTML_CONTENT => -1, - self::INCORRECTLY_OPENED_COMMENT => 0, self::ABRUPT_CLOSING_OF_EMPTY_COMMENT => -1, - self::EOF_IN_COMMENT => 0, - self::NESTED_COMMENT => 0, self::INCORRECTLY_CLOSED_COMMENT => -1, - self::EOF_IN_DOCTYPE => 0, - self::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME => 0, self::MISSING_DOCTYPE_NAME => -1, - self::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME => 0, self::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD => -1, self::MISSING_DOCTYPE_PUBLIC_IDENTIFIER => -1, - self::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER => 0, self::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER => -1, self::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS => -1, self::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD => -1, self::MISSING_DOCTYPE_SYSTEM_IDENTIFIER => -1, - self::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER => 0, self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => -1, - self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 0, - self::EOF_IN_CDATA => 0, self::END_TAG_WITH_ATTRIBUTES => -1, self::END_TAG_WITH_TRAILING_SOLIDUS => -1, - self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 0, - self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 0, - self::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE => 0, - self::NULL_CHARACTER_REFERENCE => 0, - self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 0, - self::SURROGATE_CHARACTER_REFERENCE => 0, - self::NONCHARACTER_CHARACTER_REFERENCE => 0, - self::CONTROL_CHARACTER_REFERENCE => 0, - self::SURROGATE_IN_INPUT_STREAM => 0, - self::NONCHARACTER_IN_INPUT_STREAM => 0, - self::CONTROL_CHARACTER_IN_INPUT_STREAM => 0, ]; public function setHandler() { diff --git a/lib/ParseErrorEmitter.php b/lib/ParseErrorEmitter.php index 42ace03..00e3ccd 100644 --- a/lib/ParseErrorEmitter.php +++ b/lib/ParseErrorEmitter.php @@ -10,7 +10,7 @@ trait ParseErrorEmitter { $data = ($this instanceof Data) ? $this : ($this->data ?? null); assert($data instanceof Data); assert($this->errorHandler instanceof ParseError); - list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code]); + list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code] ?? 0); return $this->errorHandler->emit($data->filePath, $line, $column, $code, ...$arg); } } diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index 82f937a..d38757f 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -109,12 +109,6 @@ class TreeBuilder { $this->insertionMode = self::INITIAL_MODE; $this->quirksMode = self::QUIRKS_MODE_OFF; - - static::$instance = $this; - } - - public function __destruct() { - static::$instance = null; } public function emitToken(Token $token) { @@ -388,14 +382,16 @@ class TreeBuilder { # set the Document to quirks mode. // DEVIATION: There is no iframe srcdoc document because there are no nested // browsing contexts in this implementation. - switch (get_class($token)) { - case 'StartTagToken': $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); - break; - case 'EndTagToken': $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); - break; - case 'EOFToken': $this->error(ParseError::UNEXPECTED_EOF); - break; - default: throw new Exception(Exception::UNKNOWN_ERROR); + if ($token instanceof StartTagToken) { + $this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_START_TAG); + } elseif ($token instanceof EndTagToken) { + $this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_END_TAG); + } elseif ($token instanceof CharacterToken) { + $this->error(ParseError::EXPECTED_DOCTYPE_BUT_GOT_CHARS); + } elseif ($token instanceof EOFToken) { + $this->error(ParseError::UNEXPECTED_EOF); + } else { + throw new \Exception("Unexpected token type".get_class($token)); } $this->quirksMode = self::QUIRKS_MODE_ON; @@ -431,7 +427,7 @@ class TreeBuilder { # Create an element for the token in the HTML namespace, with the Document as # the intended parent. Append it to the Document object. Put this element in the # stack of open elements. - $element = static::insertStartTagToken($token, $this->DOM); + $element = $this->insertStartTagToken($token, $this->DOM); # Switch the insertion mode to "before head". $this->insertionMode = self::BEFORE_HEAD_MODE; @@ -490,7 +486,7 @@ class TreeBuilder { # A start tag whose tag name is "head" elseif ($token->name === 'head') { # Insert an HTML element for the token. - $element = static::insertStartTagToken($token); + $element = $this->insertStartTagToken($token); # Set the head element pointer to the newly created head element. $this->headElement = $element; @@ -507,7 +503,7 @@ class TreeBuilder { # Anything else else { # Insert an HTML element for a "head" start tag token with no attributes. - $element = static::insertStartTagToken(new StartTagToken('head')); + $element = $this->insertStartTagToken(new StartTagToken('head')); # Set the head element pointer to the newly created head element. $this->headElement = $element; @@ -551,7 +547,7 @@ class TreeBuilder { elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link') { # Insert an HTML element for the token. Immediately pop the current node off the # stack of open elements. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); $this->stack->pop(); # Acknowledge the token’s *self-closing flag*, if it is set. @@ -561,7 +557,7 @@ class TreeBuilder { elseif ($token->name === 'meta') { # Insert an HTML element for the token. Immediately pop the current node off the # stack of open elements. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); $this->stack->pop(); # Acknowledge the token’s *self-closing flag*, if it is set. @@ -597,7 +593,7 @@ class TreeBuilder { // flag is always disabled. elseif ($token->name === 'noscript') { # Insert an HTML element for the token. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); # Switch the insertion mode to "in head noscript". $this->insertionMode = self::IN_HEAD_NOSCRIPT_MODE; } @@ -615,7 +611,7 @@ class TreeBuilder { // intended parent isn't used when determining anything; // Parser::createAndInsertElement will get the adjusted insertion location // anyway. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); # 3. Mark the element as being "parser-inserted" and unset the element’s # "non-blocking" flag. @@ -637,7 +633,7 @@ class TreeBuilder { # A start tag whose tag name is "template" elseif ($token->name === 'template') { # Insert an HTML element for the token. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); # Insert a marker at the end of the list of active formatting elements. $this->activeFormattingElementsList->insertMarker(); # Set the frameset-ok flag to "not ok". @@ -703,7 +699,7 @@ class TreeBuilder { # 2. If the current node is not a template element, then this is a parse error. if ($this->stack->currentNodeName !== 'template') { - $this->error(ParseError::UNEXPECTED_END_TAG, 'template'); + $this->error(ParseError::UNEXPECTED_END_TAG); } # 3. Pop elements from the stack of open elements until a template element has been popped from the stack. @@ -722,7 +718,7 @@ class TreeBuilder { # Any other end tag else { # Parse error. - $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); + $this->error(ParseError::UNEXPECTED_END_TAG); } } # Anything else @@ -1326,7 +1322,7 @@ class TreeBuilder { } # Switch the insertion mode to "after body". - self::$insertionMode = self::AFTER_BODY_MODE; + $this->insertionMode = self::AFTER_BODY_MODE; // The only thing different between body and html here is that when processing // an html end tag the token is reprocessed. @@ -1789,7 +1785,7 @@ class TreeBuilder { # Insert a foreign element for the token, in the same namespace as the adjusted # current node. - static::insertStartTagToken($token, null, $this->stack->adjustedCurrentNode->namespaceURI); + $this->insertStartTagToken($token, null, $this->stack->adjustedCurrentNode->namespaceURI); # If the token has its self-closing flag set, then run the appropriate steps # from the following list: @@ -1944,7 +1940,7 @@ class TreeBuilder { ]; } - public static function insertCharacterToken(CharacterToken $token) { + public function insertCharacterToken(CharacterToken $token) { # 1. Let data be the characters passed to the algorithm, or, if no characters # were explicitly specified, the character of the character token being # processed. @@ -1952,7 +1948,7 @@ class TreeBuilder { # 2. Let the adjusted insertion location be the appropriate place for inserting # a node. - $location = static::$instance->appropriatePlaceForInsertingNode(); + $location = $this->appropriatePlaceForInsertingNode(); $adjustedInsertionLocation = $location['node']; $insertBefore = $location['insert before']; @@ -1998,7 +1994,7 @@ class TreeBuilder { $adjustedInsertionLocation = $position; $insertBefore = false; } else { - $location = static::$instance->appropriatePlaceForInsertingNode(); + $location = $this->appropriatePlaceForInsertingNode(); $adjustedInsertionLocation = $location['node']; $insertBefore = $location['insert before']; } @@ -2016,7 +2012,7 @@ class TreeBuilder { } } - public static function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element { + public function insertStartTagToken(StartTagToken $token, \DOMNode $intendedParent = null, string $namespace = null): Element { if (!is_null($namespace)) { $namespace = $token->namespace; } @@ -2042,9 +2038,9 @@ class TreeBuilder { // DEVIATION: There is no point to setting the synchronous custom elements flag // and custom element definition; there is no scripting in this implementation. if ($namespace === Parser::HTML_NAMESPACE) { - $element = static::$instance->DOM->createElement($token->name); + $element = $this->DOM->createElement($token->name); } else { - $element = static::$instance->DOM->createElementNS($namespace, $token->name); + $element = $this->DOM->createElementNS($namespace, $token->name); } # 8. Append each attribute in the given token to element. @@ -2108,7 +2104,7 @@ class TreeBuilder { # 1. Let the adjusted insertion location be the appropriate place for inserting # a node. - $location = static::$instance->appropriatePlaceForInsertingNode($intendedParent); + $location = $this->appropriatePlaceForInsertingNode($intendedParent); $adjustedInsertionLocation = $location['node']; $insertBefore = $location['insert before']; @@ -2136,7 +2132,7 @@ class TreeBuilder { // DEVIATION: Unnecessary because there is no scripting in this implementation. # 4. Push element onto the stack of open elements so that it is the new current node. - static::$instance->stack[] = $element; + $this->stack[] = $element; # Return element. return $element; @@ -2148,7 +2144,7 @@ class TreeBuilder { # invoked in response to a start tag token. # 1. Insert an HTML element for the token. - static::insertStartTagToken($token); + $this->insertStartTagToken($token); # 2. If the algorithm that was invoked is the generic raw text element parsing # algorithm, switch the tokenizer to the RAWTEXT state; otherwise the algorithm