diff --git a/lib/Exception.php b/lib/Exception.php index 5d8d877..5ec69c9 100644 --- a/lib/Exception.php +++ b/lib/Exception.php @@ -37,7 +37,7 @@ class Exception extends \Exception { 10201 => '%s is an invalid Stack index', 10202 => 'Instances of DOMNode are the only types allowed in a Stack', - 10203 => 'DOMElement, DOMDocument, or DOMDocumentFragment expected for fragment context; found %s' + 10203 => 'DOMElement, DOMDocument, or DOMDocumentFragment expected for fragment context; found %s', 10301 => '%s is an invalid ActiveFormattingElementsList index', diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index 4715215..e35f2bd 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -87,139 +87,141 @@ class Tokenizer { public function createToken(): Token { while (true) { if (Parser::$debug) { - echo "State: "; - switch ($this->state) { - case self::DATA_STATE: echo "Data\n"; + case self::DATA_STATE: $state = "Data"; + break; + case self::RCDATA_STATE: $state = "RCDATA"; break; - case self::RCDATA_STATE: echo "RCDATA\n"; + case self::RAWTEXT_STATE: $state = "RAWTEXT"; break; - case self::RAWTEXT_STATE: echo "RAWTEXT\n"; + case self::SCRIPT_DATA_STATE: $state = "Script data"; break; - case self::SCRIPT_DATA_STATE: echo "Script data\n"; + case self::PLAINTEXT_STATE: $state = "PLAINTEXT"; break; - case self::PLAINTEXT_STATE: echo "PLAINTEXT\n"; + case self::TAG_OPEN_STATE: $state = "Tag open"; break; - case self::TAG_OPEN_STATE: echo "Tag open\n"; + case self::END_TAG_OPEN_STATE: $state = "End tag open"; break; - case self::END_TAG_OPEN_STATE: echo "End tag open\n"; + case self::TAG_NAME_STATE: $state = "Tag name"; break; - case self::TAG_NAME_STATE: echo "Tag name\n"; + case self::RCDATA_LESS_THAN_SIGN_STATE: $state = "RCDATA less-than sign"; break; - case self::RCDATA_LESS_THAN_SIGN_STATE: echo "RCDATA less-than sign\n"; + case self::RCDATA_END_TAG_OPEN_STATE: $state = "RCDATA end tag open"; break; - case self::RCDATA_END_TAG_OPEN_STATE: echo "RCDATA end tag 
open\n"; + case self::RCDATA_END_TAG_NAME_STATE: $state = "RCDATA end tag name"; break; - case self::RCDATA_END_TAG_NAME_STATE: echo "RCDATA end tag name\n"; + case self::RAWTEXT_LESS_THAN_SIGN_STATE: $state = "RAWTEXT less than sign"; break; - case self::RAWTEXT_LESS_THAN_SIGN_STATE: echo "RAWTEXT less than sign\n"; + case self::RAWTEXT_END_TAG_OPEN_STATE: $state = "RAWTEXT end tag open"; break; - case self::RAWTEXT_END_TAG_OPEN_STATE: echo "RAWTEXT end tag open\n"; + case self::RAWTEXT_END_TAG_NAME_STATE: $state = "RAWTEXT end tag name"; break; - case self::RAWTEXT_END_TAG_NAME_STATE: echo "RAWTEXT end tag name\n"; + case self::SCRIPT_DATA_LESS_THAN_SIGN_STATE: $state = "Script data less-than sign"; break; - case self::SCRIPT_DATA_LESS_THAN_SIGN_STATE: echo "Script data less-than sign\n"; + case self::SCRIPT_DATA_END_TAG_OPEN_STATE: $state = "Script data end tag open"; break; - case self::SCRIPT_DATA_END_TAG_OPEN_STATE: echo "Script data end tag open\n"; + case self::SCRIPT_DATA_END_TAG_NAME_STATE: $state = "Script data end tag name"; break; - case self::SCRIPT_DATA_END_TAG_NAME_STATE: echo "Script data end tag name\n"; + case self::SCRIPT_DATA_ESCAPE_START_STATE: $state = "Script data escape start"; break; - case self::SCRIPT_DATA_ESCAPE_START_STATE: echo "Script data escape start\n"; + case self::SCRIPT_DATA_ESCAPE_START_DASH_STATE: $state = "Script data escape start dash"; break; - case self::SCRIPT_DATA_ESCAPE_START_DASH_STATE: echo "Script data escape start dash\n"; + case self::SCRIPT_DATA_ESCAPED_STATE: $state = "Script data escaped"; break; - case self::SCRIPT_DATA_ESCAPED_STATE: echo "Script data escaped\n"; + case self::SCRIPT_DATA_ESCAPED_DASH_STATE: $state = "Script data escaped dash"; break; - case self::SCRIPT_DATA_ESCAPED_DASH_STATE: echo "Script data escaped dash\n"; + case self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: $state = "Script data escaped dash dash"; break; - case self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: echo "Script data escaped dash 
dash\n"; + case self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data escaped less-than sign"; break; - case self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data escaped less-than sign\n"; + case self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: $state = "Script data escaped end tag open"; break; - case self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: echo "Script data escaped end tag open\n"; + case self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: $state = "Script data escaped end tag name"; break; - case self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: echo "Script data escaped end tag name\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: $state = "Script data double escape start"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: echo "Script data double escape start\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: $state = "Script data double escaped"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: echo "Script data double escaped\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: $state = "Script data double escaped dash"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: echo "Script data double escaped dash\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: $state = "Script data double escaped dash dash"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: echo "Script data double escaped dash dash\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data double escaped less-than sign"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data double escaped less-than sign\n"; + case self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: $state = "Script data double escape end"; break; - case self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: echo "Script data double escape end\n"; + case self::BEFORE_ATTRIBUTE_NAME_STATE: $state = "Before attribute name"; break; - case self::BEFORE_ATTRIBUTE_NAME_STATE: echo "Before attribute\n"; + case 
self::ATTRIBUTE_NAME_STATE: $state = "Attribute name"; break; - case self::ATTRIBUTE_NAME_STATE: echo "Attribute name\n"; + case self::AFTER_ATTRIBUTE_NAME_STATE: $state = "After attribute name"; break; - case self::AFTER_ATTRIBUTE_NAME_STATE: echo "After attribute name\n"; + case self::BEFORE_ATTRIBUTE_VALUE_STATE: $state = "Before attribute value"; break; - case self::BEFORE_ATTRIBUTE_VALUE_STATE: echo "Before attribute value\n"; + case self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: $state = "Attribute value (double quoted)"; break; - case self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: echo "Attribute value (double quoted)\n"; + case self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: $state = "Attribute value (single quoted)"; break; - case self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: echo "Attribute value (single quoted)\n"; + case self::ATTRIBUTE_VALUE_UNQUOTED_STATE: $state = "Attribute value (unquoted)"; break; - case self::ATTRIBUTE_VALUE_UNQUOTED_STATE: echo "Attribute value (unquoted)\n"; + case self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: $state = "After attribute value (quoted)"; break; - case self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: echo "After attribute value (quoted)\n"; + case self::SELF_CLOSING_START_TAG_STATE: $state = "Self-closing start tag"; break; - case self::SELF_CLOSING_START_TAG_STATE: echo "Self-closing start tag\n"; + case self::BOGUS_COMMENT_STATE: $state = "Bogus comment"; break; - case self::BOGUS_COMMENT_STATE: echo "Bogus comment\n"; + case self::MARKUP_DECLARATION_OPEN_STATE: $state = "Markup declaration open"; break; - case self::MARKUP_DECLARATION_OPEN_STATE: echo "Markup declaration open\n"; + case self::COMMENT_START_STATE: $state = "Comment start"; break; - case self::COMMENT_START_STATE: echo "Comment start\n"; + case self::COMMENT_START_DASH_STATE: $state = "Comment start dash"; break; - case self::COMMENT_START_DASH_STATE: echo "Comment start dash\n"; + case self::COMMENT_STATE: $state = "Comment"; break; - case self::COMMENT_STATE: echo 
"Comment\n"; + case self::COMMENT_END_DASH_STATE: $state = "Comment end dash"; break; - case self::COMMENT_END_DASH_STATE: echo "Comment end dash\n"; + case self::COMMENT_END_STATE: $state = "Comment end"; break; - case self::COMMENT_END_STATE: echo "Comment end\n"; + case self::COMMENT_END_BANG_STATE: $state = "Comment end bang"; break; - case self::COMMENT_END_BANG_STATE: echo "Comment end bang\n"; + case self::DOCTYPE_STATE: $state = "DOCTYPE"; break; - case self::DOCTYPE_STATE: echo "DOCTYPE\n"; + case self::BEFORE_DOCTYPE_NAME_STATE: $state = "Before DOCTYPE name"; break; - case self::BEFORE_DOCTYPE_NAME_STATE: echo "Before DOCTYPE name\n"; + case self::DOCTYPE_NAME_STATE: $state = "DOCTYPE name"; break; - case self::DOCTYPE_NAME_STATE: echo "DOCTYPE name\n"; + case self::AFTER_DOCTYPE_NAME_STATE: $state = "After DOCTYPE name"; break; - case self::AFTER_DOCTYPE_NAME_STATE: echo "After DOCTYPE name\n"; + case self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: $state = "After DOCTYPE public keyword"; break; - case self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: echo "After DOCTYPE public keyword\n"; + case self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "Before DOCTYPE public identifier"; break; - case self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "Before DOCTYPE public identifier\n"; + case self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE public identifier (double quoted)"; break; - case self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE public identifier (double quoted)\n"; + case self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE public identifier (single quoted)"; break; - case self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE public identifier (single quoted)\n"; + case self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "After DOCTYPE public identifier"; break; - case self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "After DOCTYPE public identifier\n"; + case 
self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: $state = "Between DOCTYPE public and system identifiers"; break; - case self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: echo "Between DOCTYPE public and system identifiers\n"; + case self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: $state = "After DOCTYPE system keyword"; break; - case self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: echo "After DOCTYPE system keyword\n"; + case self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "Before DOCTYPE system identifier"; break; - case self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "Before DOCTYPE system identifier\n"; + case self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE system identifier (double-quoted)"; break; - case self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE system identifier (double-quoted)\n"; + case self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE system identifier (single-quoted)"; break; - case self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE system identifier (single-quoted)\n"; + case self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "After DOCTYPE system identifier"; break; - case self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "After DOCTYPE system identifier\n"; + case self::BOGUS_DOCTYPE_STATE: $state = "Bogus DOCTYPE"; break; - case self::BOGUS_DOCTYPE_STATE: echo "Bogus comment\n"; + case self::CDATA_SECTION_STATE: $state = "CDATA section"; break; - case self::CDATA_SECTION_STATE: echo "CDATA section\n"; + default: throw new Exception(Exception::UNKNOWN_ERROR); } + + echo "State: $state\n"; } # 12.2.4.1 Data state diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index a0dfe73..ef5d274 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -183,6 +183,60 @@ class TreeBuilder { // Loop used when processing the token under different rules; always breaks. 
while (true) { + if (Parser::$debug) { + switch ($insertionMode) { + case self::INITIAL_MODE: $mode = "Initial"; + break; + case self::BEFORE_HTML_MODE: $mode = "Before html"; + break; + case self::BEFORE_HEAD_MODE: $mode = "Before head"; + break; + case self::IN_HEAD_MODE: $mode = "In head"; + break; + case self::IN_HEAD_NOSCRIPT_MODE: $mode = "In head noscript"; + break; + case self::AFTER_HEAD_MODE: $mode = "After head"; + break; + case self::IN_BODY_MODE: $mode = "In body"; + break; + case self::TEXT_MODE: $mode = "Text"; + break; + case self::IN_TABLE_MODE: $mode = "In table"; + break; + case self::IN_TABLE_TEXT_MODE: $mode = "In table text"; + break; + case self::IN_CAPTION_MODE: $mode = "In caption"; + break; + case self::IN_COLUMN_GROUP_MODE: $mode = "In column group"; + break; + case self::IN_TABLE_BODY_MODE: $mode = "In table body"; + break; + case self::IN_ROW_MODE: $mode = "In row"; + break; + case self::IN_CELL_MODE: $mode = "In cell"; + break; + case self::IN_SELECT_MODE: $mode = "In select"; + break; + case self::IN_SELECT_IN_TABLE_MODE: $mode = "In select in table"; + break; + case self::IN_TEMPLATE_MODE: $mode = "In template mode"; + break; + case self::AFTER_BODY_MODE: $mode = "After body"; + break; + case self::IN_FRAMESET_MODE: $mode = "In frameset"; + break; + case self::AFTER_FRAMESET_MODE: $mode = "After frameset"; + break; + case self::AFTER_AFTER_BODY_MODE: $mode = "After after body"; + break; + case self::AFTER_AFTER_FRAMESET_MODE: $mode = "After after frameset"; + break; + default: throw new Exception(Exception::UNKNOWN_ERROR); + } + + echo "Mode: $mode\n"; + } + # 8.2.5.4. The rules for parsing tokens in HTML content switch ($insertionMode) { # 8.2.5.4.1. The "initial" insertion mode @@ -447,6 +501,7 @@ class TreeBuilder { # Switch the insertion mode to "in head". $this->insertionMode = self::IN_HEAD_MODE; + $insertionMode = self::IN_HEAD_MODE; # Reprocess the current token. 
continue 2; @@ -771,7 +826,7 @@ class TreeBuilder { case self::AFTER_HEAD_MODE: # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data)) { + if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { # Insert the character. $this->insertCharacterToken($token); } @@ -783,7 +838,7 @@ class TreeBuilder { # A DOCTYPE token elseif ($token instanceof DOCTYPEToken) { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body, frameset, template start tag'); + ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body, frameset start tag'); } elseif ($token instanceof StartTagToken) { # A start tag whose tag name is "html" @@ -808,8 +863,107 @@ class TreeBuilder { # Switch the insertion mode to "in frameset". $this->insertionMode = self::IN_FRAMESET_MODE; } + # A start tag whose tag name is one of: "base", "basefont", "bgsound", "link", + # "meta", "noframes", "script", "style", "template", "title" + elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') { + # Parse error. + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'body, frameset'); + # Push the node pointed to by the head element pointer onto the stack of open elements. + $this->stack[] = $this->headElement; + # Process the token using the rules for the "in head" insertion mode. + $this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE); - /* ¡STOPPED HERE! + # Remove the node pointed to by the head element pointer from the stack of open + # elements. 
(It might not be the current node at this point.) + $key = $this->stack->search($this->headElement); + if ($key !== -1) { + unset($this->stack[$key]); + } + } + # A start tag whose tag name is "head" + elseif ($token->name === 'head') { + # Parse error. + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'body, frameset'); + } + # Any other start tag + else { + # Act as described in the "anything else" entry below. + # + # Insert an HTML element for a "body" start tag token with no attributes. + $this->insertStartTagToken(new StartTagToken('body')); + # Switch the insertion mode to "in body". + $this->insertionMode = self::IN_BODY_MODE; + $insertionMode = self::IN_BODY_MODE; + # Reprocess the current token. + continue 2; + } + } + elseif ($token instanceof EndTagToken) { + # An end tag whose tag name is "template" + if ($token->name === 'template') { + # Process the token using the rules for the "in head" insertion mode. + $insertionMode = self::IN_HEAD_MODE; + continue 2; + } + # An end tag whose tag name is one of: "body", "html", "br" + elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') { + # Act as described in the "anything else" entry below. + # + # Insert an HTML element for a "body" start tag token with no attributes. + $this->insertStartTagToken(new StartTagToken('body')); + # Switch the insertion mode to "in body". + $this->insertionMode = self::IN_BODY_MODE; + $insertionMode = self::IN_BODY_MODE; + # Reprocess the current token. + continue 2; + } + # Any other end tag + else { + # Parse error. + ParseError::trigger(ParseError::UNEXPECTED_END_TAG, $token->name, 'body, frameset'); + } + } + # Anything else + else { + # Insert an HTML element for a "body" start tag token with no attributes. + $this->insertStartTagToken(new StartTagToken('body')); + # Switch the insertion mode to "in body". + $this->insertionMode = self::IN_BODY_MODE; + $insertionMode = self::IN_BODY_MODE; + # Reprocess the current token. 
+ continue 2; + } + break; + + # 8.2.5.4.7. The "in body" insertion mode + case self::IN_BODY_MODE: + if ($token instanceof CharacterToken) { + # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED + # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + # + # Any other character token + // Space characters and any other characters are exactly the same except any + // other characters sets the frameset-ok flag to "not ok". + + # Reconstruct the active formatting elements, if any. + $this->activeFormattingElementsList->reconstruct(); + # Insert the token’s character. + $this->insertCharacterToken($token); + + if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) { + # Set the frameset-ok flag to "not ok". + $this->framesetOk = false; + } + } + # A comment token + elseif ($token instanceof CommentToken) { + # Insert a comment. + $this->insertCommentToken($token); + } + # A DOCTYPE token + elseif ($token instanceof DOCTYPEToken) { + # Parse error. + ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body content'); } break; } @@ -819,6 +973,10 @@ class TreeBuilder { } protected function parseTokenInForeignContent(Token $token) { + if (Parser::$debug) { + echo "Foreign Content\n"; + } + $currentNode = $this->stack->currentNode; $currentNodeName = $this->stack->currentNodeName; $currentNodeNamespace = $this->stack->currentNodeNamespace; @@ -866,7 +1024,7 @@ class TreeBuilder { ) ) { # Parse error. - ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML start tag'); + ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML'); # If the parser was originally created for the HTML fragment parsing algorithm, # then act as described in the "any other start tag" entry below. (fragment