Browse Source

Completed the "after head" insertion mode; started the "in body" insertion mode

• Added debugging output to TreeBuilder: it prints the current insertion mode, and prints "Foreign Content" when a token is parsed as foreign content.
• Fixed syntax errors that weren't caught in the last commit
split-manual
Dustin Wilson 6 years ago
parent
commit
e199df20aa
  1. 2
      lib/Exception.php
  2. 136
      lib/Tokenizer.php
  3. 166
      lib/TreeBuilder.php

2
lib/Exception.php

@ -37,7 +37,7 @@ class Exception extends \Exception {
10201 => '%s is an invalid Stack index',
10202 => 'Instances of DOMNode are the only types allowed in a Stack',
10203 => 'DOMElement, DOMDocument, or DOMDocumentFragment expected for fragment context; found %s'
10203 => 'DOMElement, DOMDocument, or DOMDocumentFragment expected for fragment context; found %s',
10301 => '%s is an invalid ActiveFormattingElementsList index',

136
lib/Tokenizer.php

@ -87,139 +87,141 @@ class Tokenizer {
public function createToken(): Token {
while (true) {
if (Parser::$debug) {
echo "State: ";
switch ($this->state) {
case self::DATA_STATE: echo "Data\n";
case self::DATA_STATE: $state = "Data";
break;
case self::RCDATA_STATE: $state = "RCDATA";
break;
case self::RCDATA_STATE: echo "RCDATA\n";
case self::RAWTEXT_STATE: $state = "RAWTEXT";
break;
case self::RAWTEXT_STATE: echo "RAWTEXT\n";
case self::SCRIPT_DATA_STATE: $state = "Script data";
break;
case self::SCRIPT_DATA_STATE: echo "Script data\n";
case self::PLAINTEXT_STATE: $state = "PLAINTEXT";
break;
case self::PLAINTEXT_STATE: echo "PLAINTEXT\n";
case self::TAG_OPEN_STATE: $state = "Tag open";
break;
case self::TAG_OPEN_STATE: echo "Tag open\n";
case self::END_TAG_OPEN_STATE: $state = "End tag open";
break;
case self::END_TAG_OPEN_STATE: echo "End tag open\n";
case self::TAG_NAME_STATE: $state = "Tag name";
break;
case self::TAG_NAME_STATE: echo "Tag name\n";
case self::RCDATA_LESS_THAN_SIGN_STATE: $state = "RCDATA less-than sign";
break;
case self::RCDATA_LESS_THAN_SIGN_STATE: echo "RCDATA less-than sign\n";
case self::RCDATA_END_TAG_OPEN_STATE: $state = "RCDATA end tag open";
break;
case self::RCDATA_END_TAG_OPEN_STATE: echo "RCDATA end tag open\n";
case self::RCDATA_END_TAG_NAME_STATE: $state = "RCDATA end tag name";
break;
case self::RCDATA_END_TAG_NAME_STATE: echo "RCDATA end tag name\n";
case self::RAWTEXT_LESS_THAN_SIGN_STATE: $state = "RAWTEXT less than sign";
break;
case self::RAWTEXT_LESS_THAN_SIGN_STATE: echo "RAWTEXT less than sign\n";
case self::RAWTEXT_END_TAG_OPEN_STATE: $state = "RAWTEXT end tag open";
break;
case self::RAWTEXT_END_TAG_OPEN_STATE: echo "RAWTEXT end tag open\n";
case self::RAWTEXT_END_TAG_NAME_STATE: $state = "RAWTEXT end tag name";
break;
case self::RAWTEXT_END_TAG_NAME_STATE: echo "RAWTEXT end tag name\n";
case self::SCRIPT_DATA_LESS_THAN_SIGN_STATE: $state = "Script data less-than sign";
break;
case self::SCRIPT_DATA_LESS_THAN_SIGN_STATE: echo "Script data less-than sign\n";
case self::SCRIPT_DATA_END_TAG_OPEN_STATE: $state = "Script data end tag open";
break;
case self::SCRIPT_DATA_END_TAG_OPEN_STATE: echo "Script data end tag open\n";
case self::SCRIPT_DATA_END_TAG_NAME_STATE: $state = "Script data end tag name";
break;
case self::SCRIPT_DATA_END_TAG_NAME_STATE: echo "Script data end tag name\n";
case self::SCRIPT_DATA_ESCAPE_START_STATE: $state = "Script data escape start";
break;
case self::SCRIPT_DATA_ESCAPE_START_STATE: echo "Script data escape start\n";
case self::SCRIPT_DATA_ESCAPE_START_DASH_STATE: $state = "Script data escape start dash";
break;
case self::SCRIPT_DATA_ESCAPE_START_DASH_STATE: echo "Script data escape start dash\n";
case self::SCRIPT_DATA_ESCAPED_STATE: $state = "Script data escaped";
break;
case self::SCRIPT_DATA_ESCAPED_STATE: echo "Script data escaped\n";
case self::SCRIPT_DATA_ESCAPED_DASH_STATE: $state = "Script data escaped dash";
break;
case self::SCRIPT_DATA_ESCAPED_DASH_STATE: echo "Script data escaped dash\n";
case self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: $state = "Script data escaped dash dash";
break;
case self::SCRIPT_DATA_ESCAPED_DASH_DASH_STATE: echo "Script data escaped dash dash\n";
case self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data escaped less-than sign";
break;
case self::SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data escaped less-than sign\n";
case self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: $state = "Script data escaped end tag open";
break;
case self::SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE: echo "Script data escaped end tag open\n";
case self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: $state = "Script data escaped end tag name";
break;
case self::SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE: echo "Script data escaped end tag name\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: $state = "Script data double escape start";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE: echo "Script data double escape start\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: $state = "Script data double escaped";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE: echo "Script data double escaped\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: $state = "Script data double escaped dash";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE: echo "Script data double escaped dash\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: $state = "Script data double escaped dash dash";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE: echo "Script data double escaped dash dash\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: $state = "Script data double escaped less-than sign";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE: echo "Script data double escaped less-than sign\n";
case self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: $state = "Script data double escape end";
break;
case self::SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE: echo "Script data double escape end\n";
case self::BEFORE_ATTRIBUTE_NAME_STATE: $state = "Before attribute";
break;
case self::BEFORE_ATTRIBUTE_NAME_STATE: echo "Before attribute\n";
case self::ATTRIBUTE_NAME_STATE: $state = "Attribute name";
break;
case self::ATTRIBUTE_NAME_STATE: echo "Attribute name\n";
case self::AFTER_ATTRIBUTE_NAME_STATE: $state = "After attribute name";
break;
case self::AFTER_ATTRIBUTE_NAME_STATE: echo "After attribute name\n";
case self::BEFORE_ATTRIBUTE_VALUE_STATE: $state = "Before attribute value";
break;
case self::BEFORE_ATTRIBUTE_VALUE_STATE: echo "Before attribute value\n";
case self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: $state = "Attribute value (double quoted)";
break;
case self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE: echo "Attribute value (double quoted)\n";
case self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: $state = "Attribute value (single quoted)";
break;
case self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE: echo "Attribute value (single quoted)\n";
case self::ATTRIBUTE_VALUE_UNQUOTED_STATE: $state = "Attribute value (unquoted)";
break;
case self::ATTRIBUTE_VALUE_UNQUOTED_STATE: echo "Attribute value (unquoted)\n";
case self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: $state = "After attribute value (quoted)";
break;
case self::AFTER_ATTRIBUTE_VALUE_QUOTED_STATE: echo "After attribute value (quoted)\n";
case self::SELF_CLOSING_START_TAG_STATE: $state = "Self-closing start tag";
break;
case self::SELF_CLOSING_START_TAG_STATE: echo "Self-closing start tag\n";
case self::BOGUS_COMMENT_STATE: $state = "Bogus comment";
break;
case self::BOGUS_COMMENT_STATE: echo "Bogus comment\n";
case self::MARKUP_DECLARATION_OPEN_STATE: $state = "Markup declaration open";
break;
case self::MARKUP_DECLARATION_OPEN_STATE: echo "Markup declaration open\n";
case self::COMMENT_START_STATE: $state = "Comment start";
break;
case self::COMMENT_START_STATE: echo "Comment start\n";
case self::COMMENT_START_DASH_STATE: $state = "Comment start dash";
break;
case self::COMMENT_START_DASH_STATE: echo "Comment start dash\n";
case self::COMMENT_STATE: $state = "Comment";
break;
case self::COMMENT_STATE: echo "Comment\n";
case self::COMMENT_END_DASH_STATE: $state = "Comment end dash";
break;
case self::COMMENT_END_DASH_STATE: echo "Comment end dash\n";
case self::COMMENT_END_STATE: $state = "Comment end";
break;
case self::COMMENT_END_STATE: echo "Comment end\n";
case self::COMMENT_END_BANG_STATE: $state = "Comment end bang";
break;
case self::COMMENT_END_BANG_STATE: echo "Comment end bang\n";
case self::DOCTYPE_STATE: $state = "DOCTYPE";
break;
case self::DOCTYPE_STATE: echo "DOCTYPE\n";
case self::BEFORE_DOCTYPE_NAME_STATE: $state = "Before DOCTYPE name";
break;
case self::BEFORE_DOCTYPE_NAME_STATE: echo "Before DOCTYPE name\n";
case self::DOCTYPE_NAME_STATE: $state = "DOCTYPE name";
break;
case self::DOCTYPE_NAME_STATE: echo "DOCTYPE name\n";
case self::AFTER_DOCTYPE_NAME_STATE: $state = "After DOCTYPE name";
break;
case self::AFTER_DOCTYPE_NAME_STATE: echo "After DOCTYPE name\n";
case self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: $state = "After DOCTYPE public keyword";
break;
case self::AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE: echo "After DOCTYPE public keyword\n";
case self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "Before DOCTYPE public identifier";
break;
case self::BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "Before DOCTYPE public identifier\n";
case self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE public identifier (double quoted)";
break;
case self::DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE public identifier (double quoted)\n";
case self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE public identifier (single quoted)";
break;
case self::DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE public identifier (single quoted)\n";
case self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: $state = "After DOCTYPE public identifier";
break;
case self::AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE: echo "After DOCTYPE public identifier\n";
case self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: $state = "Between DOCTYPE public and system identifiers";
break;
case self::BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE: echo "Between DOCTYPE public and system identifiers\n";
case self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: $state = "After DOCTYPE system keyword";
break;
case self::AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE: echo "After DOCTYPE system keyword\n";
case self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "Before DOCTYPE system identifier";
break;
case self::BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "Before DOCTYPE system identifier\n";
case self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: $state = "DOCTYPE system identifier (double-quoted)";
break;
case self::DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE: echo "DOCTYPE system identifier (double-quoted)\n";
case self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: $state = "DOCTYPE system identifier (single-quoted)";
break;
case self::DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE: echo "DOCTYPE system identifier (single-quoted)\n";
case self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: $state = "After DOCTYPE system identifier";
break;
case self::AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE: echo "After DOCTYPE system identifier\n";
case self::BOGUS_DOCTYPE_STATE: $state = "Bogus comment";
break;
case self::BOGUS_DOCTYPE_STATE: echo "Bogus comment\n";
case self::CDATA_SECTION_STATE: $state = "CDATA section";
break;
case self::CDATA_SECTION_STATE: echo "CDATA section\n";
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
echo "State: $state\n";
}
# 12.2.4.1 Data state

166
lib/TreeBuilder.php

@ -183,6 +183,60 @@ class TreeBuilder {
// Loop used when processing the token under different rules; always breaks.
while (true) {
if (Parser::$debug) {
switch ($insertionMode) {
case self::INITIAL_MODE: $mode = "Initial";
break;
case self::BEFORE_HTML_MODE: $mode = "Before html";
break;
case self::BEFORE_HEAD_MODE: $mode = "Before head";
break;
case self::IN_HEAD_MODE: $mode = "In head";
break;
case self::IN_HEAD_NOSCRIPT_MODE: $mode = "In head noscript";
break;
case self::AFTER_HEAD_MODE: $mode = "After head";
break;
case self::IN_BODY_MODE: $mode = "In body";
break;
case self::TEXT_MODE: $mode = "Text";
break;
case self::IN_TABLE_MODE: $mode = "In table";
break;
case self::IN_TABLE_TEXT_MODE: $mode = "In table text";
break;
case self::IN_CAPTION_MODE: $mode = "In caption";
break;
case self::IN_COLUMN_GROUP_MODE: $mode = "In column group";
break;
case self::IN_TABLE_BODY_MODE: $mode = "In table body";
break;
case self::IN_ROW_MODE: $mode = "In row";
break;
case self::IN_CELL_MODE: $mode = "In cell";
break;
case self::IN_SELECT_MODE: $mode = "In select";
break;
case self::IN_SELECT_IN_TABLE_MODE: $mode = "In select in table";
break;
case self::IN_TEMPLATE_MODE: $mode = "In template mode";
break;
case self::AFTER_BODY_MODE: $mode = "After body";
break;
case self::IN_FRAMESET_MODE: $mode = "In frameset";
break;
case self::AFTER_FRAMESET_MODE: $mode = "After frameset";
break;
case self::AFTER_AFTER_BODY_MODE: $mode = "After after body";
break;
case self::AFTER_AFTER_FRAMESET_MODE: $mode = "After after frameset";
break;
default: throw new Exception(Exception::UNKNOWN_ERROR);
}
echo "Mode: $mode\n";
}
# 8.2.5.4. The rules for parsing tokens in HTML content
switch ($insertionMode) {
# 8.2.5.4.1. The "initial" insertion mode
@ -447,6 +501,7 @@ class TreeBuilder {
# Switch the insertion mode to "in head".
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the current token.
continue 2;
@ -771,7 +826,7 @@ class TreeBuilder {
case self::AFTER_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data)) {
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
# Insert the character.
$this->insertCharacterToken($token);
}
@ -783,7 +838,7 @@ class TreeBuilder {
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body, frameset, template start tag');
ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body, frameset start tag');
}
elseif ($token instanceof StartTagToken) {
# A start tag whose tag name is "html"
@ -808,8 +863,107 @@ class TreeBuilder {
# Switch the insertion mode to "in frameset".
$this->insertionMode = self::IN_FRAMESET_MODE;
}
# A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
# "meta", "noframes", "script", "style", "template", "title"
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'body, frameset');
# Push the node pointed to by the head element pointer onto the stack of open elements.
$this->stack[] = $this->headElement;
# Process the token using the rules for the "in head" insertion mode.
$this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE);
/* ¡STOPPED HERE!
# Remove the node pointed to by the head element pointer from the stack of open
# elements. (It might not be the current node at this point.)
$key = $this->stack->search($this->headElement);
if ($key !== -1) {
unset($this->stack[$key]);
}
}
# A start tag whose tag name is "head"
elseif ($token->name === 'head') {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_START_TAG, 'head', 'body, frameset');
}
# Any other start tag
else {
# Act as described in the "anything else" entry below.
#
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
}
elseif ($token instanceof EndTagToken) {
# An end tag whose tag name is "template"
if ($token->name === 'template') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
}
# An end tag whose tag name is one of: "body", "html", "br"
elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') {
# Act as described in the "anything else" entry below.
#
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
# Any other end tag
else {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_END_TAG, 'head', 'body, frameset');
}
}
# Anything else
else {
# Insert an HTML element for a "body" start tag token with no attributes.
$this->insertStartTagToken(new StartTagToken('body'));
# Switch the insertion mode to "in body".
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
}
break;
# 8.2.5.4.7. The "in body" insertion mode
case self::IN_BODY_MODE:
if ($token instanceof CharacterToken) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
#
# Any other character token
// Space characters and any other characters are exactly the same except any
// other characters sets the frameset-ok flag to "not ok".
# Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
# Insert the token’s character.
$this->insertCharacterToken($token);
if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
}
# A comment token
elseif ($token instanceof CommentToken) {
# Insert a comment.
$this->insertCommentToken($token);
}
# A DOCTYPE token
elseif ($token instanceof DOCTYPEToken) {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_DOCTYPE, 'body content');
}
break;
}
@ -819,6 +973,10 @@ class TreeBuilder {
}
protected function parseTokenInForeignContent(Token $token) {
if (Parser::$debug) {
echo "Foreign Content\n";
}
$currentNode = $this->stack->currentNode;
$currentNodeName = $this->stack->currentNodeName;
$currentNodeNamespace = $this->stack->currentNodeNamespace;
@ -866,7 +1024,7 @@ class TreeBuilder {
)
) {
# Parse error.
ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML start tag');
ParseError::trigger(ParseError::UNEXPECTED_START_TAG, $token->name, 'Non-HTML');
# If the parser was originally created for the HTML fragment parsing algorithm,
# then act as described in the "any other start tag" entry below. (fragment

Loading…
Cancel
Save