Browse Source

Avoid implicit looping and switching

The while loop has been replaced with gotos where appropriate, and
switching has been replaced with a series of if-blocks in line with the
same logic in the tokenizer.
ns
J. King 4 years ago
parent
commit
f8b9cf2c2b
  1. 122
      lib/TreeBuilder.php

122
lib/TreeBuilder.php

@ -199,22 +199,18 @@ class TreeBuilder {
}
}
protected function parseTokenInHTMLContent(Token $token, int $insertionMode = null) {
protected function parseTokenInHTMLContent(Token $token, int $insertionMode = null): bool {
ProcessToken:
$insertionMode = $insertionMode ?? $this->insertionMode;
// Loop used when processing the token under different rules; always breaks.
$iterations = 0;
while (true) {
assert((function() use ($insertionMode) {
$mode = self::INSERTION_MODE_NAMES[$insertionMode] ?? $insertionMode;
$this->debugLog .= " Mode: $mode\n";
return true;
})());
assert($iterations++ < 50, new LoopException("Probable infinite loop detected in HTML content handling"));
# 13.2.6.4. The rules for parsing tokens in HTML content
switch ($insertionMode) {
# 13.2.6.4.1. The "initial" insertion mode
case self::INITIAL_MODE:
if ($insertionMode === self::INITIAL_MODE) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
@ -371,12 +367,11 @@ class TreeBuilder {
# token.
$this->insertionMode = self::BEFORE_HTML_MODE;
$insertionMode = self::BEFORE_HTML_MODE;
continue 2;
goto ProcessToken;
};
}
break;
# 13.2.6.4.2. The "before html" insertion mode
case self::BEFORE_HTML_MODE:
elseif ($insertionMode === self::BEFORE_HTML_MODE) {
# A DOCTYPE token
if ($token instanceof DOCTYPEToken) {
$this->error(ParseError::UNEXPECTED_DOCTYPE);
@ -420,17 +415,16 @@ class TreeBuilder {
# Switch the insertion mode to "before head", then reprocess the token.
$this->insertionMode = self::BEFORE_HEAD_MODE;
$insertionMode = self::BEFORE_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# The document element can end up being removed from the Document object, e.g.,
# by scripts; nothing in particular happens in such cases, content continues
# by scripts; nothing in particular happens in such cases, content continues
# being appended to the nodes as described in the next section.
// Good to know. There's no scripting in this implementation, though.
break;
}
# 13.2.6.4.3. The "before head" insertion mode
case self::BEFORE_HEAD_MODE:
elseif ($insertionMode === self::BEFORE_HEAD_MODE) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
@ -452,7 +446,7 @@ class TreeBuilder {
elseif ($token instanceof StartTagToken && $token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is "head"
elseif ($token instanceof StartTagToken && $token->name === 'head') {
@ -484,12 +478,11 @@ class TreeBuilder {
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the current token.
continue 2;
goto ProcessToken;
}
}
break;
# 13.2.6.4.4. The "in head" insertion mode
case self::IN_HEAD_MODE:
elseif ($insertionMode === self::IN_HEAD_MODE) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
@ -513,7 +506,7 @@ class TreeBuilder {
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is one of: "base", "basefont", "bgsound", "link"
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link') {
@ -632,7 +625,7 @@ class TreeBuilder {
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
}
elseif ($token instanceof EndTagToken) {
@ -655,7 +648,7 @@ class TreeBuilder {
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
# An end tag whose tag name is "template"
elseif ($token->name === 'template') {
@ -702,12 +695,11 @@ class TreeBuilder {
$this->insertionMode = self::AFTER_HEAD_MODE;
$insertionMode = self::AFTER_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
}
break;
# 13.2.6.4.5. The "in head noscript" insertion mode
case self::IN_HEAD_NOSCRIPT_MODE:
elseif ($insertionMode === self::IN_HEAD_NOSCRIPT_MODE) {
# DOCTYPE token
if ($token instanceof DOCTYPEToken) {
# Parse error.
@ -718,14 +710,14 @@ class TreeBuilder {
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is one of: "basefont", "bgsound", "link", "meta",
# "noframes", "style"
elseif ($token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'style'){
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is one of: "head", "noscript"
elseif ($token->name === 'head' || $token->name === 'noscript') {
@ -745,7 +737,7 @@ class TreeBuilder {
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
}
elseif ($token instanceof EndTagToken) {
@ -770,7 +762,7 @@ class TreeBuilder {
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
# Any other end tag
else {
@ -787,7 +779,7 @@ class TreeBuilder {
$token instanceof CommentToken) {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# Anything else
else {
@ -800,12 +792,11 @@ class TreeBuilder {
$this->insertionMode = self::IN_HEAD_MODE;
$insertionMode = self::IN_HEAD_MODE;
# Reprocess the token.
continue 2;
goto ProcessToken;
}
}
break;
# 13.2.6.4.6. The "after head" insertion mode
case self::AFTER_HEAD_MODE:
elseif ($insertionMode === self::AFTER_HEAD_MODE) {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
@ -829,7 +820,7 @@ class TreeBuilder {
if ($token->name === 'html') {
# Process the token using the rules for the "in body" insertion mode.
$insertionMode = self::IN_BODY_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is "body"
elseif ($token->name === 'body') {
@ -879,7 +870,7 @@ class TreeBuilder {
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
goto ProcessToken;
}
}
elseif ($token instanceof EndTagToken) {
@ -887,7 +878,7 @@ class TreeBuilder {
if ($token->name === 'template') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# An end tag whose tag name is one of: "body", "html", "br"
elseif ($token->name === 'body' || $token->name === 'html' || $token->name === 'br') {
@ -899,7 +890,7 @@ class TreeBuilder {
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
goto ProcessToken;
}
# Any other end tag
else {
@ -915,19 +906,25 @@ class TreeBuilder {
$this->insertionMode = self::IN_BODY_MODE;
$insertionMode = self::IN_BODY_MODE;
# Reprocess the current token.
continue 2;
goto ProcessToken;
}
}
break;
# 13.2.6.4.7. The "in body" insertion mode
case self::IN_BODY_MODE:
if ($token instanceof CharacterToken) {
elseif ($insertionMode === self::IN_BODY_MODE) {
# A character token that is U+0000 NULL
if ($token instanceof CharacterToken && $token->data === "\0") {
# Parse error. Ignore the token
// DEVIATION: the parse error is already reported by the tokenizer;
// this is probably an oversight in the specification, so we don't
// report it a second time
}
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
#
# Any other character token
// Space characters and any other characters are exactly the same except any
// other characters sets the frameset-ok flag to "not ok".
elseif ($token instanceof CharacterToken) {
# Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
@ -975,7 +972,7 @@ class TreeBuilder {
elseif ($token->name === 'base' || $token->name === 'basefont' || $token->name === 'bgsound' || $token->name === 'link' || $token->name === 'meta' || $token->name === 'noframes' || $token->name === 'script' || $token->name === 'style' || $token->name === 'template' || $token->name === 'title') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is "body"
elseif ($token->name === 'body') {
@ -1093,7 +1090,7 @@ class TreeBuilder {
// Process the next token
$token = $nextToken;
continue 2;
goto ProcessToken;
}
# A start tag whose tag name is "form"
elseif ($token->name === 'form') {
@ -1145,13 +1142,13 @@ class TreeBuilder {
$this->stack->popUntil('li');
# 4. Jump to the step labeled Done below.
break;
return true;
}
# 4. If node is in the special category, but is not an address, div, or p
# element, then jump to the step labeled Done below.
if ($nodeName !== 'address' && $nodeName !== 'div' && $nodeName !== 'p' && $this->isElementSpecial($node)) {
break;
return true;
}
# 5. Otherwise, set node to the previous entry in the stack of open elements and
@ -1196,13 +1193,13 @@ class TreeBuilder {
$this->stack->popUntil(['dd', 'dt']);
# 4. Jump to the step labeled Done below.
break;
return true;
}
# 5. If node is in the special category, but is not an address, div, or p
# element, then jump to the step labeled Done below.
if ($nodeName !== 'address' && $nodeName !== 'div' && $nodeName !== 'p' && $this->isElementSpecial($node)) {
break;
return true;
}
# 6. Otherwise, set node to the previous entry in the stack of open elements and
@ -1264,7 +1261,7 @@ class TreeBuilder {
if ($token->name === 'template') {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
continue 2;
goto ProcessToken;
}
# An end tag whose tag name is "body"
# An end tag whose tag name is "html"
@ -1290,7 +1287,7 @@ class TreeBuilder {
return false;
}) !== -1) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'body');
break;
return true;
}
# Switch the insertion mode to "after body".
@ -1300,7 +1297,7 @@ class TreeBuilder {
// an html end tag the token is reprocessed.
if ($token->name === 'html') {
# Reprocess the token.
continue 2;
goto ProcessToken;
}
}
}
@ -1382,7 +1379,7 @@ class TreeBuilder {
# If the stack of template insertion modes is not empty, then process the token using the rules for the "in template" insertion mode.
if ($this->templateInsertionModes->length !== 0) {
$insertionMode = self::IN_TEMPLATE_MODE;
continue 2;
goto ProcessToken;
}
# Otherwise, follow these steps:
@ -1400,26 +1397,21 @@ class TreeBuilder {
return false;
}) !== -1) {
$this->error(ParseError::UNEXPECTED_END_TAG, 'body');
break;
return true;
}
# 2. Stop parsing.
// Abort!
}
break;
}
break;
// IMPLEMENTATION PENDING
else {
throw new \Exception("NOT IMPLEMENTED");
}
return true;
}
protected function parseTokenInForeignContent(Token $token): bool {
if (self::$debug) {
echo "Foreign Content\n";
}
$currentNode = $this->stack->currentNode;
$currentNodeName = $this->stack->currentNodeName;
$currentNodeNamespace = $this->stack->currentNodeNamespace;

Loading…
Cancel
Save