From 1dc3d9c23e997a6592958a7ac5f10b80f5597ee8 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sun, 14 Feb 2021 18:31:16 -0500 Subject: [PATCH] Emit whitespace-only character tokens This makes tree building simpler in certain circumstances --- lib/Token.php | 2 + lib/Tokenizer.php | 100 ++++++++++++++++++++++++++++------ lib/TreeBuilder.php | 52 ++++++++---------- tests/cases/TestTokenizer.php | 5 ++ 4 files changed, 114 insertions(+), 45 deletions(-) diff --git a/lib/Token.php b/lib/Token.php index 1551aa1..c14d5fe 100644 --- a/lib/Token.php +++ b/lib/Token.php @@ -39,6 +39,8 @@ class CharacterToken extends DataToken { public const NAME = "Character token"; } +class WhitespaceToken extends CharacterToken {} + class CommentToken extends DataToken { public const NAME = "Comment token"; diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index be35b2b..908173b 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -268,7 +268,12 @@ class Tokenizer { # Switch to the character reference state. // DEVIATION: Character reference consumption implemented as a function - return new CharacterToken($this->switchToCharacterReferenceState(self::DATA_STATE)); + $outChar = $this->switchToCharacterReferenceState(self::DATA_STATE); + if (strspn($outChar, Data::WHITESPACE)) { + return new WhitespaceToken($outChar); // a character reference is either all whitespace or no whitespace + } else { + return new CharacterToken($outChar); + } } # U+003C LESS-THAN SIGN (<) elseif ($char === '<') { @@ -295,7 +300,11 @@ class Tokenizer { // Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back // through here every single time. 
- return new CharacterToken($char.$this->data->consumeUntil("&<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("&<\0")); + } } } @@ -310,7 +319,12 @@ class Tokenizer { # Switch to the character reference state. // DEVIATION: Character reference consumption implemented as a function - return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE)); + $outChar = $this->switchToCharacterReferenceState(self::RCDATA_STATE); + if (strspn($outChar, Data::WHITESPACE)) { + return new WhitespaceToken($outChar); // a character reference is either all whitespace or no whitespace + } else { + return new CharacterToken($outChar); + } } # U+003C LESS-THAN SIGN (<) elseif ($char === '<') { @@ -337,7 +351,11 @@ class Tokenizer { // Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back // through here every single time. - return new CharacterToken($char.$this->data->consumeUntil("&<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("&<\0")); + } } } @@ -371,7 +389,11 @@ class Tokenizer { // Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back // through here every single time. 
- return new CharacterToken($char.$this->data->consumeUntil("<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("<\0")); + } } } @@ -405,7 +427,11 @@ class Tokenizer { // Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back // through here every single time. - return new CharacterToken($char.$this->data->consumeUntil("<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("<\0")); + } } } @@ -434,7 +460,11 @@ class Tokenizer { // Consume all characters that don't match what is above and emit // that as a character token instead to prevent having to loop back // through here every single time. - return new CharacterToken($char.$this->data->consumeUntil("\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("\0")); + } } } @@ -1062,7 +1092,11 @@ class Tokenizer { // OPTIMIZATION: // Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - return new CharacterToken($char.$this->data->consumeUntil("-<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("-<\0")); + } } } @@ -1104,7 +1138,11 @@ class Tokenizer { # Switch to the script data escaped state. # Emit the current input character as a character token. 
$this->state = self::SCRIPT_DATA_ESCAPED_STATE; - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } } @@ -1151,7 +1189,11 @@ class Tokenizer { # Switch to the script data escaped state. # Emit the current input character as a character token. $this->state = self::SCRIPT_DATA_ESCAPED_STATE; - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } } @@ -1313,7 +1355,11 @@ class Tokenizer { } else { $this->state = self::SCRIPT_DATA_ESCAPED_STATE; } - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } # ASCII upper alpha # ASCII lower alpha @@ -1378,7 +1424,11 @@ class Tokenizer { // OPTIMIZATION: // Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - return new CharacterToken($char.$this->data->consumeUntil("-<\0")); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil("-<\0")); + } } } @@ -1422,7 +1472,11 @@ class Tokenizer { # Switch to the script data double escaped state. # Emit the current input character as a character token. $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE; - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } } @@ -1471,7 +1525,11 @@ class Tokenizer { # Switch to the script data double escaped state. # Emit the current input character as a character token. 
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE; - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } } @@ -1518,7 +1576,11 @@ class Tokenizer { } else { $this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE; } - return new CharacterToken($char); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char); + } else { + return new CharacterToken($char); + } } # ASCII upper alpha # ASCII lower alpha @@ -3344,7 +3406,11 @@ class Tokenizer { // OPTIMIZATION: // Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - return new CharacterToken($char.$this->data->consumeUntil(']')); + if (strspn($char, Data::WHITESPACE)) { + return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE)); + } else { + return new CharacterToken($char.$this->data->consumeUntil(']')); + } } } @@ -3378,7 +3444,7 @@ class Tokenizer { # Emit a U+005D RIGHT SQUARE BRACKET character token. // OTPIMIZATION: Consume any additional right square brackets - return new CharacterToken($char.$this->data->consumeWhile(']')); + return new CharacterToken(']'.$this->data->consumeWhile(']')); } # U+003E GREATER-THAN SIGN character elseif ($char === '>') { diff --git a/lib/TreeBuilder.php b/lib/TreeBuilder.php index a43577e..1ecd9a2 100644 --- a/lib/TreeBuilder.php +++ b/lib/TreeBuilder.php @@ -221,7 +221,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { + if ($token instanceof WhitespaceToken) { # Ignore the token. 
} # A comment token @@ -389,7 +389,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { + elseif ($token instanceof WhitespaceToken) { # Ignore the token. } # A start tag whose tag name is "html" @@ -433,7 +433,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { + if ($token instanceof WhitespaceToken) { # Ignore the token. } # A comment token @@ -485,7 +485,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { + if ($token instanceof WhitespaceToken) { # Insert the character. $this->insertCharacterToken($token); } @@ -745,7 +745,7 @@ class TreeBuilder { # A comment token // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - elseif ($token instanceof CommentToken || ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data)))) { + elseif ($token instanceof CommentToken || $token instanceof WhitespaceToken) { # Process the token using the rules for the "in head" insertion mode. 
return $this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE); } @@ -768,7 +768,7 @@ class TreeBuilder { # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE // OPTIMIZATION: Will check for multiple space characters at once as character // tokens can contain more than one character. - if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) { + if ($token instanceof WhitespaceToken) { # Insert the character. $this->insertCharacterToken($token); } @@ -882,23 +882,20 @@ class TreeBuilder { } # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE - # + elseif ($token instanceof WhitespaceToken) { + # Reconstruct the active formatting elements, if any. + $this->activeFormattingElementsList->reconstruct(); + # Insert the token’s character. + $this->insertCharacterToken($token); + } # Any other character token - // Space characters and any other characters are exactly the same except any - // other characters sets the frameset-ok flag to "not ok". elseif ($token instanceof CharacterToken) { - # Reconstruct the active formatting elements, if any. $this->activeFormattingElementsList->reconstruct(); # Insert the token’s character. $this->insertCharacterToken($token); - - // OPTIMIZATION: Will check for multiple space characters at once as character - // tokens can contain more than one character. - if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) { - # Set the frameset-ok flag to "not ok". - $this->framesetOk = false; - } + # Set the frameset-ok flag to "not ok". 
+ $this->framesetOk = false; } # A comment token elseif ($token instanceof CommentToken) { @@ -1402,18 +1399,17 @@ class TreeBuilder { # # When the user agent is to apply the rules for parsing tokens in foreign # content, the user agent must handle the token as follows: - # - if ($token instanceof CharacterToken) { - # A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A), - # "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE - # Any other character token - // OPTIMIZATION: Will check for multiple space characters at once as character - // tokens can contain more than one character. - if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) { - # Set the frameset-ok flag to "not ok". - $this->framesetOk = false; - } + # A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A), + # "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE + if ($token instanceof WhitespaceToken) { + # Insert the token's character. + $this->insertCharacterToken($token); + } + # Any other character token + elseif ($token instanceof CharacterToken) { + # Set the frameset-ok flag to "not ok". + $this->framesetOk = false; # Insert the token's character. 
$this->insertCharacterToken($token); } diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php index 946e068..79c8241 100644 --- a/tests/cases/TestTokenizer.php +++ b/tests/cases/TestTokenizer.php @@ -12,6 +12,7 @@ use dW\HTML5\CommentToken; use dW\HTML5\DOCTYPEToken; use dW\HTML5\EndTagToken; use dW\HTML5\StartTagToken; +use dW\HTML5\WhitespaceToken; /** * @covers \dW\HTML5\Tokenizer @@ -61,6 +62,7 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase { try { do { $t = $tokenizer->createToken(); + assert(!$t instanceof CharacterToken || ($t instanceof WhitespaceToken && strspn($t->data, Data::WHITESPACE) === strlen($t->data)) || strspn($t->data, Data::WHITESPACE) === 0, new \Exception("Character token must either consist only of whitespace, or start with other than whitespace: ".var_export($t->data ?? "''", true))); if (!($t instanceof EOFToken)) { $actual[] = $t; } @@ -100,6 +102,9 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase { foreach ($tokens as $t) { if ($t instanceof CharacterToken) { if (!$pending) { + if ($t instanceof WhitespaceToken) { + $t = new CharacterToken($t->data); + } $pending = $t; } else { $pending->data .= $t->data;