Browse Source

Emit whitespace-only character tokens

This makes tree building simpler in certain circumstances
ns
J. King 3 years ago
parent
commit
1dc3d9c23e
  1. 2
      lib/Token.php
  2. 100
      lib/Tokenizer.php
  3. 52
      lib/TreeBuilder.php
  4. 5
      tests/cases/TestTokenizer.php

2
lib/Token.php

@ -39,6 +39,8 @@ class CharacterToken extends DataToken {
public const NAME = "Character token";
}
// A character token whose data consists solely of whitespace characters;
// being a distinct subclass lets consumers detect it with instanceof
class WhitespaceToken extends CharacterToken {}
class CommentToken extends DataToken {
public const NAME = "Comment token";

100
lib/Tokenizer.php

@ -268,7 +268,12 @@ class Tokenizer {
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
return new CharacterToken($this->switchToCharacterReferenceState(self::DATA_STATE));
$outChar = $this->switchToCharacterReferenceState(self::DATA_STATE);
if (strspn($outChar, Data::WHITESPACE)) {
return new WhitespaceToken($outChar); // a character reference is either all whitespace or no whitespace
} else {
return new CharacterToken($outChar);
}
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
@ -295,7 +300,11 @@ class Tokenizer {
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("&<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("&<\0"));
}
}
}
@ -310,7 +319,12 @@ class Tokenizer {
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
$outChar = $this->switchToCharacterReferenceState(self::RCDATA_STATE);
if (strspn($outChar, Data::WHITESPACE)) {
return new WhitespaceToken($outChar); // a character reference is either all whitespace or no whitespace
} else {
return new CharacterToken($outChar);
}
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
@ -337,7 +351,11 @@ class Tokenizer {
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("&<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("&<\0"));
}
}
}
@ -371,7 +389,11 @@ class Tokenizer {
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("<\0"));
}
}
}
@ -405,7 +427,11 @@ class Tokenizer {
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("<\0"));
}
}
}
@ -434,7 +460,11 @@ class Tokenizer {
// Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back
// through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("\0"));
}
}
}
@ -1062,7 +1092,11 @@ class Tokenizer {
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("-<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("-<\0"));
}
}
}
@ -1104,7 +1138,11 @@ class Tokenizer {
# Switch to the script data escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
}
@ -1151,7 +1189,11 @@ class Tokenizer {
# Switch to the script data escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
}
@ -1313,7 +1355,11 @@ class Tokenizer {
} else {
$this->state = self::SCRIPT_DATA_ESCAPED_STATE;
}
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
# ASCII upper alpha
# ASCII lower alpha
@ -1378,7 +1424,11 @@ class Tokenizer {
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
return new CharacterToken($char.$this->data->consumeUntil("-<\0"));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil("-<\0"));
}
}
}
@ -1422,7 +1472,11 @@ class Tokenizer {
# Switch to the script data double escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
}
@ -1471,7 +1525,11 @@ class Tokenizer {
# Switch to the script data double escaped state.
# Emit the current input character as a character token.
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
}
@ -1518,7 +1576,11 @@ class Tokenizer {
} else {
$this->state = self::SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
}
return new CharacterToken($char);
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char);
} else {
return new CharacterToken($char);
}
}
# ASCII upper alpha
# ASCII lower alpha
@ -3344,7 +3406,11 @@ class Tokenizer {
// OPTIMIZATION:
// Consume all characters that aren't listed above to prevent having
// to loop back through here every single time.
return new CharacterToken($char.$this->data->consumeUntil(']'));
if (strspn($char, Data::WHITESPACE)) {
return new WhitespaceToken($char.$this->data->consumeWhile(Data::WHITESPACE));
} else {
return new CharacterToken($char.$this->data->consumeUntil(']'));
}
}
}
@ -3378,7 +3444,7 @@ class Tokenizer {
# Emit a U+005D RIGHT SQUARE BRACKET character token.
// OPTIMIZATION: Consume any additional right square brackets
return new CharacterToken($char.$this->data->consumeWhile(']'));
return new CharacterToken(']'.$this->data->consumeWhile(']'));
}
# U+003E GREATER-THAN SIGN character
elseif ($char === '>') {

52
lib/TreeBuilder.php

@ -221,7 +221,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
if ($token instanceof WhitespaceToken) {
# Ignore the token.
}
# A comment token
@ -389,7 +389,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
elseif ($token instanceof WhitespaceToken) {
# Ignore the token.
}
# A start tag whose tag name is "html"
@ -433,7 +433,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
if ($token instanceof WhitespaceToken) {
# Ignore the token.
}
# A comment token
@ -485,7 +485,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
if ($token instanceof WhitespaceToken) {
# Insert the character.
$this->insertCharacterToken($token);
}
@ -745,7 +745,7 @@ class TreeBuilder {
# A comment token
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif ($token instanceof CommentToken || ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data)))) {
elseif ($token instanceof CommentToken || $token instanceof WhitespaceToken) {
# Process the token using the rules for the "in head" insertion mode.
return $this->parseTokenInHTMLContent($token, self::IN_HEAD_MODE);
}
@ -768,7 +768,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
if ($token instanceof WhitespaceToken) {
# Insert the character.
$this->insertCharacterToken($token);
}
@ -882,23 +882,20 @@ class TreeBuilder {
}
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
#
elseif ($token instanceof WhitespaceToken) {
# Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
# Insert the token’s character.
$this->insertCharacterToken($token);
}
# Any other character token
// Space characters and any other characters are exactly the same except any
// other characters sets the frameset-ok flag to "not ok".
elseif ($token instanceof CharacterToken) {
# Reconstruct the active formatting elements, if any.
$this->activeFormattingElementsList->reconstruct();
# Insert the token’s character.
$this->insertCharacterToken($token);
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
# A comment token
elseif ($token instanceof CommentToken) {
@ -1402,18 +1399,17 @@ class TreeBuilder {
#
# When the user agent is to apply the rules for parsing tokens in foreign
# content, the user agent must handle the token as follows:
#
if ($token instanceof CharacterToken) {
# A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A),
# "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE
# Any other character token
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
# A character token that is one of U+0009 CHARACTER TABULATION, "LF" (U+000A),
# "FF" (U+000C), "CR" (U+000D), or U+0020 SPACE
if ($token instanceof WhitespaceToken) {
# Insert the token's character.
$this->insertCharacterToken($token);
}
# Any other character token
elseif ($token instanceof CharacterToken) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
# Insert the token's character.
$this->insertCharacterToken($token);
}

5
tests/cases/TestTokenizer.php

@ -12,6 +12,7 @@ use dW\HTML5\CommentToken;
use dW\HTML5\DOCTYPEToken;
use dW\HTML5\EndTagToken;
use dW\HTML5\StartTagToken;
use dW\HTML5\WhitespaceToken;
/**
* @covers \dW\HTML5\Tokenizer
@ -61,6 +62,7 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
try {
do {
$t = $tokenizer->createToken();
assert(!$t instanceof CharacterToken || ($t instanceof WhitespaceToken && strspn($t->data, Data::WHITESPACE) === strlen($t->data)) || strspn($t->data, Data::WHITESPACE) === 0, new \Exception("Character token must either consist only of whitespace, or start with other than whitespace: ".var_export($t->data ?? "''", true)));
if (!($t instanceof EOFToken)) {
$actual[] = $t;
}
@ -100,6 +102,9 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
foreach ($tokens as $t) {
if ($t instanceof CharacterToken) {
if (!$pending) {
if ($t instanceof WhitespaceToken) {
$t = new CharacterToken($t->data);
}
$pending = $t;
} else {
$pending->data .= $t->data;

Loading…
Cancel
Save