Browse Source

Fixed an issue with tokenization of attributes

split-manual
Dustin Wilson 6 years ago
parent
commit
85894ed1ea
  1. 17
      lib/Token.php
  2. 154
      lib/Tokenizer.php

17
lib/Token.php

@ -55,7 +55,7 @@ class StartTagToken extends TagToken {
} }
public function getAttribute(string $name) { public function getAttribute(string $name) {
$key = $this->getAttributeKey($name); $key = $this->_getAttributeKey($name);
return (isset($this->attributes[$key])) ? $this->attributes[$key] : null; return (isset($this->attributes[$key])) ? $this->attributes[$key] : null;
} }
@ -65,29 +65,30 @@ class StartTagToken extends TagToken {
} }
public function removeAttribute(string $name) { public function removeAttribute(string $name) {
unset($this->attributes[$this->getAttributeKey($name)]); unset($this->attributes[$this->_getAttributeKey($name)]);
} }
public function setAttribute(string $name, string $value, string $namespace = Parser::HTML_NAMESPACE) { public function setAttribute(string $name, string $value, string $namespace = Parser::HTML_NAMESPACE) {
$key = $this->_getAttributeKey($name); $key = $this->_getAttributeKey($name);
$attribute = new TokenAttr($name, $value, $namespace);
if (is_null($key)) { if (is_null($key)) {
$this->attributes[] = $attribute; $this->attributes[] = new TokenAttr($name, $value, $namespace);
} else { } else {
$this->attributes[$key] = $attribute; $attribute = &$this->attributes[$key];
$attribute->name = $name;
$attribute->value = $value;
$attribute->namespace = $namespace;
} }
} }
private function _getAttributeKey(string $name) { private function _getAttributeKey(string $name) {
$key = null;
foreach ($this->attributes as $key => $a) { foreach ($this->attributes as $key => $a) {
if ($a->name === $name) { if ($a->name === $name) {
break; return $key;
} }
} }
return $key; return null;
} }
} }

154
lib/Tokenizer.php

@ -1474,9 +1474,13 @@ class Tokenizer {
# code point), and its value to the empty string. Switch to the attribute name # code point), and its value to the empty string. Switch to the attribute name
# state. # state.
// DEVIATION: Will use a buffer for the attribute name instead. // Need to add the current attribute to the token, if necessary.
$attributeName = strtolower($char); if ($attribute) {
$attributeValue = ''; $token->attributes[] = $attribute;
$attribute = null;
}
$attribute = new TokenAttr(strtolower($char), '');
$this->state = self::ATTRIBUTE_NAME_STATE; $this->state = self::ATTRIBUTE_NAME_STATE;
} }
# EOF # EOF
@ -1503,9 +1507,13 @@ class Tokenizer {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name'); ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name');
} }
// DEVIATION: Will use a buffer for the attribute name instead. // Need to add the current attribute to the token, if necessary.
$attributeName = $char; if ($attribute) {
$attributeValue = ''; $token->attributes[] = $attribute;
$attribute = null;
}
$attribute = new TokenAttr($char, '');
$this->state = self::ATTRIBUTE_NAME_STATE; $this->state = self::ATTRIBUTE_NAME_STATE;
} }
@ -1521,48 +1529,27 @@ class Tokenizer {
# "LF" (U+000A) # "LF" (U+000A)
# "FF" (U+000C) # "FF" (U+000C)
# U+0020 SPACE # U+0020 SPACE
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
if ($token->hasAttribute($attributeName)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName);
}
# Switch to the after attribute name state.
$this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
}
# "/" (U+002F) # "/" (U+002F)
elseif ($char === '/') { # U+003E GREATER-THAN SIGN (>)
if ($token->hasAttribute($attributeName)) { # EOF
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') {
if ($token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
} }
# Switch to the self-closing start tag state. # Reconsume in the after attribute name state.
$this->state = self::SELF_CLOSING_START_TAG_STATE; $this->data->unconsume();
$this->state = self::AFTER_ATTRIBUTE_NAME_STATE;
} }
# "=" (U+003D) # "=" (U+003D)
elseif ($char === '=') { elseif ($char === '=') {
if ($token->hasAttribute($attributeName)) { if ($token->hasAttribute($attribute->name)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name);
} }
# Switch to the before attribute value state. # Switch to the before attribute value state.
$this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE; $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
} }
# ">" (U+003E)
elseif ($char === '>') {
if ($token->hasAttribute($attributeName)) {
ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName);
}
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
// Need to add the current attribute name and value to the token if necessary.
if ($attributeName) {
$token->setAttribute($attributeName, $attributeValue);
}
return $token;
}
# Uppercase ASCII letter # Uppercase ASCII letter
elseif (ctype_upper($char)) { elseif (ctype_upper($char)) {
# Append the lowercase version of the current input character (add 0x0020 to the # Append the lowercase version of the current input character (add 0x0020 to the
@ -1570,22 +1557,14 @@ class Tokenizer {
// OPTIMIZATION: Consume all characters that are uppercase ASCII letters to prevent // OPTIMIZATION: Consume all characters that are uppercase ASCII letters to prevent
// having to loop back through here every single time. // having to loop back through here every single time.
$attributeName .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER)); $attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER));
}
# EOF
elseif ($char === '') {
# Parse error. Switch to the data state. Reconsume the EOF character.
ParseError::trigger(ParseError::UNEXPECTED_EOF, 'attribute name');
$this->state = self::DATA_STATE;
$this->data->unconsume();
} }
# U+0022 QUOTATION MARK (") # U+0022 QUOTATION MARK (")
# "'" (U+0027) # "'" (U+0027)
# "<" (U+003C) # "<" (U+003C)
# "=" (U+003D)
# Anything else # Anything else
else { else {
# Quotes, less than sign, equals: # Quotes, less than sign:
# Parse error. Treat it as per the "anything else" entry below. # Parse error. Treat it as per the "anything else" entry below.
# Anything else: # Anything else:
# Append the current input character to the current attribute's name. # Append the current input character to the current attribute's name.
@ -1598,7 +1577,7 @@ class Tokenizer {
// characters. // characters.
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time. // to loop back through here every single time.
$attributeName .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER); $attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER);
} }
# When the user agent leaves the attribute name state (and before emitting the tag # When the user agent leaves the attribute name state (and before emitting the tag
@ -1625,26 +1604,26 @@ class Tokenizer {
if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
# Ignore the character. # Ignore the character.
} }
# "/" (U+002F) # U+002F SOLIDUS (/)
elseif ($char === '/') { elseif ($char === '/') {
# Switch to the self-closing start tag state. # Switch to the self-closing start tag state.
$this->state = self::SELF_CLOSING_START_TAG_STATE; $this->state = self::SELF_CLOSING_START_TAG_STATE;
} }
# "=" (U+003D) # U+003D EQUALS SIGN (=)
elseif ($char === '=') { elseif ($char === '=') {
# Switch to the before attribute value state. # Switch to the before attribute value state.
$this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE; $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE;
} }
# ">" (U+003E) # U+003E GREATER-THAN SIGN (>)
elseif ($char === '>') { elseif ($char === '>') {
# Switch to the data state. Emit the current tag token. // Need to add the current attribute to the token, if necessary.
$this->state = self::DATA_STATE; if ($attribute) {
$token->attributes[] = $attribute;
// Need to add the current attribute name and value to the token if necessary. $attribute = null;
if ($attributeName) {
$token->setAttribute($attributeName, $attributeValue);
} }
# Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE;
return $token; return $token;
} }
# Uppercase ASCII letter # Uppercase ASCII letter
@ -1654,9 +1633,13 @@ class Tokenizer {
# code point), and its value to the empty string. Switch to the attribute name # code point), and its value to the empty string. Switch to the attribute name
# state. # state.
// DEVIATION: Will use a buffer for the attribute name instead. // Need to add the current attribute to the token, if necessary.
$attributeName = strtolower($char); if ($attribute) {
$attributeValue = ''; $token->attributes[] = $attribute;
$attribute = null;
}
$attribute = new TokenAttr(strtolower($char), '');
$this->state = self::ATTRIBUTE_NAME_STATE; $this->state = self::ATTRIBUTE_NAME_STATE;
} }
# EOF # EOF
@ -1683,8 +1666,13 @@ class Tokenizer {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name, attribute value, or tag end'); ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name, attribute value, or tag end');
} }
$attributeName = $char; // Need to add the current attribute to the token, if necessary.
$attributeValue = ''; if ($attribute) {
$token->attributes[] = $attribute;
$attribute = null;
}
$attribute = new TokenAttr($char, '');
$this->state = self::ATTRIBUTE_NAME_STATE; $this->state = self::ATTRIBUTE_NAME_STATE;
} }
@ -1726,9 +1714,10 @@ class Tokenizer {
ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG, 'attribute value'); ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG, 'attribute value');
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
// Need to add the current attribute name and value to the token if necessary. // Need to add the current attribute to the token, if necessary.
if ($attributeName) { if ($attribute) {
$token->setAttribute($attributeName, $attributeValue); $token->attributes[] = $attribute;
$attribute = null;
} }
return $token; return $token;
@ -1755,7 +1744,7 @@ class Tokenizer {
ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute value'); ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute value');
} }
$attributeValue .= $char; $attribute->value .= $char;
$this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE; $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
} }
@ -1786,7 +1775,7 @@ class Tokenizer {
// DEVIATION: This implementation does the character reference consuming in a // DEVIATION: This implementation does the character reference consuming in a
// function for which it is more suited for. // function for which it is more suited for.
$attributeValue .= $this->data->consumeCharacterReference('"', true); $attribute->value .= $this->data->consumeCharacterReference('"', true);
} }
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
@ -1800,7 +1789,7 @@ class Tokenizer {
# Append the current input character to the current attribute's value. # Append the current input character to the current attribute's value.
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time. // to loop back through here every single time.
$attributeValue .= $char.$this->data->consumeUntil('"&'); $attribute->value .= $char.$this->data->consumeUntil('"&');
} }
continue; continue;
@ -1831,7 +1820,7 @@ class Tokenizer {
# DEVIATION: This implementation does the character reference consuming in a # DEVIATION: This implementation does the character reference consuming in a
# function for which it is more suited for. # function for which it is more suited for.
$attributeValue .= $this->data->consumeCharacterReference("'", true); $attribute->value .= $this->data->consumeCharacterReference("'", true);
} }
# EOF # EOF
elseif ($char === '') { elseif ($char === '') {
@ -1846,7 +1835,7 @@ class Tokenizer {
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time. // to loop back through here every single time.
$attributeValue .= $char.$this->data->consumeUntil("'&"); $attribute->value .= $char.$this->data->consumeUntil("'&");
} }
continue; continue;
@ -1883,16 +1872,17 @@ class Tokenizer {
// DEVIATION: This implementation does the character reference consuming in a // DEVIATION: This implementation does the character reference consuming in a
// function for which it is more suited for. // function for which it is more suited for.
$attributeValue .= $this->data->consumeCharacterReference('>', true); $attribute->value .= $this->data->consumeCharacterReference('>', true);
} }
# ">" (U+003E) # ">" (U+003E)
elseif ($char === '>') { elseif ($char === '>') {
# Switch to the data state. Emit the current tag token. # Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
// Need to add the current attribute name and value to the token if necessary. // Need to add the current attribute to the token, if necessary.
if ($attributeName) { if ($attribute) {
$token->setAttribute($attributeName, $attributeValue); $token->attributes[] = $attribute;
$attribute = null;
} }
return $token; return $token;
@ -1921,7 +1911,7 @@ class Tokenizer {
// OPTIMIZATION: Consume all characters that aren't listed above to prevent having // OPTIMIZATION: Consume all characters that aren't listed above to prevent having
// to loop back through here every single time. // to loop back through here every single time.
$attributeValue .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`"); $attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`");
} }
continue; continue;
@ -1950,9 +1940,10 @@ class Tokenizer {
# Switch to the data state. Emit the current tag token. # Switch to the data state. Emit the current tag token.
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
// Need to add the current attribute name and value to the token if necessary. // Need to add the current attribute to the token, if necessary.
if ($attributeName) { if ($attribute) {
$token->setAttribute($attributeName, $attributeValue); $token->attributes[] = $attribute;
$attribute = null;
} }
return $token; return $token;
@ -1986,6 +1977,13 @@ class Tokenizer {
# Emit the current tag token. # Emit the current tag token.
$token->selfClosing = true; $token->selfClosing = true;
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
// Need to add the current attribute to the token, if necessary.
if ($attribute) {
$token->attributes[] = $attribute;
$attribute = null;
}
return $token; return $token;
} }
# EOF # EOF

Loading…
Cancel
Save