diff --git a/lib/Token.php b/lib/Token.php index 447196d..fa5252b 100644 --- a/lib/Token.php +++ b/lib/Token.php @@ -55,7 +55,7 @@ class StartTagToken extends TagToken { } public function getAttribute(string $name) { - $key = $this->getAttributeKey($name); + $key = $this->_getAttributeKey($name); return (isset($this->attributes[$key])) ? $this->attributes[$key] : null; } @@ -65,29 +65,30 @@ class StartTagToken extends TagToken { } public function removeAttribute(string $name) { - unset($this->attributes[$this->getAttributeKey($name)]); + unset($this->attributes[$this->_getAttributeKey($name)]); } public function setAttribute(string $name, string $value, string $namespace = Parser::HTML_NAMESPACE) { $key = $this->_getAttributeKey($name); - $attribute = new TokenAttr($name, $value, $namespace); if (is_null($key)) { - $this->attributes[] = $attribute; + $this->attributes[] = new TokenAttr($name, $value, $namespace); } else { - $this->attributes[$key] = $attribute; + $attribute = &$this->attributes[$key]; + $attribute->name = $name; + $attribute->value = $value; + $attribute->namespace = $namespace; } } private function _getAttributeKey(string $name) { - $key = null; foreach ($this->attributes as $key => $a) { if ($a->name === $name) { - break; + return $key; } } - return $key; + return null; } } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index e35f2bd..80d3544 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -1474,9 +1474,13 @@ class Tokenizer { # code point), and its value to the empty string. Switch to the attribute name # state. - // DEVIATION: Will use a buffer for the attribute name instead. - $attributeName = strtolower($char); - $attributeValue = ''; + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; + } + + $attribute = new TokenAttr(strtolower($char), ''); $this->state = self::ATTRIBUTE_NAME_STATE; } # EOF @@ -1503,9 +1507,13 @@ class Tokenizer { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name'); } - // DEVIATION: Will use a buffer for the attribute name instead. - $attributeName = $char; - $attributeValue = ''; + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; + } + + $attribute = new TokenAttr($char, ''); $this->state = self::ATTRIBUTE_NAME_STATE; } @@ -1521,48 +1529,27 @@ class Tokenizer { # "LF" (U+000A) # "FF" (U+000C) # U+0020 SPACE - if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { - if ($token->hasAttribute($attributeName)) { - ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); - } - - # Switch to the after attribute name state. - $this->state = self::AFTER_ATTRIBUTE_NAME_STATE; - } # "/" (U+002F) - elseif ($char === '/') { - if ($token->hasAttribute($attributeName)) { - ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); + # U+003E GREATER-THAN SIGN (>) + # EOF + if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' || $char === '/' || $char === '>' || $char === '') { + if ($token->hasAttribute($attribute->name)) { + ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name); } - # Switch to the self-closing start tag state. - $this->state = self::SELF_CLOSING_START_TAG_STATE; + # Reconsume in the after attribute name state. + $this->data->unconsume(); + $this->state = self::AFTER_ATTRIBUTE_NAME_STATE; } # "=" (U+003D) elseif ($char === '=') { - if ($token->hasAttribute($attributeName)) { - ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); + if ($token->hasAttribute($attribute->name)) { + ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attribute->name); } # Switch to the before attribute value state. $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE; } - # ">" (U+003E) - elseif ($char === '>') { - if ($token->hasAttribute($attributeName)) { - ParseError::trigger(ParseError::ATTRIBUTE_EXISTS, $attributeName); - } - - # Switch to the data state. Emit the current tag token. - $this->state = self::DATA_STATE; - - // Need to add the current attribute name and value to the token if necessary. - if ($attributeName) { - $token->setAttribute($attributeName, $attributeValue); - } - - return $token; - } # Uppercase ASCII letter elseif (ctype_upper($char)) { # Append the lowercase version of the current input character (add 0x0020 to the @@ -1570,22 +1557,14 @@ class Tokenizer { // OPTIMIZATION: Consume all characters that are uppercase ASCII letters to prevent // having to loop back through here every single time. - $attributeName .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER)); - } - # EOF - elseif ($char === '') { - # Parse error. Switch to the data state. Reconsume the EOF character. - ParseError::trigger(ParseError::UNEXPECTED_EOF, 'attribute name'); - $this->state = self::DATA_STATE; - $this->data->unconsume(); + $attribute->name .= strtolower($char.$this->data->consumeWhile(self::CTYPE_UPPER)); } # U+0022 QUOTATION MARK (") # "'" (U+0027) # "<" (U+003C) - # "=" (U+003D) # Anything else else { - # Quotes, less than sign, equals: + # Quotes, less than sign: # Parse error. Treat it as per the "anything else" entry below. # Anything else: # Append the current input character to the current attribute's name. @@ -1598,7 +1577,7 @@ class Tokenizer { // characters. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - $attributeName .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER); + $attribute->name .= $char.$this->data->consumeUntil("\t\n\x0c /=>\"'<".self::CTYPE_UPPER); } # When the user agent leaves the attribute name state (and before emitting the tag @@ -1625,26 +1604,26 @@ class Tokenizer { if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { # Ignore the character. } - # "/" (U+002F) + # U+002F SOLIDUS (/) elseif ($char === '/') { # Switch to the self-closing start tag state. $this->state = self::SELF_CLOSING_START_TAG_STATE; } - # "=" (U+003D) + # U+003D EQUALS SIGN (=) elseif ($char === '=') { # Switch to the before attribute value state. $this->state = self::BEFORE_ATTRIBUTE_VALUE_STATE; } - # ">" (U+003E) + # U+003E GREATER-THAN SIGN (>) elseif ($char === '>') { - # Switch to the data state. Emit the current tag token. - $this->state = self::DATA_STATE; - - // Need to add the current attribute name and value to the token if necessary. - if ($attributeName) { - $token->setAttribute($attributeName, $attributeValue); + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; } + # Switch to the data state. Emit the current tag token. + $this->state = self::DATA_STATE; return $token; } # Uppercase ASCII letter @@ -1654,9 +1633,13 @@ class Tokenizer { # code point), and its value to the empty string. Switch to the attribute name # state. - // DEVIATION: Will use a buffer for the attribute name instead. - $attributeName = strtolower($char); - $attributeValue = ''; + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; + } + + $attribute = new TokenAttr(strtolower($char), ''); $this->state = self::ATTRIBUTE_NAME_STATE; } # EOF @@ -1683,8 +1666,13 @@ class Tokenizer { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute name, attribute value, or tag end'); } - $attributeName = $char; - $attributeValue = ''; + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; + } + + $attribute = new TokenAttr($char, ''); $this->state = self::ATTRIBUTE_NAME_STATE; } @@ -1726,9 +1714,10 @@ class Tokenizer { ParseError::trigger(ParseError::UNEXPECTED_END_OF_TAG, 'attribute value'); $this->state = self::DATA_STATE; - // Need to add the current attribute name and value to the token if necessary. - if ($attributeName) { - $token->setAttribute($attributeName, $attributeValue); + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; } return $token; @@ -1755,7 +1744,7 @@ class Tokenizer { ParseError::trigger(ParseError::UNEXPECTED_CHARACTER, $char, 'attribute value'); } - $attributeValue .= $char; + $attribute->value .= $char; $this->state = self::ATTRIBUTE_VALUE_UNQUOTED_STATE; } @@ -1786,7 +1775,7 @@ class Tokenizer { // DEVIATION: This implementation does the character reference consuming in a // function for which it is more suited for. - $attributeValue .= $this->data->consumeCharacterReference('"', true); + $attribute->value .= $this->data->consumeCharacterReference('"', true); } # EOF elseif ($char === '') { @@ -1800,7 +1789,7 @@ class Tokenizer { # Append the current input character to the current attribute's value. // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - $attributeValue .= $char.$this->data->consumeUntil('"&'); + $attribute->value .= $char.$this->data->consumeUntil('"&'); } continue; @@ -1831,7 +1820,7 @@ class Tokenizer { # DEVIATION: This implementation does the character reference consuming in a # function for which it is more suited for. - $attributeValue .= $this->data->consumeCharacterReference("'", true); + $attribute->value .= $this->data->consumeCharacterReference("'", true); } # EOF elseif ($char === '') { @@ -1846,7 +1835,7 @@ class Tokenizer { // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - $attributeValue .= $char.$this->data->consumeUntil("'&"); + $attribute->value .= $char.$this->data->consumeUntil("'&"); } continue; @@ -1883,16 +1872,17 @@ class Tokenizer { // DEVIATION: This implementation does the character reference consuming in a // function for which it is more suited for. - $attributeValue .= $this->data->consumeCharacterReference('>', true); + $attribute->value .= $this->data->consumeCharacterReference('>', true); } # ">" (U+003E) elseif ($char === '>') { # Switch to the data state. Emit the current tag token. $this->state = self::DATA_STATE; - // Need to add the current attribute name and value to the token if necessary. - if ($attributeName) { - $token->setAttribute($attributeName, $attributeValue); + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; } return $token; @@ -1921,7 +1911,7 @@ class Tokenizer { // OPTIMIZATION: Consume all characters that aren't listed above to prevent having // to loop back through here every single time. - $attributeValue .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`"); + $attribute->value .= $char.$this->data->consumeUntil("\t\n\x0c &>\"'<=`"); } continue; @@ -1950,9 +1940,10 @@ class Tokenizer { # Switch to the data state. Emit the current tag token. $this->state = self::DATA_STATE; - // Need to add the current attribute name and value to the token if necessary. - if ($attributeName) { - $token->setAttribute($attributeName, $attributeValue); + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; } return $token; @@ -1986,6 +1977,13 @@ class Tokenizer { # Emit the current tag token. $token->selfClosing = true; $this->state = self::DATA_STATE; + + // Need to add the current attribute to the token, if necessary. + if ($attribute) { + $token->attributes[] = $attribute; + $attribute = null; + } + return $token; } # EOF