Browse Source

Character reference fixes

One test in the "entities.test" file is still failing
split-manual
J. King 4 years ago
parent
commit
e8f35e92fb
  1. 1
      .gitignore
  2. 12
      lib/Tokenizer.php
  3. 1
      tests/cases/TestTokenizer.php

1
.gitignore

@ -70,3 +70,4 @@ $RECYCLE.BIN/
/vendor-bin/*/vendor
/tests/html5lib-tests
/tests/.phpunit.result.cache
/tests/coverage

12
lib/Tokenizer.php

@ -267,7 +267,7 @@ class Tokenizer {
# Switch to the character reference state.
// DEVIATION: Character reference consumption implemented as a function
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
return new CharacterToken($this->switchToCharacterReferenceState(self::DATA_STATE));
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
@ -3579,15 +3579,16 @@ class Tokenizer {
$match = CharacterReference::NAMES[$candidate] ?? null;
if (is_null($match) && !in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET)) {
$match = (preg_match(CharacterReference::PREFIX_PATTERN, $candidate, $match)) ? $match[0] : null;
// If a prefix match is found, unconsume to the end of the prefix
// If a prefix match is found, unconsume to the end of the prefix and look up the entry in the table
if (!is_null($match)) {
$this->data->unconsume(strlen($candidate) - strlen($match));
$next = $candidate[strlen($match)];
$match = CharacterReference::NAMES[$match];
}
}
# Append each character to the temporary buffer when it's consumed.
$temporaryBuffer .= ($match ?? $candidate);
$temporaryBuffer .= $candidate;
# If there is a match
if (!is_null($match)) {
@ -3643,8 +3644,7 @@ class Tokenizer {
# Otherwise, emit the current input character as a character token.
// DEVIATION: We just continue to buffer characters until it's time to return
// NOTE: this branch should never be reached
$temporaryBuffer .= $char;
$temporaryBuffer .= $char.$this->data->consumeWhile(self::CTYPE_ALNUM);
}
# U+003B SEMICOLON (;)
elseif ($char === ';') {
@ -3724,7 +3724,7 @@ class Tokenizer {
// OPTIMIZATION:
// Just consume the digits here
$charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_NUM));
$charRefCode = (int) ($char.$this->data->consumeWhile(self::CTYPE_NUM));
$this->state = self::DECIMAL_CHARACTER_REFERENCE_STATE;
}
# Anything else

1
tests/cases/TestTokenizer.php

@ -8,6 +8,7 @@ use dW\HTML5\OpenElementsStack;
use dW\HTML5\ParseError;
use dW\HTML5\Tokenizer;
/** @covers \dW\HTML5\Tokenizer */
class TestTokenizer extends \dW\HTML5\Test\StandardTest {
const DEBUG = false;

Loading…
Cancel
Save