New from-scratch character reference consumer
This commit is contained in:
parent
67c7f382e2
commit
19fb541806
4 changed files with 523 additions and 29 deletions
125
RoboFile.php
125
RoboFile.php
|
@ -84,16 +84,6 @@ class RoboFile extends \Robo\Tasks {
|
|||
return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
|
||||
}
|
||||
|
||||
/** Runs the coding standards fixer */
|
||||
public function clean($opts = ['demo|d' => false]): Result {
|
||||
$t = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
|
||||
$t->arg("fix");
|
||||
if ($opts['demo']) {
|
||||
$t->args("--dry-run", "--diff")->option("--diff-format", "udiff");
|
||||
}
|
||||
return $t->run();
|
||||
}
|
||||
|
||||
protected function findCoverageEngine(): string {
|
||||
if (IS_WIN) {
|
||||
$dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
|
||||
|
@ -139,4 +129,119 @@ class RoboFile extends \Robo\Tasks {
|
|||
}
|
||||
return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
|
||||
}
|
||||
|
||||
/** Runs the coding standards fixer */
|
||||
public function clean($opts = ['demo|d' => false]): Result {
|
||||
$t = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
|
||||
$t->arg("fix");
|
||||
if ($opts['demo']) {
|
||||
$t->args("--dry-run", "--diff")->option("--diff-format", "udiff");
|
||||
}
|
||||
return $t->run();
|
||||
}
|
||||
|
||||
/** Produces the CharacterReference class file */
|
||||
public function charref() {
|
||||
$template = <<<'FILE'
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
// This file is machine-generated
|
||||
// DO NOT MODIFY
|
||||
|
||||
// To update, run ./robo charref
|
||||
|
||||
class CharacterReference {
|
||||
const LONGEST_NAME = %LONGEST%;
|
||||
const PREFIX_PATTERN = %NAMED_PATTERN%;
|
||||
const NAMES = [
|
||||
%NAMED_REFERENCES%
|
||||
];
|
||||
const C1_TABLE = [
|
||||
%C1_SUBSTITUTIONS%
|
||||
];
|
||||
}
|
||||
|
||||
FILE;
|
||||
$input = @json_decode(@file_get_contents("https://html.spec.whatwg.org/entities.json"), true);
|
||||
if (!is_array($input)) {
|
||||
throw new \Exception("Could not retrieve character reference table.");
|
||||
}
|
||||
$list = [];
|
||||
$terms = [];
|
||||
foreach ($input as $entity => $data) {
|
||||
// strip the ampersand from the entity name
|
||||
$entity = substr($entity, 1);
|
||||
// add the entity name to an array of regular expression terms
|
||||
// if the entry exists in unterminated form, compress it into one, skiping the unterminated version
|
||||
if (substr($entity, -1) === ';') {
|
||||
if (isset($input['&'.substr($entity, 0, strlen($entity) -1)])) {
|
||||
$terms[] = "$entity?";
|
||||
} else {
|
||||
$terms[] = $entity;
|
||||
}
|
||||
}
|
||||
// add a PHP-code representation of the entity name and its characters to another array
|
||||
$chars = $data['codepoints'];
|
||||
for ($a = 0; $a < sizeof($chars); $a++) {
|
||||
$chars[$a] = '\u{'.dechex($chars[$a]).'}';
|
||||
}
|
||||
$chars = implode('', $chars);
|
||||
$list[] = "'$entity'=>\"$chars\"";
|
||||
}
|
||||
// concatenate the list of entities and substitute them into the template
|
||||
$list = implode(",", $list);
|
||||
$template = str_replace('%NAMED_REFERENCES%', $list, $template);
|
||||
// prepare the list of terms as a regular expression
|
||||
// sort longest terms first
|
||||
usort($terms, function($a, $b) {
|
||||
return -1 * (strlen(preg_replace("/\W/", "", $a)) <=> strlen(preg_replace("/\W/", "", $b)));
|
||||
});
|
||||
// note the longest term
|
||||
$longest = strlen(preg_replace("/\W/", "", $terms[0]));
|
||||
$template = str_replace('%LONGEST%', $longest, $template);
|
||||
// concatenate the terms into a case-sensitive non-capturing prefix search
|
||||
$regexp = '/^(?:'.implode('|', $terms).')/';
|
||||
$template = str_replace('%NAMED_PATTERN%', var_export($regexp, true), $template);
|
||||
// Compile the C1 control substitution table
|
||||
// See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
||||
$list = [];
|
||||
$c1table = [
|
||||
0x80 => 0x20AC, // EURO SIGN (€)
|
||||
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
|
||||
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
|
||||
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
|
||||
0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
|
||||
0x86 => 0x2020, // DAGGER (†)
|
||||
0x87 => 0x2021, // DOUBLE DAGGER (‡)
|
||||
0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
|
||||
0x89 => 0x2030, // PER MILLE SIGN (‰)
|
||||
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
|
||||
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
|
||||
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
|
||||
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
|
||||
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
|
||||
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
|
||||
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
|
||||
0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
|
||||
0x95 => 0x2022, // BULLET (•)
|
||||
0x96 => 0x2013, // EN DASH (–)
|
||||
0x97 => 0x2014, // EM DASH (—)
|
||||
0x98 => 0x02DC, // SMALL TILDE (˜)
|
||||
0x99 => 0x2122, // TRADE MARK SIGN (™)
|
||||
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
|
||||
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
|
||||
0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
|
||||
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
|
||||
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
|
||||
];
|
||||
foreach ($c1table as $c1 => $code) {
|
||||
$list[] = "$c1=>$code";
|
||||
}
|
||||
$list = implode(",", $list);
|
||||
$template = str_replace('%C1_SUBSTITUTIONS%', $list, $template);
|
||||
// output the file itself
|
||||
file_put_contents(BASE."lib/CharacterReference.php", $template);
|
||||
}
|
||||
}
|
||||
|
|
19
lib/CharacterReference.php
Normal file
19
lib/CharacterReference.php
Normal file
File diff suppressed because one or more lines are too long
|
@ -49,6 +49,14 @@ class ParseError {
|
|||
const EOF_IN_CDATA = 135;
|
||||
const END_TAG_WITH_ATTRIBUTES = 136;
|
||||
const END_TAG_WITH_TRAILING_SOLIDUS = 137;
|
||||
const MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE = 138;
|
||||
const UNKNOWN_NAMED_CHARACTER_REFERENCE = 139;
|
||||
const ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE = 140;
|
||||
const NULL_CHARACTER_REFRERENCE = 141;
|
||||
const CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE = 142;
|
||||
const SURROGATE_CHARACTER_REFERENCE = 143;
|
||||
const NONCHARACTER_CHARACTER_REFERENCE = 144;
|
||||
const CONTROL_CHARACTER_REFERENCE = 145;
|
||||
|
||||
protected static $messages = [
|
||||
self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character',
|
||||
|
@ -88,6 +96,14 @@ class ParseError {
|
|||
self::EOF_IN_CDATA => 'End-of-file in CDATA section',
|
||||
self::END_TAG_WITH_ATTRIBUTES => 'End-tag with attributes',
|
||||
self::END_TAG_WITH_TRAILING_SOLIDUS => 'End-tag with trailing solidus',
|
||||
self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 'Missing semicolon after character reference',
|
||||
self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 'Unknown named character reference "%s"',
|
||||
self::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE => 'Absence of digits in character reference',
|
||||
self::NULL_CHARACTER_REFRERENCE => 'Null character reference',
|
||||
self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 'Character reference outside Unicode range',
|
||||
self::SURROGATE_CHARACTER_REFERENCE => 'Surrogate character rereference',
|
||||
self::NONCHARACTER_CHARACTER_REFERENCE => 'Non-character character reference',
|
||||
self::CONTROL_CHARACTER_REFERENCE => 'Control-character character reference',
|
||||
];
|
||||
|
||||
public function setHandler() {
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
declare(strict_types=1);
|
||||
namespace dW\HTML5;
|
||||
|
||||
use MensBeam\Intl\Encoding\UTF8;
|
||||
|
||||
class Tokenizer {
|
||||
use ParseErrorEmitter;
|
||||
|
||||
|
@ -176,9 +178,21 @@ class Tokenizer {
|
|||
self::NUMERIC_CHARACTER_REFERENCE_END_STATE => "Numeric character reference",
|
||||
];
|
||||
|
||||
const ATTRIBUTE_VALUE_STATE_SET = [
|
||||
# A character reference is said to be consumed as part of an attribute
|
||||
# if the return state is either attribute value (double-quoted) state,
|
||||
# attribute value (single-quoted) state or attribute value (unquoted) state.
|
||||
self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
|
||||
self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
|
||||
self::ATTRIBUTE_VALUE_UNQUOTED_STATE
|
||||
];
|
||||
|
||||
// Ctype constants
|
||||
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
|
||||
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
|
||||
const CTYPE_ALPHA = self::CTYPE_UPPER.'abcdefghijklmnopqrstuvwxyz';
|
||||
const CTYPE_NUM = '0123456789';
|
||||
const CTYPE_ALNUM = self::CTYPE_ALPHA.self::CTYPE_NUM;
|
||||
const CTYPE_HEX = self::CTYPE_NUM.'ABCDEFabcdef';
|
||||
|
||||
public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
|
||||
$this->state = self::DATA_STATE;
|
||||
|
@ -251,8 +265,9 @@ class Tokenizer {
|
|||
if ($char === '&') {
|
||||
# Set the return state to the data state.
|
||||
# Switch to the character reference state.
|
||||
$returnState = self::DATA_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
// DEVIATION: Character reference consumption implemented as a function
|
||||
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
|
||||
}
|
||||
# U+003C LESS-THAN SIGN (<)
|
||||
elseif ($char === '<') {
|
||||
|
@ -292,8 +307,9 @@ class Tokenizer {
|
|||
if ($char === '&') {
|
||||
# Set the return state to the RCDATA state.
|
||||
# Switch to the character reference state.
|
||||
$returnState = self::RCDATA_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
// DEVIATION: Character reference consumption implemented as a function
|
||||
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
|
||||
}
|
||||
# U+003C LESS-THAN SIGN (<)
|
||||
elseif ($char === '<') {
|
||||
|
@ -1763,8 +1779,10 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (double-quoted) state.
|
||||
# Switch to the character reference state.
|
||||
$returnState = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
// DEVIATION: Character reference consumption implemented as a function
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE);
|
||||
}
|
||||
# U+0000 NULL
|
||||
elseif ($char === "\0") {
|
||||
|
@ -1807,8 +1825,10 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (single-quoted) state.
|
||||
# Switch to the character reference state.
|
||||
$returnState = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
// DEVIATION: Character reference consumption implemented as a function
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE);
|
||||
}
|
||||
# U+0000 NULL
|
||||
elseif ($char === "\0") {
|
||||
|
@ -1855,8 +1875,10 @@ class Tokenizer {
|
|||
elseif ($char === '&') {
|
||||
# Set the return state to the attribute value (unquoted) state.
|
||||
# Switch to the character reference state.
|
||||
$returnState = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
// DEVIATION: Character reference consumption implemented as a function
|
||||
assert(isset($attribute) && $attribute instanceof TokenAttr);
|
||||
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_UNQUOTED_STATE);
|
||||
}
|
||||
# ">" (U+003E)
|
||||
elseif ($char === '>') {
|
||||
|
@ -3484,17 +3506,349 @@ class Tokenizer {
|
|||
}
|
||||
}
|
||||
|
||||
#12.2.5.72 Character reference state
|
||||
elseif ($this->state === self::CHARACTER_REFERENCE_STATE) {
|
||||
// Not implemented
|
||||
$this->state = $returnState;
|
||||
return new CharacterToken('&');
|
||||
}
|
||||
|
||||
# Not a valid state
|
||||
# Not a valid state, unimplemented, or implemented elsewhere
|
||||
else {
|
||||
throw new \Exception("Unimplemented state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected function switchToCharacterReferenceState(int $returnState): string {
|
||||
// This function implements states 72 through 80,
|
||||
// "Character reference" through "Numeric character reference end" states
|
||||
$this->state = self::CHARACTER_REFERENCE_STATE;
|
||||
|
||||
while (true) {
|
||||
assert((function() {
|
||||
$state = self::STATE_NAMES[$this->state] ?? $this->state;
|
||||
$char = bin2hex($this->data->peek(1));
|
||||
$this->debugLog .= " State: $state ($char)\n";
|
||||
return true;
|
||||
})());
|
||||
|
||||
# 12.2.5.72 Character reference state
|
||||
if ($this->state === self::CHARACTER_REFERENCE_STATE) {
|
||||
# Set the temporary buffer to the empty string.
|
||||
# Append a U+0026 AMPERSAND (&) character to the temporary buffer.
|
||||
# Consume the next input character.
|
||||
$temporaryBuffer = '&';
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII alphanumeric
|
||||
if (ctype_alnum($char)) {
|
||||
# Reconsume in the named character reference state.
|
||||
$this->state = self::NAMED_CHARACTER_REFERENCE_STATE;
|
||||
$this->data->unconsume();
|
||||
}
|
||||
# U+0023 NUMBER SIGN (#)
|
||||
elseif ($char === '#') {
|
||||
# Append the current input character to the temporary buffer.
|
||||
# Switch to the numeric character reference state.
|
||||
$temporaryBuffer .= $char;
|
||||
$this->state = self::NUMERIC_CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# Flush code points consumed as a character reference.
|
||||
# Reconsume in the return state.
|
||||
$this->state = $returnState;
|
||||
$this->data->unconsume();
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.73 Named character reference state
|
||||
elseif ($this->state === self::NAMED_CHARACTER_REFERENCE_STATE) {
|
||||
# Consume the maximum number of characters possible,
|
||||
# with the consumed characters matching one of the
|
||||
# identifiers in the first column of the named character
|
||||
# references table (in a case-sensitive manner).
|
||||
|
||||
// DEVIATION:
|
||||
// We consume all possible alphanumeric characters,
|
||||
// up to the length of the longest in the table
|
||||
$candidate = $this->data->consumeWhile(self::CTYPE_ALNUM, CharacterReference::LONGEST_NAME);
|
||||
// Keep a record of the terminating character, which is used later
|
||||
$next = $this->data->peek(1);
|
||||
if ($next === ';') {
|
||||
// consume the following character if it is a proper terminator
|
||||
$candidate .= $this->data->consume();
|
||||
}
|
||||
// Look for an exact match
|
||||
// If not found look for a prefix match if not consuming in an attribute
|
||||
$match = CharacterReference::NAMES[$candidate] ?? null;
|
||||
if (is_null($match) && !in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET)) {
|
||||
$match = (preg_match(CharacterReference::PREFIX_PATTERN, $candidate, $match)) ? $match[0] : null;
|
||||
// If a prefix match is found, unconsume to the end of the prefix
|
||||
if (!is_null($match)) {
|
||||
$this->data->unconsume(strlen($candidate) - strlen($match));
|
||||
$next = $candidate[strlen($match)];
|
||||
}
|
||||
}
|
||||
|
||||
# Append each character to the temporary buffer when it's consumed.
|
||||
$temporaryBuffer .= ($match ?? $candidate);
|
||||
|
||||
# If there is a match
|
||||
if (!is_null($match)) {
|
||||
# If the character reference was consumed as part of an attribute,
|
||||
# and the last character matched is not a U+003B SEMICOLON character (;),
|
||||
# and the next input character is either a U+003D EQUALS SIGN character (=)
|
||||
# or an ASCII alphanumeric...
|
||||
if (in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET) && $next !== ';' && ($next === '=' || ctype_alnum($next))) {
|
||||
# ... then, for historical reasons, flush code points consumed
|
||||
# as a character reference and switch to the return state.
|
||||
$this->state = $returnState;
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
# Otherwise:
|
||||
else {
|
||||
# If the last character matched is not a U+003B SEMICOLON character (;),
|
||||
# then this is a missing-semicolon-after-character-reference parse error.
|
||||
if ($next !== ';') {
|
||||
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
|
||||
}
|
||||
# Set the temporary buffer to the empty string.
|
||||
# Append one or two characters corresponding to the
|
||||
# character reference name (as given by the second
|
||||
# column of the named character references table)
|
||||
# to the temporary buffer.
|
||||
# Flush code points consumed as a character reference.
|
||||
# Switch to the return state.
|
||||
|
||||
// In other words: return the match
|
||||
$this->state = $returnState;
|
||||
return $match;
|
||||
}
|
||||
}
|
||||
# Otherwise:
|
||||
else {
|
||||
# Flush code points consumed as a character reference.
|
||||
# Switch to the ambiguous ampersand state.
|
||||
|
||||
// DEVIATION: We flush only when switching to the return state
|
||||
$this->state = self::AMBIGUOUS_AMPERSAND_STATE;
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.74 Ambiguous ampersand state
|
||||
elseif ($this->state === self::AMBIGUOUS_AMPERSAND_STATE) {
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII alphanumeric
|
||||
if (ctype_alnum($char)) {
|
||||
# If the character reference was consumed as part of an attribute,
|
||||
# then append the current input character to the current attribute's value.
|
||||
# Otherwise, emit the current input character as a character token.
|
||||
|
||||
// DEVIATION: We just continue to buffer characters until it's time to return
|
||||
// NOTE: this branch should never be reached
|
||||
$temporaryBuffer .= $char;
|
||||
}
|
||||
# U+003B SEMICOLON (;)
|
||||
elseif ($char === ';') {
|
||||
# This is an unknown-named-character-reference parse error.
|
||||
# Reconsume in the return state.
|
||||
$this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $temporaryBuffer.';');
|
||||
$this->state = $returnState;
|
||||
$this->data->unconsume();
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# Reconsume in the return state.
|
||||
$this->state = $returnState;
|
||||
$this->data->unconsume();
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.75 Numeric character reference state
|
||||
elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_STATE) {
|
||||
# Set the character reference code to zero (0).
|
||||
$charRefCode = 0;
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# U+0078 LATIN SMALL LETTER X
|
||||
#U+0058 LATIN CAPITAL LETTER X
|
||||
if ($char === 'x' || $char === 'X') {
|
||||
# Append the current input character to the temporary buffer.
|
||||
# Switch to the hexadecimal character reference start state.
|
||||
$temporaryBuffer .= $char;
|
||||
$this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# Reconsume in the decimal character reference start state.
|
||||
$this->state = self::DECIMAL_CHARACTER_REFERENCE_START_STATE;
|
||||
$this->data->unconsume();
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.76 Hexadecimal character reference start state
|
||||
elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE) {
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII hex digit
|
||||
if (ctype_xdigit($char)) {
|
||||
# Reconsume in the hexadecimal character reference state.
|
||||
|
||||
// OPTIMIZATION:
|
||||
// Just consume the digits here
|
||||
$charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_HEX));
|
||||
$this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# This is an absence-of-digits-in-numeric-character-reference parse error.
|
||||
# Flush code points consumed as a character reference.
|
||||
# Reconsume in the return state.
|
||||
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
|
||||
$this->state = $returnState;
|
||||
$this->data->unconsume();
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.77 Decimal character reference start state
|
||||
elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_START_STATE) {
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII digit
|
||||
if (ctype_digit($char)) {
|
||||
# Reconsume in the decimal character reference state.
|
||||
|
||||
// OPTIMIZATION:
|
||||
// Just consume the digits here
|
||||
$charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_NUM));
|
||||
$this->state = self::DECIMAL_CHARACTER_REFERENCE_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# This is an absence-of-digits-in-numeric-character-reference parse error.
|
||||
# Flush code points consumed as a character reference.
|
||||
# Reconsume in the return state.
|
||||
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
|
||||
$this->state = $returnState;
|
||||
$this->data->unconsume();
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.78 Hexadecimal character reference state
|
||||
elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_STATE) {
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII digit
|
||||
# ASCII upper hex digit
|
||||
# ASCII lower hex digit
|
||||
if (ctype_xdigit($char)) {
|
||||
# Multiply the character reference code by 16.
|
||||
# Add a numeric version of the current input
|
||||
# character to the character reference code.
|
||||
|
||||
// OPTIMIZATION: Combine all digit types
|
||||
// NOTE: This branch should never be reached
|
||||
$charRefCode = ($charRefCode * 16) + hexdec($char);
|
||||
}
|
||||
# U+003B SEMICOLON
|
||||
elseif ($char === ';') {
|
||||
# Switch to the numeric character reference end state.
|
||||
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# This is a missing-semicolon-after-character-reference parse error.
|
||||
# Reconsume in the numeric character reference end state.
|
||||
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
|
||||
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
|
||||
$this->data->unconsume();
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.79 Decimal character reference state
|
||||
elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_STATE) {
|
||||
# Consume the next input character.
|
||||
$char = $this->data->consume();
|
||||
|
||||
# ASCII digit
|
||||
if (ctype_digit($char)) {
|
||||
# Multiply the character reference code by 10.
|
||||
# Add a numeric version of the current input
|
||||
# character to the character reference code.
|
||||
|
||||
// OPTIMIZATION: Combine all digit types
|
||||
// NOTE: This branch should never be reached
|
||||
$charRefCode = ($charRefCode * 10) + ((int) ($char));
|
||||
}
|
||||
# U+003B SEMICOLON
|
||||
elseif ($char === ';') {
|
||||
# Switch to the numeric character reference end state.
|
||||
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
|
||||
}
|
||||
# Anything else
|
||||
else {
|
||||
# This is a missing-semicolon-after-character-reference parse error.
|
||||
# Reconsume in the numeric character reference end state.
|
||||
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
|
||||
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
|
||||
$this->data->unconsume();
|
||||
}
|
||||
}
|
||||
|
||||
# 12.2.5.80 Numeric character reference end state
|
||||
elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_END_STATE) {
|
||||
# Check the character reference code:
|
||||
|
||||
# If the number is 0x00, then this is a null-character-reference parse error.
|
||||
# Set the character reference code to 0xFFFD.
|
||||
if ($charRefCode === 0) {
|
||||
$this->error(ParseError::NULL_CHARACTER_REFRERENCE);
|
||||
$charRefCode = 0xFFFD;
|
||||
}
|
||||
# If the number is greater than 0x10FFFF, then this is a
|
||||
# character-reference-outside-unicode-range parse error.
|
||||
# Set the character reference code to 0xFFFD.
|
||||
elseif ($charRefCode > 0x10FFFF) {
|
||||
$this->error(ParseError::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE);
|
||||
$charRefCode = 0xFFFD;
|
||||
}
|
||||
# If the number is a surrogate, then this is a
|
||||
# surrogate-character-reference parse error.
|
||||
# Set the character reference code to 0xFFFD.
|
||||
elseif ($charRefCode >= 0xD800 && $charRefCode <= 0xDFFF) {
|
||||
$this->error(ParseError::SURROGATE_CHARACTER_REFERENCE);
|
||||
$charRefCode = 0xFFFD;
|
||||
}
|
||||
# If the number is a noncharacter, then this is a
|
||||
# noncharacter-character-reference parse error.
|
||||
elseif (($charRefCode >= 0xFDD0 && $charRefCode <= 0xFDEF) || ($charRefCode % 0x10000 & 0xFFFE) === 0xFFFE) {
|
||||
$this->error(ParseError::NONCHARACTER_CHARACTER_REFERENCE);
|
||||
}
|
||||
# If the number is 0x0D, or a control that's not ASCII whitespace, then
|
||||
# this is a control-character-reference parse error.
|
||||
# If the number is one of the numbers in the first column of the following
|
||||
# table, then find the row with that number in the first column, and set
|
||||
# the character reference code to the number in the second column of that row.
|
||||
elseif (($charRefCode < 0x20 && !in_array($charRefCode, [0x9, 0xA, 0xC])) || ($charRefCode >= 0x7F && $charRefCode <= 0x9F)) {
|
||||
// NOTE: Table elided
|
||||
$this->error(ParseError::CONTROL_CHARACTER_REFERENCE);
|
||||
$charRefCode = CharacterReference::C1_TABLE[$charRefCode] ?? $charRefCode;
|
||||
}
|
||||
$temporaryBuffer = UTF8::encode($charRefCode);
|
||||
$this->state = $returnState;
|
||||
return $temporaryBuffer;
|
||||
}
|
||||
|
||||
# Not a valid state, unimplemented, or implemented elsewhere
|
||||
else {
|
||||
throw new \Exception("Unimplemented character reference consumption state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue