New from-scratch character reference consumer

This commit is contained in:
J. King 2019-12-16 22:39:16 -05:00
parent 67c7f382e2
commit 19fb541806
4 changed files with 523 additions and 29 deletions

View file

@ -84,16 +84,6 @@ class RoboFile extends \Robo\Tasks {
return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
}
/** Runs the coding standards fixer */
public function clean($opts = ['demo|d' => false]): Result {
    // Build the "fix" invocation of the php-cs-fixer binary found under BASE
    $fixer = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"))->arg("fix");
    // With the demo option set, the fixer is asked for a dry run which
    // prints the changes as a unified diff
    if ($opts['demo']) {
        $fixer->args("--dry-run", "--diff");
        $fixer->option("--diff-format", "udiff");
    }
    return $fixer->run();
}
protected function findCoverageEngine(): string {
if (IS_WIN) {
$dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
@ -139,4 +129,119 @@ class RoboFile extends \Robo\Tasks {
}
return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
}
/** Runs the coding standards fixer */
public function clean($opts = ['demo|d' => false]): Result {
    // Build the "fix" invocation of the php-cs-fixer binary found under BASE
    $fixer = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"))->arg("fix");
    // With the demo option set, the fixer is asked for a dry run which
    // prints the changes as a unified diff
    if ($opts['demo']) {
        $fixer->args("--dry-run", "--diff");
        $fixer->option("--diff-format", "udiff");
    }
    return $fixer->run();
}
/**
 * Produces the CharacterReference class file
 *
 * The named character reference table is downloaded as JSON from the
 * WHATWG Web site and compiled into PHP source code together with a
 * prefix-matching regular expression, the length of the longest name,
 * and the substitution table for C1 control codes. The result is written
 * to lib/CharacterReference.php under BASE.
 *
 * @throws \Exception If the table cannot be retrieved or is empty, or if the output file cannot be written
 */
public function charref() {
    // The nowdoc body is kept flush with the left margin so that it is
    // valid (and yields the same text) in every PHP version
    $template = <<<'FILE'
<?php
declare(strict_types=1);
namespace dW\HTML5;
// This file is machine-generated
// DO NOT MODIFY
// To update, run ./robo charref
class CharacterReference {
const LONGEST_NAME = %LONGEST%;
const PREFIX_PATTERN = %NAMED_PATTERN%;
const NAMES = [
%NAMED_REFERENCES%
];
const C1_TABLE = [
%C1_SUBSTITUTIONS%
];
}
FILE;
    // retrieve the table; a failed download yields false, which must not
    // be handed to json_decode (a TypeError when strict types are in effect)
    $json = @file_get_contents("https://html.spec.whatwg.org/entities.json");
    $input = is_string($json) ? json_decode($json, true) : null;
    if (!is_array($input) || !$input) {
        throw new \Exception("Could not retrieve character reference table.");
    }
    $list = [];
    $terms = [];
    foreach ($input as $entity => $data) {
        // strip the ampersand from the entity name
        $entity = substr($entity, 1);
        // add the entity name to an array of regular expression terms
        // if the entry exists in unterminated form, compress it into one, skipping the unterminated version
        if (substr($entity, -1) === ';') {
            if (isset($input['&'.substr($entity, 0, -1)])) {
                $terms[] = "$entity?";
            } else {
                $terms[] = $entity;
            }
        }
        // add a PHP-code representation of the entity name and its characters to another array
        $chars = '';
        foreach ($data['codepoints'] as $code) {
            $chars .= '\u{'.dechex($code).'}';
        }
        $list[] = "'$entity'=>\"$chars\"";
    }
    if (!$terms) {
        // without at least one terminated name there is nothing to build a pattern from
        throw new \Exception("Character reference table contains no usable entity names.");
    }
    // concatenate the list of entities and substitute them into the template
    $list = implode(",", $list);
    $template = str_replace('%NAMED_REFERENCES%', $list, $template);
    // prepare the list of terms as a regular expression
    // the length of a term is that of its name alone, without ";" or "?"
    $nameLength = static function(string $term): int {
        return strlen(preg_replace("/\W/", "", $term));
    };
    // sort longest terms first
    usort($terms, static function($a, $b) use ($nameLength) {
        return $nameLength($b) <=> $nameLength($a);
    });
    // note the longest term; the number is cast because str_replace
    // does not accept an integer replacement under strict types
    $longest = $nameLength($terms[0]);
    $template = str_replace('%LONGEST%', (string) $longest, $template);
    // concatenate the terms into a case-sensitive non-capturing prefix search
    $regexp = '/^(?:'.implode('|', $terms).')/';
    $template = str_replace('%NAMED_PATTERN%', var_export($regexp, true), $template);
    // Compile the C1 control substitution table
    // See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
    $list = [];
    $c1table = [
        0x80 => 0x20AC, // EURO SIGN (€)
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
        0x86 => 0x2020, // DAGGER (†)
        0x87 => 0x2021, // DOUBLE DAGGER (‡)
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
        0x89 => 0x2030, // PER MILLE SIGN (‰)
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
        0x95 => 0x2022, // BULLET (•)
        0x96 => 0x2013, // EN DASH (–)
        0x97 => 0x2014, // EM DASH (—)
        0x98 => 0x02DC, // SMALL TILDE (˜)
        0x99 => 0x2122, // TRADE MARK SIGN (™)
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
    ];
    foreach ($c1table as $c1 => $code) {
        // both numbers are written out in decimal
        $list[] = "$c1=>$code";
    }
    $list = implode(",", $list);
    $template = str_replace('%C1_SUBSTITUTIONS%', $list, $template);
    // output the file itself
    if (file_put_contents(BASE."lib/CharacterReference.php", $template) === false) {
        throw new \Exception("Could not write the CharacterReference class file.");
    }
}
}

File diff suppressed because one or more lines are too long

View file

@ -49,6 +49,14 @@ class ParseError {
const EOF_IN_CDATA = 135;
const END_TAG_WITH_ATTRIBUTES = 136;
const END_TAG_WITH_TRAILING_SOLIDUS = 137;
const MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE = 138;
const UNKNOWN_NAMED_CHARACTER_REFERENCE = 139;
const ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE = 140;
const NULL_CHARACTER_REFRERENCE = 141;
const CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE = 142;
const SURROGATE_CHARACTER_REFERENCE = 143;
const NONCHARACTER_CHARACTER_REFERENCE = 144;
const CONTROL_CHARACTER_REFERENCE = 145;
protected static $messages = [
self::UNEXPECTED_NULL_CHARACTER => 'Unexpected null character',
@ -88,6 +96,14 @@ class ParseError {
self::EOF_IN_CDATA => 'End-of-file in CDATA section',
self::END_TAG_WITH_ATTRIBUTES => 'End-tag with attributes',
self::END_TAG_WITH_TRAILING_SOLIDUS => 'End-tag with trailing solidus',
self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 'Missing semicolon after character reference',
self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 'Unknown named character reference "%s"',
self::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE => 'Absence of digits in character reference',
self::NULL_CHARACTER_REFRERENCE => 'Null character reference',
self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 'Character reference outside Unicode range',
self::SURROGATE_CHARACTER_REFERENCE => 'Surrogate character rereference',
self::NONCHARACTER_CHARACTER_REFERENCE => 'Non-character character reference',
self::CONTROL_CHARACTER_REFERENCE => 'Control-character character reference',
];
public function setHandler() {

View file

@ -2,6 +2,8 @@
declare(strict_types=1);
namespace dW\HTML5;
use MensBeam\Intl\Encoding\UTF8;
class Tokenizer {
use ParseErrorEmitter;
@ -176,9 +178,21 @@ class Tokenizer {
self::NUMERIC_CHARACTER_REFERENCE_END_STATE => "Numeric character reference",
];
const ATTRIBUTE_VALUE_STATE_SET = [
# A character reference is said to be consumed as part of an attribute
# if the return state is either attribute value (double-quoted) state,
# attribute value (single-quoted) state or attribute value (unquoted) state.
self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
self::ATTRIBUTE_VALUE_UNQUOTED_STATE
];
// Ctype constants
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
const CTYPE_ALPHA = self::CTYPE_UPPER.'abcdefghijklmnopqrstuvwxyz';
const CTYPE_NUM = '0123456789';
const CTYPE_ALNUM = self::CTYPE_ALPHA.self::CTYPE_NUM;
const CTYPE_HEX = self::CTYPE_NUM.'ABCDEFabcdef';
public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
$this->state = self::DATA_STATE;
@ -251,8 +265,9 @@ class Tokenizer {
if ($char === '&') {
# Set the return state to the data state.
# Switch to the character reference state.
$returnState = self::DATA_STATE;
$this->state = self::CHARACTER_REFERENCE_STATE;
// DEVIATION: Character reference consumption implemented as a function
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
@ -292,8 +307,9 @@ class Tokenizer {
if ($char === '&') {
# Set the return state to the RCDATA state.
# Switch to the character reference state.
$returnState = self::RCDATA_STATE;
$this->state = self::CHARACTER_REFERENCE_STATE;
// DEVIATION: Character reference consumption implemented as a function
return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
}
# U+003C LESS-THAN SIGN (<)
elseif ($char === '<') {
@ -1763,8 +1779,10 @@ class Tokenizer {
elseif ($char === '&') {
# Set the return state to the attribute value (double-quoted) state.
# Switch to the character reference state.
$returnState = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
$this->state = self::CHARACTER_REFERENCE_STATE;
// DEVIATION: Character reference consumption implemented as a function
assert(isset($attribute) && $attribute instanceof TokenAttr);
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE);
}
# U+0000 NULL
elseif ($char === "\0") {
@ -1807,8 +1825,10 @@ class Tokenizer {
elseif ($char === '&') {
# Set the return state to the attribute value (single-quoted) state.
# Switch to the character reference state.
$returnState = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
$this->state = self::CHARACTER_REFERENCE_STATE;
// DEVIATION: Character reference consumption implemented as a function
assert(isset($attribute) && $attribute instanceof TokenAttr);
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE);
}
# U+0000 NULL
elseif ($char === "\0") {
@ -1855,8 +1875,10 @@ class Tokenizer {
elseif ($char === '&') {
# Set the return state to the attribute value (unquoted) state.
# Switch to the character reference state.
$returnState = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
$this->state = self::CHARACTER_REFERENCE_STATE;
// DEVIATION: Character reference consumption implemented as a function
assert(isset($attribute) && $attribute instanceof TokenAttr);
$attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_UNQUOTED_STATE);
}
# ">" (U+003E)
elseif ($char === '>') {
@ -3484,17 +3506,349 @@ class Tokenizer {
}
}
#12.2.5.72 Character reference state
elseif ($this->state === self::CHARACTER_REFERENCE_STATE) {
// Not implemented
$this->state = $returnState;
return new CharacterToken('&');
}
# Not a valid state
# Not a valid state, unimplemented, or implemented elsewhere
else {
throw new \Exception("Unimplemented state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
}
}
}
/**
 * Consumes a character reference and returns the text it stands for
 *
 * This function implements tokenizer states 72 through 80 ("Character
 * reference" through "Numeric character reference end") in one place
 * rather than as steps of the main tokenizer loop. The callers invoke it
 * right after consuming a U+0026 AMPERSAND (&); on return the tokenizer
 * state has been set to the supplied return state.
 *
 * @param int $returnState The tokenizer state to switch to once consumption is finished. When it is one of the attribute value states, unterminated named references followed by "=" or an alphanumeric are left undecoded
 * @return string The decoded character(s) if a reference was recognized, otherwise the literal text which was consumed, beginning with "&"
 * @throws \Exception If the tokenizer state is not one of those implemented here
 */
protected function switchToCharacterReferenceState(int $returnState): string {
    $this->state = self::CHARACTER_REFERENCE_STATE;
    # A character reference is said to be consumed as part of an attribute
    # if the return state is either attribute value (double-quoted) state,
    # attribute value (single-quoted) state or attribute value (unquoted) state.
    $inAttribute = in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET, true);
    // Both are always set by an earlier state before being read; they
    // are initialized here only so that this is evident
    $temporaryBuffer = '';
    $charRefCode = 0;

    while (true) {
        assert((function() {
            $state = self::STATE_NAMES[$this->state] ?? $this->state;
            $char = bin2hex($this->data->peek(1));
            $this->debugLog .= " State: $state ($char)\n";
            return true;
        })());

        # 12.2.5.72 Character reference state
        if ($this->state === self::CHARACTER_REFERENCE_STATE) {
            # Set the temporary buffer to the empty string.
            # Append a U+0026 AMPERSAND (&) character to the temporary buffer.
            # Consume the next input character.
            $temporaryBuffer = '&';
            $char = $this->data->consume();

            # ASCII alphanumeric
            if (ctype_alnum($char)) {
                # Reconsume in the named character reference state.
                $this->state = self::NAMED_CHARACTER_REFERENCE_STATE;
                $this->data->unconsume();
            }
            # U+0023 NUMBER SIGN (#)
            elseif ($char === '#') {
                # Append the current input character to the temporary buffer.
                # Switch to the numeric character reference state.
                $temporaryBuffer .= $char;
                $this->state = self::NUMERIC_CHARACTER_REFERENCE_STATE;
            }
            # Anything else
            else {
                # Flush code points consumed as a character reference.
                # Reconsume in the return state.
                $this->state = $returnState;
                $this->data->unconsume();
                return $temporaryBuffer;
            }
        }

        # 12.2.5.73 Named character reference state
        elseif ($this->state === self::NAMED_CHARACTER_REFERENCE_STATE) {
            # Consume the maximum number of characters possible,
            # with the consumed characters matching one of the
            # identifiers in the first column of the named character
            # references table (in a case-sensitive manner).
            // DEVIATION:
            // We consume all possible alphanumeric characters,
            // up to the length of the longest in the table
            $candidate = $this->data->consumeWhile(self::CTYPE_ALNUM, CharacterReference::LONGEST_NAME);
            // Keep a record of the terminating character, which is used later
            $next = $this->data->peek(1);
            if ($next === ';') {
                // consume the following character if it is a proper terminator
                $candidate .= $this->data->consume();
            }
            // Look for an exact match; $name is the identifier which was
            // matched and $chars the character(s) it stands for
            $name = $candidate;
            $chars = CharacterReference::NAMES[$name] ?? null;
            // If not found look for the longest identifier which is a prefix of the candidate
            if (is_null($chars) && preg_match(CharacterReference::PREFIX_PATTERN, $candidate, $found)) {
                $prefixChars = CharacterReference::NAMES[$found[0]] ?? null;
                if (!is_null($prefixChars) && strlen($found[0]) < strlen($candidate)) {
                    $name = $found[0];
                    $chars = $prefixChars;
                    // unconsume to the end of the prefix; the next input
                    // character is then the one which follows the prefix
                    $this->data->unconsume(strlen($candidate) - strlen($name));
                    $next = $candidate[strlen($name)];
                }
            }

            # If there is a match
            if (!is_null($chars)) {
                # Append each character to the temporary buffer when it's consumed.
                $temporaryBuffer .= $name;
                $terminated = (substr($name, -1) === ';');
                # If the character reference was consumed as part of an attribute,
                # and the last character matched is not a U+003B SEMICOLON character (;),
                # and the next input character is either a U+003D EQUALS SIGN character (=)
                # or an ASCII alphanumeric...
                if ($inAttribute && !$terminated && ($next === '=' || ctype_alnum($next))) {
                    # ... then, for historical reasons, flush code points consumed
                    # as a character reference and switch to the return state.
                    $this->state = $returnState;
                    return $temporaryBuffer;
                }
                # Otherwise:
                # If the last character matched is not a U+003B SEMICOLON character (;),
                # then this is a missing-semicolon-after-character-reference parse error.
                if (!$terminated) {
                    $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
                }
                # Set the temporary buffer to the empty string.
                # Append one or two characters corresponding to the
                # character reference name (as given by the second
                # column of the named character references table)
                # to the temporary buffer.
                # Flush code points consumed as a character reference.
                # Switch to the return state.
                // In other words: return the characters the name stands for
                $this->state = $returnState;
                return $chars;
            }
            # Otherwise:
            else {
                # Flush code points consumed as a character reference.
                # Switch to the ambiguous ampersand state.
                // DEVIATION: We flush only when switching to the return state
                // A semicolon consumed above is handed back so that the
                // ambiguous ampersand state gets to see and report it
                if ($next === ';') {
                    $this->data->unconsume();
                    $candidate = substr($candidate, 0, -1);
                }
                $temporaryBuffer .= $candidate;
                $this->state = self::AMBIGUOUS_AMPERSAND_STATE;
            }
        }

        # 12.2.5.74 Ambiguous ampersand state
        elseif ($this->state === self::AMBIGUOUS_AMPERSAND_STATE) {
            # Consume the next input character.
            $char = $this->data->consume();

            # ASCII alphanumeric
            if (ctype_alnum($char)) {
                # If the character reference was consumed as part of an attribute,
                # then append the current input character to the current attribute's value.
                # Otherwise, emit the current input character as a character token.
                // DEVIATION: We just continue to buffer characters until it's time to return
                // NOTE: this is only reached for names longer than the longest in the table
                $temporaryBuffer .= $char;
            }
            # U+003B SEMICOLON (;)
            elseif ($char === ';') {
                # This is an unknown-named-character-reference parse error.
                # Reconsume in the return state.
                $this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $temporaryBuffer.';');
                $this->state = $returnState;
                $this->data->unconsume();
                return $temporaryBuffer;
            }
            # Anything else
            else {
                # Reconsume in the return state.
                $this->state = $returnState;
                $this->data->unconsume();
                return $temporaryBuffer;
            }
        }

        # 12.2.5.75 Numeric character reference state
        elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_STATE) {
            # Set the character reference code to zero (0).
            $charRefCode = 0;
            # Consume the next input character.
            $char = $this->data->consume();

            # U+0078 LATIN SMALL LETTER X
            # U+0058 LATIN CAPITAL LETTER X
            if ($char === 'x' || $char === 'X') {
                # Append the current input character to the temporary buffer.
                # Switch to the hexadecimal character reference start state.
                $temporaryBuffer .= $char;
                $this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE;
            }
            # Anything else
            else {
                # Reconsume in the decimal character reference start state.
                $this->state = self::DECIMAL_CHARACTER_REFERENCE_START_STATE;
                $this->data->unconsume();
            }
        }

        # 12.2.5.76 Hexadecimal character reference start state
        elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE) {
            # Consume the next input character.
            $char = $this->data->consume();

            # ASCII hex digit
            if (ctype_xdigit($char)) {
                # Reconsume in the hexadecimal character reference state.
                // OPTIMIZATION:
                // Just consume the digits here
                $charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_HEX));
                $this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_STATE;
            }
            # Anything else
            else {
                # This is an absence-of-digits-in-numeric-character-reference parse error.
                # Flush code points consumed as a character reference.
                # Reconsume in the return state.
                $this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
                $this->state = $returnState;
                $this->data->unconsume();
                return $temporaryBuffer;
            }
        }

        # 12.2.5.77 Decimal character reference start state
        elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_START_STATE) {
            # Consume the next input character.
            $char = $this->data->consume();

            # ASCII digit
            if (ctype_digit($char)) {
                # Reconsume in the decimal character reference state.
                // OPTIMIZATION:
                // Just consume the digits here
                // The digits are decimal, so they must not be read as
                // hexadecimal; over-long digit strings saturate at the
                // largest integer and are rejected in the end state
                $charRefCode = (int) ($char.$this->data->consumeWhile(self::CTYPE_NUM));
                $this->state = self::DECIMAL_CHARACTER_REFERENCE_STATE;
            }
            # Anything else
            else {
                # This is an absence-of-digits-in-numeric-character-reference parse error.
                # Flush code points consumed as a character reference.
                # Reconsume in the return state.
                $this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
                $this->state = $returnState;
                $this->data->unconsume();
                return $temporaryBuffer;
            }
        }

        # 12.2.5.78 Hexadecimal character reference state
        elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_STATE) {
            # Consume the next input character.
            $char = $this->data->consume();

            # ASCII digit
            # ASCII upper hex digit
            # ASCII lower hex digit
            if (ctype_xdigit($char)) {
                # Multiply the character reference code by 16.
                # Add a numeric version of the current input
                # character to the character reference code.
                // OPTIMIZATION: Combine all digit types
                // NOTE: This branch should never be reached
                $charRefCode = ($charRefCode * 16) + hexdec($char);
            }
            # U+003B SEMICOLON
            elseif ($char === ';') {
                # Switch to the numeric character reference end state.
                $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
            }
            # Anything else
            else {
                # This is a missing-semicolon-after-character-reference parse error.
                # Reconsume in the numeric character reference end state.
                $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
                $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
                $this->data->unconsume();
            }
        }

        # 12.2.5.79 Decimal character reference state
        elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_STATE) {
            # Consume the next input character.
            $char = $this->data->consume();

            # ASCII digit
            if (ctype_digit($char)) {
                # Multiply the character reference code by 10.
                # Add a numeric version of the current input
                # character to the character reference code.
                // NOTE: This branch should never be reached
                $charRefCode = ($charRefCode * 10) + ((int) ($char));
            }
            # U+003B SEMICOLON
            elseif ($char === ';') {
                # Switch to the numeric character reference end state.
                $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
            }
            # Anything else
            else {
                # This is a missing-semicolon-after-character-reference parse error.
                # Reconsume in the numeric character reference end state.
                $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
                $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
                $this->data->unconsume();
            }
        }

        # 12.2.5.80 Numeric character reference end state
        elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_END_STATE) {
            # Check the character reference code:
            # If the number is 0x00, then this is a null-character-reference parse error.
            # Set the character reference code to 0xFFFD.
            if ($charRefCode === 0) {
                $this->error(ParseError::NULL_CHARACTER_REFRERENCE);
                $charRefCode = 0xFFFD;
            }
            # If the number is greater than 0x10FFFF, then this is a
            # character-reference-outside-unicode-range parse error.
            # Set the character reference code to 0xFFFD.
            elseif ($charRefCode > 0x10FFFF) {
                $this->error(ParseError::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE);
                $charRefCode = 0xFFFD;
            }
            # If the number is a surrogate, then this is a
            # surrogate-character-reference parse error.
            # Set the character reference code to 0xFFFD.
            elseif ($charRefCode >= 0xD800 && $charRefCode <= 0xDFFF) {
                $this->error(ParseError::SURROGATE_CHARACTER_REFERENCE);
                $charRefCode = 0xFFFD;
            }
            # If the number is a noncharacter, then this is a
            # noncharacter-character-reference parse error.
            // The second test catches the last two code points of every plane
            elseif (($charRefCode >= 0xFDD0 && $charRefCode <= 0xFDEF) || ($charRefCode % 0x10000 & 0xFFFE) === 0xFFFE) {
                $this->error(ParseError::NONCHARACTER_CHARACTER_REFERENCE);
            }
            # If the number is 0x0D, or a control that's not ASCII whitespace, then
            # this is a control-character-reference parse error.
            # If the number is one of the numbers in the first column of the following
            # table, then find the row with that number in the first column, and set
            # the character reference code to the number in the second column of that row.
            elseif (($charRefCode < 0x20 && !in_array($charRefCode, [0x9, 0xA, 0xC], true)) || ($charRefCode >= 0x7F && $charRefCode <= 0x9F)) {
                // NOTE: Table elided
                $this->error(ParseError::CONTROL_CHARACTER_REFERENCE);
                $charRefCode = CharacterReference::C1_TABLE[$charRefCode] ?? $charRefCode;
            }
            $temporaryBuffer = UTF8::encode($charRefCode);
            $this->state = $returnState;
            return $temporaryBuffer;
        }

        # Not a valid state, unimplemented, or implemented elsewhere
        else {
            throw new \Exception("Unimplemented character reference consumption state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
        }
    }
}
}