New from-scratch character reference consumer

2019-12-16 22:39:16 -05:00 · 2019-12-16 22:39:16 -05:00 · 19fb541806
commit 19fb541806
parent 67c7f382e2
4 changed files with 523 additions and 29 deletions
--- a/RoboFile.php
+++ b/RoboFile.php
@ -84,16 +84,6 @@ class RoboFile extends \Robo\Tasks {
        return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
    }

-    /** Runs the coding standards fixer */
-    public function clean($opts = ['demo|d' => false]): Result {
-        $t = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
-        $t->arg("fix");
-        if ($opts['demo']) {
-            $t->args("--dry-run", "--diff")->option("--diff-format", "udiff");
-        }
-        return $t->run();
-    }
-
    protected function findCoverageEngine(): string {
        if (IS_WIN) {
            $dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
@ -139,4 +129,119 @@ class RoboFile extends \Robo\Tasks {
        }
        return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
    }
+
+    /** Runs php-cs-fixer's "fix" command; with the "demo" option (-d) it only previews the changes as a unified diff */
+    public function clean($opts = ['demo|d' => false]): Result {
+        $fixer = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
+        $fixer->arg("fix");
+        if ($opts['demo']) {
+            // preview only: ask the fixer for a dry run and a udiff-formatted diff
+            $fixer->args("--dry-run", "--diff")->option("--diff-format", "udiff");
+        }
+        return $fixer->run();
+    }
+
+    /**
+     * Produces the CharacterReference class file
+     *
+     * Downloads the named character reference table (entities.json) from the
+     * WHATWG HTML specification and writes lib/CharacterReference.php, which
+     * holds the length of the longest reference name, a prefix-matching regular
+     * expression for the names, the name-to-characters map, and the C1 control
+     * substitution table.
+     *
+     * @throws \Exception If the table cannot be retrieved or the class file cannot be written
+     */
+    public function charref() {
+        $template = <<<'FILE'
+<?php
+declare(strict_types=1);
+namespace dW\HTML5;
+
+// This file is machine-generated
+// DO NOT MODIFY
+
+// To update, run ./robo charref
+
+class CharacterReference {
+    const LONGEST_NAME = %LONGEST%;
+    const PREFIX_PATTERN = %NAMED_PATTERN%;
+    const NAMES = [
+        %NAMED_REFERENCES%
+    ];
+    const C1_TABLE = [
+        %C1_SUBSTITUTIONS%
+    ];
+}
+
+FILE;
+        $input = @json_decode(@file_get_contents("https://html.spec.whatwg.org/entities.json"), true);
+        // an empty table is as useless as a missing one, and would leave no longest term to measure below
+        if (!is_array($input) || !$input) {
+            throw new \Exception("Could not retrieve character reference table.");
+        }
+        $list = [];
+        $terms = [];
+        foreach ($input as $entity => $data) {
+            // strip the ampersand from the entity name
+            $entity = substr($entity, 1);
+            // add the entity name to an array of regular expression terms
+            // if the entry also exists in unterminated form, compress both into one term
+            //   with an optional semicolon, skipping the unterminated version
+            if (substr($entity, -1) === ';') {
+                if (isset($input['&'.substr($entity, 0, -1)])) {
+                    $terms[] = "$entity?";
+                } else {
+                    $terms[] = $entity;
+                }
+            }
+            // add a PHP-code representation of the entity name and its characters to another array
+            $chars = '';
+            foreach ($data['codepoints'] as $codepoint) {
+                $chars .= '\u{'.dechex($codepoint).'}';
+            }
+            $list[] = "'$entity'=>\"$chars\"";
+        }
+        // concatenate the list of entities and substitute them into the template
+        $list = implode(",", $list);
+        $template = str_replace('%NAMED_REFERENCES%', $list, $template);
+        // prepare the list of terms as a regular expression
+        // the length of a term is that of its name alone, without the ";" and "?" decoration
+        $nameLength = function(string $term): int {
+            return strlen(preg_replace("/\W/", "", $term));
+        };
+        // sort longest terms first, so that the alternation always tries the longest name first
+        usort($terms, function($a, $b) use ($nameLength) {
+            return $nameLength($b) <=> $nameLength($a);
+        });
+        // note the longest term
+        $longest = $nameLength($terms[0]);
+        $template = str_replace('%LONGEST%', (string) $longest, $template);
+        // concatenate the terms into a case-sensitive non-capturing prefix search
+        $regexp = '/^(?:'.implode('|', $terms).')/';
+        $template = str_replace('%NAMED_PATTERN%', var_export($regexp, true), $template);
+        // Compile the C1 control substitution table
+        // See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+        $list = [];
+        $c1table = [
+            0x80 => 0x20AC, // EURO SIGN (€)
+            0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
+            0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
+            0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
+            0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
+            0x86 => 0x2020, // DAGGER (†)
+            0x87 => 0x2021, // DOUBLE DAGGER (‡)
+            0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+            0x89 => 0x2030, // PER MILLE SIGN (‰)
+            0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
+            0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+            0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
+            0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
+            0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
+            0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
+            0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
+            0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
+            0x95 => 0x2022, // BULLET (•)
+            0x96 => 0x2013, // EN DASH (–)
+            0x97 => 0x2014, // EM DASH (—)
+            0x98 => 0x02DC, // SMALL TILDE (˜)
+            0x99 => 0x2122, // TRADE MARK SIGN (™)
+            0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
+            0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+            0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
+            0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
+            0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+        ];
+        foreach ($c1table as $c1 => $code) {
+            $list[] = "$c1=>$code";
+        }
+        $list = implode(",", $list);
+        $template = str_replace('%C1_SUBSTITUTIONS%', $list, $template);
+        // output the file itself, refusing to report success if nothing was written
+        if (file_put_contents(BASE."lib/CharacterReference.php", $template) === false) {
+            throw new \Exception("Could not write character reference class file.");
+        }
+    }
 }
--- a/lib/CharacterReference.php
+++ b/lib/CharacterReference.php
--- a/lib/ParseError.php
+++ b/lib/ParseError.php
@ -49,6 +49,14 @@ class ParseError {
    const EOF_IN_CDATA                                                      = 135;
    const END_TAG_WITH_ATTRIBUTES                                           = 136;
    const END_TAG_WITH_TRAILING_SOLIDUS                                     = 137;
+    const MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE                       = 138;
+    const UNKNOWN_NAMED_CHARACTER_REFERENCE                                 = 139;
+    const ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE                          = 140;
+    const NULL_CHARACTER_REFRERENCE                                         = 141;
+    const CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE                         = 142;
+    const SURROGATE_CHARACTER_REFERENCE                                     = 143;
+    const NONCHARACTER_CHARACTER_REFERENCE                                  = 144;
+    const CONTROL_CHARACTER_REFERENCE                                       = 145;

    protected static $messages = [
        self::UNEXPECTED_NULL_CHARACTER                                         => 'Unexpected null character',
@ -88,6 +96,14 @@ class ParseError {
        self::EOF_IN_CDATA                                                      => 'End-of-file in CDATA section',
        self::END_TAG_WITH_ATTRIBUTES                                           => 'End-tag with attributes',
        self::END_TAG_WITH_TRAILING_SOLIDUS                                     => 'End-tag with trailing solidus',
+        self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE                       => 'Missing semicolon after character reference',
+        self::UNKNOWN_NAMED_CHARACTER_REFERENCE                                 => 'Unknown named character reference "%s"',
+        self::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE                          => 'Absence of digits in character reference',
+        self::NULL_CHARACTER_REFRERENCE                                         => 'Null character reference',
+        self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE                         => 'Character reference outside Unicode range',
+        self::SURROGATE_CHARACTER_REFERENCE                                     => 'Surrogate character reference',
+        self::NONCHARACTER_CHARACTER_REFERENCE                                  => 'Non-character character reference',
+        self::CONTROL_CHARACTER_REFERENCE                                       => 'Control-character character reference',
    ];

    public function setHandler() {
--- a/lib/Tokenizer.php
+++ b/lib/Tokenizer.php
@ -2,6 +2,8 @@
 declare(strict_types=1);
 namespace dW\HTML5;

+use MensBeam\Intl\Encoding\UTF8;
+
 class Tokenizer {
    use ParseErrorEmitter;

@ -176,9 +178,21 @@ class Tokenizer {
        self::NUMERIC_CHARACTER_REFERENCE_END_STATE               => "Numeric character reference",
    ];

+    const ATTRIBUTE_VALUE_STATE_SET = [
+        # A character reference is said to be consumed as part of an attribute 
+        #   if the return state is either attribute value (double-quoted) state, 
+        #   attribute value (single-quoted) state or attribute value (unquoted) state.
+        self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE, 
+        self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE, 
+        self::ATTRIBUTE_VALUE_UNQUOTED_STATE
+    ];
+
    // Ctype constants
-    const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+    const CTYPE_ALPHA = self::CTYPE_UPPER.'abcdefghijklmnopqrstuvwxyz';
+    const CTYPE_NUM   = '0123456789';
+    const CTYPE_ALNUM = self::CTYPE_ALPHA.self::CTYPE_NUM;
+    const CTYPE_HEX   = self::CTYPE_NUM.'ABCDEFabcdef';

    public function __construct(Data $data, OpenElementsStack $stack, ParseError $errorHandler) {
        $this->state = self::DATA_STATE;
@ -251,8 +265,9 @@ class Tokenizer {
                if ($char === '&') {
                    # Set the return state to the data state.
                    # Switch to the character reference state.
-                    $returnState = self::DATA_STATE;
-                    $this->state = self::CHARACTER_REFERENCE_STATE;
+
+                    // DEVIATION: Character reference consumption implemented as a function
+                    return new CharacterToken($this->switchToCharacterReferenceState(self::DATA_STATE));
                }
                # U+003C LESS-THAN SIGN (<)
                elseif ($char === '<') {
@ -292,8 +307,9 @@ class Tokenizer {
                if ($char === '&') {
                    # Set the return state to the RCDATA state.
                    # Switch to the character reference state.
-                    $returnState = self::RCDATA_STATE;
-                    $this->state = self::CHARACTER_REFERENCE_STATE;
+
+                    // DEVIATION: Character reference consumption implemented as a function
+                    return new CharacterToken($this->switchToCharacterReferenceState(self::RCDATA_STATE));
                }
                # U+003C LESS-THAN SIGN (<)
                elseif ($char === '<') {
@ -1763,8 +1779,10 @@ class Tokenizer {
                elseif ($char === '&') {
                    # Set the return state to the attribute value (double-quoted) state.
                    # Switch to the character reference state.
-                    $returnState = self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
-                    $this->state = self::CHARACTER_REFERENCE_STATE;
+
+                    // DEVIATION: Character reference consumption implemented as a function
+                    assert(isset($attribute) && $attribute instanceof TokenAttr);
+                    $attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE);
                }
                # U+0000 NULL
                elseif ($char === "\0") {
@ -1807,8 +1825,10 @@ class Tokenizer {
                elseif ($char === '&') {
                    # Set the return state to the attribute value (single-quoted) state.
                    # Switch to the character reference state.
-                    $returnState = self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
-                    $this->state = self::CHARACTER_REFERENCE_STATE;
+
+                    // DEVIATION: Character reference consumption implemented as a function
+                    assert(isset($attribute) && $attribute instanceof TokenAttr);
+                    $attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE);
                }
                # U+0000 NULL
                elseif ($char === "\0") {
@ -1855,8 +1875,10 @@ class Tokenizer {
                elseif ($char === '&') {
                    # Set the return state to the attribute value (unquoted) state.
                    # Switch to the character reference state.
-                    $returnState = self::ATTRIBUTE_VALUE_UNQUOTED_STATE;
-                    $this->state = self::CHARACTER_REFERENCE_STATE;
+
+                    // DEVIATION: Character reference consumption implemented as a function
+                    assert(isset($attribute) && $attribute instanceof TokenAttr);
+                    $attribute->value .= $this->switchToCharacterReferenceState(self::ATTRIBUTE_VALUE_UNQUOTED_STATE);
                }
                # ">" (U+003E)
                elseif ($char === '>') {
@ -3484,17 +3506,349 @@ class Tokenizer {
                }
            }

-            #12.2.5.72 Character reference state
-            elseif ($this->state === self::CHARACTER_REFERENCE_STATE) {
-                // Not implemented
-                $this->state = $returnState;
-                return new CharacterToken('&');
-            }
-
-            # Not a valid state
+            # Not a valid state, unimplemented, or implemented elsewhere
            else {
                throw new \Exception("Unimplemented state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
            }
        }
    }
+
+    /**
+     * Consumes a character reference, to be called once an ampersand has been read
+     *
+     * This function implements states 72 through 80, "Character reference"
+     * through "Numeric character reference end", as a self-contained loop.
+     * Whenever it returns, the tokenizer state has been set to the return state.
+     *
+     * @param int $returnState The state to switch to once consumption ends; the attribute value states additionally enable the historical handling of unterminated named references
+     * @return string The characters the reference resolves to, or the literal source text (starting with "&") when no reference is resolved
+     * @throws \Exception If the tokenizer ends up in a state this function does not handle
+     */
+    protected function switchToCharacterReferenceState(int $returnState): string {
+        $this->state = self::CHARACTER_REFERENCE_STATE;
+        # A character reference is said to be consumed as part of an attribute
+        #   if the return state is one of the attribute value states
+        $inAttribute = in_array($returnState, self::ATTRIBUTE_VALUE_STATE_SET, true);
+
+        while (true) {
+            assert((function() {
+                $state = self::STATE_NAMES[$this->state] ?? $this->state;
+                $char = bin2hex($this->data->peek(1));
+                $this->debugLog .= "    State: $state ($char)\n";
+                return true;
+            })());
+
+            # 12.2.5.72 Character reference state
+            if ($this->state === self::CHARACTER_REFERENCE_STATE) {
+                # Set the temporary buffer to the empty string.
+                # Append a U+0026 AMPERSAND (&) character to the temporary buffer.
+                # Consume the next input character.
+                $temporaryBuffer = '&';
+                $char = $this->data->consume();
+
+                # ASCII alphanumeric
+                if (ctype_alnum($char)) {
+                    # Reconsume in the named character reference state.
+                    $this->state = self::NAMED_CHARACTER_REFERENCE_STATE;
+                    $this->data->unconsume();
+                }
+                # U+0023 NUMBER SIGN (#)
+                elseif ($char === '#') {
+                    # Append the current input character to the temporary buffer.
+                    # Switch to the numeric character reference state.
+                    $temporaryBuffer .= $char;
+                    $this->state = self::NUMERIC_CHARACTER_REFERENCE_STATE;
+                }
+                # Anything else
+                else {
+                    # Flush code points consumed as a character reference.
+                    # Reconsume in the return state.
+                    $this->state = $returnState;
+                    $this->data->unconsume();
+                    return $temporaryBuffer;
+                }
+            }
+
+            # 12.2.5.73 Named character reference state
+            elseif ($this->state === self::NAMED_CHARACTER_REFERENCE_STATE) {
+                # Consume the maximum number of characters possible,
+                #   with the consumed characters matching one of the
+                #   identifiers in the first column of the named character
+                #   references table (in a case-sensitive manner).
+
+                // DEVIATION:
+                // We consume all possible alphanumeric characters,
+                // up to the length of the longest in the table
+                $candidate = $this->data->consumeWhile(self::CTYPE_ALNUM, CharacterReference::LONGEST_NAME);
+                // Keep a record of the terminating character, which is used later
+                $next = $this->data->peek(1);
+                if ($next === ';') {
+                    // consume the following character if it is a proper terminator
+                    $candidate .= $this->data->consume();
+                }
+                // Look for an exact match; if not found look for a prefix match
+                // From here on $candidate is the source text of the name,
+                //   while $match is the character(s) the name stands for
+                $match = CharacterReference::NAMES[$candidate] ?? null;
+                if (is_null($match) && preg_match(CharacterReference::PREFIX_PATTERN, $candidate, $prefix)) {
+                    // If a prefix match is found, unconsume to the end of the prefix
+                    //   and look up the characters of the shortened name
+                    $this->data->unconsume(strlen($candidate) - strlen($prefix[0]));
+                    $next = $candidate[strlen($prefix[0])];
+                    $candidate = $prefix[0];
+                    $match = CharacterReference::NAMES[$candidate] ?? null;
+                }
+
+                # Append each character to the temporary buffer when it's consumed.
+                $temporaryBuffer .= $candidate;
+
+                # If there is a match
+                if (!is_null($match)) {
+                    # If the character reference was consumed as part of an attribute,
+                    #   and the last character matched is not a U+003B SEMICOLON character (;),
+                    #   and the next input character is either a U+003D EQUALS SIGN character (=)
+                    #   or an ASCII alphanumeric...
+                    if ($inAttribute && $next !== ';' && ($next === '=' || ctype_alnum($next))) {
+                        # ... then, for historical reasons, flush code points consumed
+                        #   as a character reference and switch to the return state.
+
+                        // The buffer holds the source text, so the reference is passed through literally
+                        $this->state = $returnState;
+                        return $temporaryBuffer;
+                    }
+                    # Otherwise:
+                    else {
+                        # If the last character matched is not a U+003B SEMICOLON character (;),
+                        #   then this is a missing-semicolon-after-character-reference parse error.
+                        if ($next !== ';') {
+                            $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
+                        }
+                        # Set the temporary buffer to the empty string.
+                        # Append one or two characters corresponding to the
+                        #   character reference name (as given by the second
+                        #   column of the named character references table)
+                        #   to the temporary buffer.
+                        # Flush code points consumed as a character reference.
+                        # Switch to the return state.
+
+                        // In other words: return the match
+                        $this->state = $returnState;
+                        return $match;
+                    }
+                }
+                # Otherwise:
+                else {
+                    # Flush code points consumed as a character reference.
+                    # Switch to the ambiguous ampersand state.
+
+                    // DEVIATION: We flush only when switching to the return state
+                    // A terminator consumed above is handed back to the input, so that
+                    //   the ambiguous ampersand state sees the semicolon and reports
+                    //   the unknown-named-character-reference parse error
+                    if (substr($candidate, -1) === ';') {
+                        $this->data->unconsume();
+                        $temporaryBuffer = substr($temporaryBuffer, 0, -1);
+                    }
+                    $this->state = self::AMBIGUOUS_AMPERSAND_STATE;
+                }
+            }
+
+            # 12.2.5.74 Ambiguous ampersand state
+            elseif ($this->state === self::AMBIGUOUS_AMPERSAND_STATE) {
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # ASCII alphanumeric
+                if (ctype_alnum($char)) {
+                    # If the character reference was consumed as part of an attribute,
+                    #   then append the current input character to the current attribute's value.
+                    # Otherwise, emit the current input character as a character token.
+
+                    // DEVIATION: We just continue to buffer characters until it's time to return
+                    // NOTE: presumably only reached when the run of alphanumerics is longer
+                    //   than CharacterReference::LONGEST_NAME (assumes consumeWhile() honours its limit)
+                    $temporaryBuffer .= $char;
+                }
+                # U+003B SEMICOLON (;)
+                elseif ($char === ';') {
+                    # This is an unknown-named-character-reference parse error.
+                    # Reconsume in the return state.
+                    $this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $temporaryBuffer.';');
+                    $this->state = $returnState;
+                    $this->data->unconsume();
+                    return $temporaryBuffer;
+                }
+                # Anything else
+                else {
+                    # Reconsume in the return state.
+                    $this->state = $returnState;
+                    $this->data->unconsume();
+                    return $temporaryBuffer;
+                }
+            }
+
+            # 12.2.5.75 Numeric character reference state
+            elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_STATE) {
+                # Set the character reference code to zero (0).
+                $charRefCode = 0;
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # U+0078 LATIN SMALL LETTER X
+                # U+0058 LATIN CAPITAL LETTER X
+                if ($char === 'x' || $char === 'X') {
+                    # Append the current input character to the temporary buffer.
+                    # Switch to the hexadecimal character reference start state.
+                    $temporaryBuffer .= $char;
+                    $this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE;
+                }
+                # Anything else
+                else {
+                    # Reconsume in the decimal character reference start state.
+                    $this->state = self::DECIMAL_CHARACTER_REFERENCE_START_STATE;
+                    $this->data->unconsume();
+                }
+            }
+
+            # 12.2.5.76 Hexadecimal character reference start state
+            elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_START_STATE) {
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # ASCII hex digit
+                if (ctype_xdigit($char)) {
+                    # Reconsume in the hexadecimal character reference state.
+
+                    // OPTIMIZATION:
+                    // Just consume the digits here
+                    $charRefCode = hexdec($char.$this->data->consumeWhile(self::CTYPE_HEX));
+                    $this->state = self::HEXADECIMAL_CHARACTER_REFERENCE_STATE;
+                }
+                # Anything else
+                else {
+                    # This is an absence-of-digits-in-numeric-character-reference parse error.
+                    # Flush code points consumed as a character reference.
+                    # Reconsume in the return state.
+                    $this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
+                    $this->state = $returnState;
+                    $this->data->unconsume();
+                    return $temporaryBuffer;
+                }
+            }
+
+            # 12.2.5.77 Decimal character reference start state
+            elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_START_STATE) {
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # ASCII digit
+                if (ctype_digit($char)) {
+                    # Reconsume in the decimal character reference state.
+
+                    // OPTIMIZATION:
+                    // Just consume the digits here
+                    // The digits are decimal, so they must not be read with hexdec()
+                    $charRefCode = (int) ($char.$this->data->consumeWhile(self::CTYPE_NUM));
+                    $this->state = self::DECIMAL_CHARACTER_REFERENCE_STATE;
+                }
+                # Anything else
+                else {
+                    # This is an absence-of-digits-in-numeric-character-reference parse error.
+                    # Flush code points consumed as a character reference.
+                    # Reconsume in the return state.
+                    $this->error(ParseError::ABSENCE_OF_DIGITS_IN_CHARACTER_REFERENCE);
+                    $this->state = $returnState;
+                    $this->data->unconsume();
+                    return $temporaryBuffer;
+                }
+            }
+
+            # 12.2.5.78 Hexadecimal character reference state
+            elseif ($this->state === self::HEXADECIMAL_CHARACTER_REFERENCE_STATE) {
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # ASCII digit
+                # ASCII upper hex digit
+                # ASCII lower hex digit
+                if (ctype_xdigit($char)) {
+                    # Multiply the character reference code by 16.
+                    # Add a numeric version of the current input
+                    #   character to the character reference code.
+
+                    // OPTIMIZATION: Combine all digit types
+                    // NOTE: This branch should never be reached
+                    $charRefCode = ($charRefCode * 16) + hexdec($char);
+                }
+                # U+003B SEMICOLON
+                elseif ($char === ';') {
+                    # Switch to the numeric character reference end state.
+                    $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
+                }
+                # Anything else
+                else {
+                    # This is a missing-semicolon-after-character-reference parse error.
+                    # Reconsume in the numeric character reference end state.
+                    $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
+                    $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
+                    $this->data->unconsume();
+                }
+            }
+
+            # 12.2.5.79 Decimal character reference state
+            elseif ($this->state === self::DECIMAL_CHARACTER_REFERENCE_STATE) {
+                # Consume the next input character.
+                $char = $this->data->consume();
+
+                # ASCII digit
+                if (ctype_digit($char)) {
+                    # Multiply the character reference code by 10.
+                    # Add a numeric version of the current input
+                    #   character to the character reference code.
+
+                    // OPTIMIZATION: Combine all digit types
+                    // NOTE: This branch should never be reached
+                    $charRefCode = ($charRefCode * 10) + ((int) ($char));
+                }
+                # U+003B SEMICOLON
+                elseif ($char === ';') {
+                    # Switch to the numeric character reference end state.
+                    $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
+                }
+                # Anything else
+                else {
+                    # This is a missing-semicolon-after-character-reference parse error.
+                    # Reconsume in the numeric character reference end state.
+                    $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
+                    $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
+                    $this->data->unconsume();
+                }
+            }
+
+            # 12.2.5.80 Numeric character reference end state
+            elseif ($this->state === self::NUMERIC_CHARACTER_REFERENCE_END_STATE) {
+                # Check the character reference code:
+
+                # If the number is 0x00, then this is a null-character-reference parse error.
+                # Set the character reference code to 0xFFFD.
+                if ($charRefCode === 0) {
+                    // The constant name is spelled as it is declared in ParseError
+                    $this->error(ParseError::NULL_CHARACTER_REFRERENCE);
+                    $charRefCode = 0xFFFD;
+                }
+                # If the number is greater than 0x10FFFF, then this is a
+                #   character-reference-outside-unicode-range parse error.
+                # Set the character reference code to 0xFFFD.
+                elseif ($charRefCode > 0x10FFFF) {
+                    $this->error(ParseError::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE);
+                    $charRefCode = 0xFFFD;
+                }
+                # If the number is a surrogate, then this is a
+                #   surrogate-character-reference parse error.
+                # Set the character reference code to 0xFFFD.
+                elseif ($charRefCode >= 0xD800 && $charRefCode <= 0xDFFF) {
+                    $this->error(ParseError::SURROGATE_CHARACTER_REFERENCE);
+                    $charRefCode = 0xFFFD;
+                }
+                # If the number is a noncharacter, then this is a
+                #   noncharacter-character-reference parse error.
+                elseif (($charRefCode >= 0xFDD0 && $charRefCode <= 0xFDEF) || ($charRefCode % 0x10000 & 0xFFFE) === 0xFFFE) {
+                    $this->error(ParseError::NONCHARACTER_CHARACTER_REFERENCE);
+                }
+                # If the number is 0x0D, or a control that's not ASCII whitespace, then
+                #   this is a control-character-reference parse error.
+                # If the number is one of the numbers in the first column of the following
+                #   table, then find the row with that number in the first column, and set
+                #   the character reference code to the number in the second column of that row.
+                elseif (($charRefCode < 0x20 && !in_array($charRefCode, [0x9, 0xA, 0xC], true)) || ($charRefCode >= 0x7F && $charRefCode <= 0x9F)) {
+                    // NOTE: Table elided
+                    $this->error(ParseError::CONTROL_CHARACTER_REFERENCE);
+                    $charRefCode = CharacterReference::C1_TABLE[$charRefCode] ?? $charRefCode;
+                }
+                $temporaryBuffer = UTF8::encode($charRefCode);
+                $this->state = $returnState;
+                return $temporaryBuffer;
+            }
+
+            # Not a valid state, unimplemented, or implemented elsewhere
+            else {
+                throw new \Exception("Unimplemented character reference consumption state: ".(self::STATE_NAMES[$this->state] ?? $this->state));
+            }
+        }
+    }
 }