Prototype character encoding detection

5 years ago · a7e1083681
4 changed files with 384 additions and 18 deletions
--- a/lib/Charset.php
+++ b/lib/Charset.php
@ -12,7 +12,7 @@ abstract class Charset {
    /**
     * Resolves an encoding label to an encoding.
     *
     * The label is passed to Encoding::matchLabel(); when that yields a
     * truthy match, the match's 'name' entry is returned (this change
     * replaces the previously returned 'class' entry).
     *
     * @param string $value The encoding label to look up
     * @return string|null The matched encoding's name, or null when the label is not matched
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        if ($encoding) {
-            return $encoding['class'];
+            return $encoding['name'];
        }
        return null;
    }
@ -71,7 +71,7 @@ abstract class Charset {
            if ($param === "charset") {
                $encoding = Encoding::matchLabel($value);
                if ($encoding) {
-                    return $encoding['class'];
+                    return $encoding['name'];
                } else {
                    return null;
                }
@ -80,7 +80,372 @@ abstract class Charset {
        return null;
    }
-    public static function fromPrescan(string $data): ?string {
+    public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
-        return null;
+        # When an algorithm requires a user agent to prescan a byte stream to
        #   determine its encoding, given some defined end condition, then it
        #   must run the following steps.
        # These steps either abort unsuccessfully or return a character
        #   encoding. If at any point during these steps (including during
        #   instances of the get an attribute algorithm invoked by this one)
        #   the user agent either runs out of bytes (meaning the position
        #   pointer created in the first step below goes beyond the end of the
        #   byte stream obtained so far) or reaches its end condition, then
        #   abort the prescan a byte stream to determine its encoding
        #   algorithm unsuccessfully.
        // IMPLEMENTATION NOTES:
        //   $data is the raw byte stream; only its first $endAfter bytes are
        //   examined (the "end condition"). The return value is the name of
        //   the encoding declared by the first usable <meta> element, or null
        //   when the prescan aborts unsuccessfully (no declaration found, or
        //   the examined bytes ran out).
        $s = substr($data, 0, $endAfter);
        // The loop is bounded by the number of bytes actually available, which
        //   may be fewer than $endAfter
        $end = strlen($s);
        # Let position be a pointer to a byte in the input byte stream,
        #   initially pointing at the first byte.
        $pos = 0;
        # Loop: If position points to:
        while ($pos < $end) {
            // OPTIMIZATION: Start by skipping anything not a less-than sign
            # Any other byte
            if ($s[$pos] !== "<") {
                # Do nothing with that byte.
                $pos++;
                continue;
            }
            $pos++;
            // The byte following the less-than sign; the empty string when out of bytes
            $next = $s[$pos] ?? "";
            # A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
            if (substr($s, $pos, 3) === "!--") {
                # Advance the position pointer so that it points at the
                #   first 0x3E byte which is preceded by two 0x2D bytes
                #   (i.e. at the end of an ASCII '-->' sequence) and
                #   comes after the 0x3C byte that was found. (The two
                #   0x2D bytes can be the same as those in the '<!--'
                #   sequence.)
                // Searching from the "!" lets "<!-->" terminate itself
                $close = strpos($s, "-->", $pos);
                if ($close === false) {
                    // Out of bytes: abort unsuccessfully
                    return null;
                }
                $pos = $close + 3;
            }
            # A sequence of bytes starting with: 0x3C, 0x4D or 0x6D,
            #   0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of
            #   0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive
            #   ASCII '<meta' followed by a space or slash)
            elseif (preg_match("<^meta[\x09\x0A\x0C\x0D /]$>i", substr($s, $pos, 5))) {
                # Advance the position pointer so that it points at
                #   the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F
                #   byte (the one in sequence of characters matched above).
                // The pointer is moved just past that byte instead; getting
                //   an attribute would skip it in any case
                $pos += 5;
                # Let attribute list be an empty list of strings.
                # Let got pragma be false.
                # Let need pragma be null.
                # Let charset be the null value (which, for the purposes
                #   of this algorithm, is distinct from an unrecognized
                #   encoding or the empty string).
                $attrList = [];
                $gotPragma = false;
                $needPragma = null;
                $charset = null;
                # Attributes: Get an attribute and its value.
                # If no attribute was sniffed, then jump to the processing step below.
                while ($attr = self::getAttribute($s, $pos)) {
                    # If the attribute's name is already in attribute list,
                    #   then return to the step labeled attributes.
                    if (isset($attrList[$attr['name']])) {
                        continue;
                    }
                    # Add the attribute's name to attribute list.
                    $attrList[$attr['name']] = true;
                    # Run the appropriate step from the following list, if one applies:
                    # If the attribute's name is "http-equiv"
                    if ($attr['name'] === "http-equiv") {
                        # If the attribute's value is "content-type", then set got pragma to true.
                        if ($attr['value'] === "content-type") {
                            $gotPragma = true;
                        }
                    }
                    # If the attribute's name is "content"
                    elseif ($attr['name'] === "content") {
                        # Apply the algorithm for extracting a character encoding from a meta
                        #   element, giving the attribute's value as the string to parse.
                        # If a character encoding is returned, and if charset is still set to
                        #   null, let charset be the encoding returned, and set need pragma to true.
                        // OPTIMIZATION: Check if charset is null before performing the algorithm
                        if ($charset === null && $candidate = self::fromMeta($attr['value'])) {
                            $charset = $candidate;
                            $needPragma = true;
                        }
                    }
                    # If the attribute's name is "charset"
                    elseif ($attr['name'] === "charset") {
                        # Let charset be the result of getting an encoding from the attribute's
                        #   value, and set need pragma to false.
                        $charset = self::fromCharset($attr['value']) ?? false; // false signifies 'failure'
                        $needPragma = false;
                    }
                }
                # Processing: If need pragma is null, then jump to the step below labeled next byte.
                # If need pragma is true but got pragma is false, then jump to the step below labeled next byte.
                # If charset is failure, then jump to the step below labeled next byte.
                // The pointer now rests on the closing ">" (or at the end of
                //   the bytes), which the next iteration steps over
                if ($needPragma === null || ($needPragma && !$gotPragma) || $charset === false) {
                    continue;
                }
                # If charset is a UTF-16 encoding, then set charset to UTF-8.
                // NOTE(review): the Encoding standard names its UTF-16 encodings
                //   "UTF-16BE" and "UTF-16LE"; fromCharset() presumably returns
                //   those names, so all spellings are accepted here
                if (in_array($charset, ["UTF-16", "UTF-16BE", "UTF-16LE"], true)) {
                    $charset = "UTF-8";
                }
                # If charset is x-user-defined, then set charset to windows-1252.
                elseif ($charset === "x-user-defined") {
                    $charset = "windows-1252";
                }
                # Abort the prescan a byte stream to determine its encoding algorithm,
                #   returning the encoding given by charset.
                return $charset;
            }
            # A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/),
            #   and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)
            elseif (ctype_alpha($next) || ($next === "/" && ctype_alpha($s[$pos + 1] ?? ""))) {
                # Advance the position pointer so that it points at the next
                #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
                // The pointer must stop ON that byte rather than after it:
                //   stepping past a ">" would cause the following tag to be
                //   consumed as if it were attributes of this one
                $pos += strcspn($s, "\x09\x0A\x0C\x0D >", $pos);
                # Repeatedly get an attribute until no further attributes can be found,
                #   then jump to the step below labeled next byte.
                while (self::getAttribute($s, $pos));
            }
            # A sequence of bytes starting with: 0x3C 0x21 (`<!`)
            # A sequence of bytes starting with: 0x3C 0x2F (`</`)
            # A sequence of bytes starting with: 0x3C 0x3F (`<?`)
            elseif ($next === "!" || $next === "/" || $next === "?") {
                # Advance the position pointer so that it points at the first
                #   0x3E byte (>) that comes after the 0x3C byte that was found.
                $close = strpos($s, ">", $pos);
                if ($close === false) {
                    // Out of bytes: abort unsuccessfully
                    return null;
                }
                $pos = $close + 1;
            }
        }
        // Ran out of bytes or reached the end condition without finding an
        //   encoding declaration: abort unsuccessfully
        return null;
    }
    /**
     * Implements the "get an attribute" step of the encoding prescan.
     *
     * Reads one attribute from $s starting at byte offset $pos and moves
     * $pos forward as bytes are consumed. Names and values are lowercased.
     *
     * @param string $s The bytes being prescanned
     * @param int $pos Current byte offset into $s; updated by reference
     * @return array An array with 'name' and 'value' keys when an attribute
     *  was read, or an empty array when there is no attribute (the pointer
     *  is on ">") or the bytes ran out before the attribute was complete
     */
    protected static function getAttribute(string $s, &$pos): array {
        $whitespace = ["\x09", "\x0A", "\x0C", "\x0D", " "];
        # When the prescan a byte stream to determine its encoding
        #   algorithm says to get an attribute, it means doing this:
        # If the byte at position is one of
        #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
        #   or 0x2F (/) then advance position to the next byte and
        #   redo this step.
        // Reading past the end of the string yields the empty string, which
        //   is used throughout to signify "out of bytes"
        while (($char = $s[$pos] ?? "") === "/" || in_array($char, $whitespace, true)) {
            $pos++;
        }
        # If the byte at position is 0x3E (>),
        #   then abort the get an attribute algorithm. There isn't one.
        if ($char === ">") {
            return [];
        }
        # Otherwise, the byte at position is the start of the attribute name.
        #  Let attribute name and attribute value be the empty string.
        $name = "";
        $value = "";
        // Whether the name was terminated directly by an equals sign, in
        //   which case the "spaces" step is skipped
        $sawEquals = false;
        # Process the byte at position as follows:
        while (true) {
            if ($char === "") {
                // Out of bytes
                return [];
            }
            # If it is 0x3D (=), and the attribute name is longer than the empty string
            if ($char === "=" && $name !== "") {
                # Advance position to the next byte and jump to the step below labeled value.
                $pos++;
                $sawEquals = true;
                break;
            }
            # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
            if (in_array($char, $whitespace, true)) {
                # Jump to the step below labeled spaces.
                break;
            }
            # If it is 0x2F (/) or 0x3E (>)
            if ($char === "/" || $char === ">") {
                # Abort the get an attribute algorithm.
                # The attribute's name is the value of attribute name, its value is the empty string.
                return ['name' => $name, 'value' => $value];
            }
            # If it is in the range 0x41 (A) to 0x5A (Z)
            # Anything else
            # Append the code point with the same value as the byte at position to attribute name.
            # (It doesn't actually matter how bytes outside the ASCII range are handled here,
            #    since only ASCII bytes can contribute to the detection of a character encoding.)
            // OPTIMIZATION: Also handle uppercase characters
            $name .= strtolower($char);
            # Advance position to the next byte and return to the previous step.
            $pos++;
            $char = $s[$pos] ?? "";
        }
        if (!$sawEquals) {
            # Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
            #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.
            // Only whitespace is skipped here; a slash is NOT skipped, as it
            //   ends the attribute with an empty value
            while (in_array($char = $s[$pos] ?? "", $whitespace, true)) {
                $pos++;
            }
            if ($char === "") {
                // Out of bytes
                return [];
            }
            # If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
            # The attribute's name is the value of attribute name, its value is the empty string.
            if ($char !== "=") {
                return ['name' => $name, 'value' => $value];
            }
            # Advance position past the 0x3D (=) byte.
            $pos++;
        }
        # Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
        #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.
        // Again only whitespace is skipped; a leading slash is part of the value
        while (in_array($char = $s[$pos] ?? "", $whitespace, true)) {
            $pos++;
        }
        if ($char === "") {
            // Out of bytes
            return [];
        }
        # Process the byte at position as follows:
        # If it is 0x22 (") or 0x27 (')
        if ($char === "'" || $char === '"') {
            # Let b be the value of the byte at position.
            $b = $char;
            # Quote loop: Advance position to the next byte.
            for ($pos++; ($char = $s[$pos] ?? "") !== ""; $pos++) {
                # If the value of the byte at position is the value of b,
                #   then advance position to the next byte and abort
                #   the "get an attribute" algorithm.
                # The attribute's name is the value of attribute name,
                #   and its value is the value of attribute value.
                if ($char === $b) {
                    $pos++;
                    return ['name' => $name, 'value' => $value];
                }
                # Otherwise, append a code point to attribute value whose
                #   value is the same as the value of the byte at position.
                // OPTIMIZATION: Also handle uppercase characters
                $value .= strtolower($char);
            }
            // Out of bytes
            return [];
        }
        # If it is 0x3E (>)
        if ($char === ">") {
            # Abort the get an attribute algorithm.
            # The attribute's name is the value of attribute name,
            #   its value is the empty string.
            return ['name' => $name, 'value' => $value];
        }
        # Anything else
        # Append a code point with the same value as the byte at position to attribute value.
        # Advance position to the next byte.
        // OPTIMIZATION: Also handle uppercase characters
        $value .= strtolower($char);
        for ($pos++; ($char = $s[$pos] ?? "") !== ""; $pos++) {
            # Process the byte at position as follows:
            # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
            if ($char === ">" || in_array($char, $whitespace, true)) {
                # Abort the get an attribute algorithm.
                # The attribute's name is the value of attribute name
                #   and its value is the value of attribute value.
                return ['name' => $name, 'value' => $value];
            }
            # If it is in the range 0x41 (A) to 0x5A (Z)
            # Anything else
            # Append a code point with the same value as
            #   the byte at position to attribute value.
            $value .= strtolower($char);
        }
        // Out of bytes
        return [];
    }
    /**
     * Extracts a character encoding from the value of a meta element's
     * "content" attribute (e.g. "text/html; charset=utf-8").
     *
     * @param string $s The attribute value to parse
     * @return string|null The result of self::fromCharset() for the label
     *  found after "charset=", or null when no usable label is present
     */
    protected static function fromMeta(string $s): ?string {
        $whitespace = ["\x09", "\x0A", "\x0C", "\x0D", " "];
        # The algorithm for extracting a character encoding from a meta element,
        #   given a string s, is as follows.
        # It either returns a character encoding or nothing.
        # Let position be a pointer into s, initially pointing at the start of the string.
        $pos = 0;
        $end = strlen($s);
        # Loop:
        while ($pos < $end) {
            # Find the first seven characters in s after position
            #   that are an ASCII case-insensitive match for the word "charset".
            # If no such match is found, return nothing.
            $found = stripos($s, "charset", $pos);
            if ($found === false) {
                return null;
            }
            $pos = $found + 7;
            # Skip any ASCII whitespace that immediately follow the word "charset"
            #   (there might not be any).
            // Reading past the end of the string yields the empty string
            while (in_array($s[$pos] ?? "", $whitespace, true)) {
                $pos++;
            }
            # If the next character is not a U+003D EQUALS SIGN (=),
            #   then move position to point just before that next
            #   character, and jump back to the step labeled loop.
            if (($s[$pos] ?? "") !== "=") {
                continue;
            }
            # Skip any ASCII whitespace that immediately follow the equals sign
            #   (there might not be any).
            $pos++;
            while (in_array($s[$pos] ?? "", $whitespace, true)) {
                $pos++;
            }
            # Process the next character as follows:
            $char = $s[$pos] ?? "";
            # If it is a U+0022 QUOTATION MARK character (")...
            # If it is a U+0027 APOSTROPHE character (')...
            if ($char === '"' || $char === "'") {
                # ... and there is a later U+0022 QUOTATION MARK character (") in s
                # ... and there is a later U+0027 APOSTROPHE character (') in s
                $close = strpos($s, $char, $pos + 1);
                # If it is an unmatched U+0022 QUOTATION MARK character (")
                # If it is an unmatched U+0027 APOSTROPHE character (')
                if ($close === false) {
                    # Return nothing
                    return null;
                }
                # Return the result of getting an encoding from the
                #   substring that is between this character and the
                #   next earliest occurrence of this character.
                return self::fromCharset(substr($s, $pos + 1, $close - $pos - 1));
            }
            # There is no next character
            if ($char === "") {
                # Return nothing
                return null;
            }
            # Anything else
            # Return the result of getting an encoding from the substring
            #   that consists of this character up to but not including
            #   the first ASCII whitespace or U+003B SEMICOLON (;)
            #   character, or the end of s, whichever comes first.
            $size = strcspn($s, "\x09\x0A\x0C\x0D ;", $pos);
            return self::fromCharset(substr($s, $pos, $size));
        }
        // The string was exhausted without finding "charset" followed by an
        //   equals sign (this includes the empty string): return nothing
        return null;
    }
 }
--- a/lib/Data.php
+++ b/lib/Data.php
@ -3,6 +3,7 @@ declare(strict_types=1);
 namespace dW\HTML5;
 use MensBeam\Intl\Encoding;
 use MensBeam\Intl\Encoding\Encoding as EncodingEncoding;
 class Data {
    use ParseErrorEmitter;
@ -50,10 +51,10 @@ class Data {
        } elseif ($encoding = Charset::fromPrescan($data)) {
            // Encoding is tentative
        } else {
-            // Encoding is tentative; fall back to windows 1252
+            // Encoding is tentative; fall back to the configured default encoding
-            $encoding = \MensBeam\Intl\Encoding\Windows1252::class;
+            $encoding = Parser::$fallbackEncoding;
        }
-        $this->data = new $encoding($data, false, true);
+        $this->data = Encoding::createDecoder($encoding, $data, false, true);
    }
--- a/lib/Parser.php
+++ b/lib/Parser.php
@ -34,6 +34,8 @@ class Parser {
    /* Static properties */
    public static $fallbackEncoding = "UTF-8";
    // Property used as an instance for the non-static properties
    protected static $instance;
--- a/tests/cases/TestCharset.php
+++ b/tests/cases/TestCharset.php
@ -3,8 +3,6 @@ declare(strict_types=1);
 namespace dW\HTML5\TestCase;
 use dW\HTML5\Charset;
 use MensBeam\Intl\Encoding\UTF8;
 use MensBeam\Intl\Encoding\Windows1252;
 /** 
 * @covers \dW\HTML5\Charset
@ -17,9 +15,9 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
    /**
     * Data provider pairing an encoding label with its expected result.
     *
     * Each row is [input string, expected value]; the expected value is an
     * encoding name string (changed here from an encoding class name), or
     * null where no encoding is expected.
     * NOTE(review): presumably consumed by a test of Charset::fromCharset();
     * the consuming test is not visible here -- confirm.
     */
    public function provideCharsets() {
        return [
-            ["UTF-8",                   UTF8::class],
+            ["UTF-8",                   "UTF-8"],
-            ["  utf8  ",                UTF8::class],
+            ["  utf8  ",                "UTF-8"],
-            ["ISO-8859-1",              Windows1252::class],
+            ["ISO-8859-1",              "windows-1252"],
            ["text/html; charset=utf8", null],
        ];
    }
@ -35,12 +33,12 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
            ["charset=utf8",                                      null],
            ["text/html",                                         null],
            ["text/html charset=utf8",                            null],
-            ["text/html; charset=utf8",                           UTF8::class],
+            ["text/html; charset=utf8",                           "UTF-8"],
-            ["text/html;charset=utf8",                            UTF8::class],
+            ["text/html;charset=utf8",                            "UTF-8"],
-            ["text/html; charset=\"utf8\"",                       UTF8::class],
+            ["text/html; charset=\"utf8\"",                       "UTF-8"],
-            ["image/svg+xml; param=value; charset=utf8",          UTF8::class],
+            ["image/svg+xml; param=value; charset=utf8",          "UTF-8"],
-            ["image/svg+xml; charset=utf8; charset=big5",         UTF8::class],
+            ["image/svg+xml; charset=utf8; charset=big5",         "UTF-8"],
-            ["image/svg+xml; charset=utf8;charset=big5",          UTF8::class],
+            ["image/svg+xml; charset=utf8;charset=big5",          "UTF-8"],
            ["text/html; charset=not-valid; charset=big5",        null],
            ["text/html; charset=not-valid",                      null],
            ["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null],