Update encoding pre-scan

Official tests are available in WPT, but will need a new test harness
3 years ago · 29127c51ee
2 changed files with 113 additions and 14 deletions
--- a/lib/Parser/Charset.php
+++ b/lib/Parser/Charset.php
@ -130,24 +130,40 @@ abstract class Charset {
     * @param int $endAfter The number of bytes of the string to stop after 
     */
    public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
-        # When an algorithm requires a user agent to prescan a byte stream to 
-        #   determine its encoding, given some defined end condition, then it 
-        #   must run the following steps. 
-        # These steps either abort unsuccessfully or return a character 
-        #   encoding. If at any point during these steps (including during 
-        #   instances of the get an attribute algorithm invoked by this one) 
-        #   the user agent either runs out of bytes (meaning the position 
-        #   pointer created in the first step below goes beyond the end of the 
-        #   byte stream obtained so far) or reaches its end condition, then 
-        #   abort the prescan a byte stream to determine its encoding 
-        #   algorithm unsuccessfully.
+        # When an algorithm requires a user agent to prescan a byte stream
+        #   to determine its encoding, given some defined end condition,
+        #   then it must run the following steps.
+        # If at any point during these steps (including during instances
+        #   of the get an attribute algorithm invoked by this one) the
+        #   user agent either runs out of bytes (meaning the position
+        #   pointer created in the first step below goes beyond the end
+        #   of the byte stream obtained so far) or reaches its end condition,
+        #   then abort the prescan a byte stream to determine its encoding
+        #   algorithm and return the result get an XML encoding applied to
+        #   the same bytes that the prescan a byte stream to determine its
+        #   encoding algorithm was applied to. Otherwise, these steps will
+        #   return a character encoding.
        $s = substr($data, 0, $endAfter);
        $endAfter = strlen($s);

+        # Let fallback encoding be null.
+        // NOTE: This is never used
        # Let position be a pointer to a byte in the input byte stream, 
        #   initially pointing at the first byte.
        $pos = 0;
-        
+
+        # Prescan for UTF-16 XML declarations: If position points to:
+        # A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x')
+        if (substr($s, 0, 6) === "\x3C\x00\x3F\x00\x78\x00") {
+            # Return UTF-16LE.
+            return "UTF-16LE";
+        }
+        # A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x')
+        if (substr($s, 0, 6) === "\x00\x3C\x00\x3F\x00\x78") {
+            # Return UTF-16BE.
+            return "UTF-16BE";
+        }
+
        # Loop: If position points to:
        while ($pos < $endAfter) {
            // OPTIMIZATION: Start my skipping anything not a less-than sign
@ -273,7 +289,90 @@ abstract class Charset {
                $pos++;
            }
        }
-        return null;
+        return static::fromXMLDeclaration($data, $endAfter);
+    }
+
+    /**
+     * Determines an encoding from an XML declaration at the start of a byte stream
+     *
+     * This follows the "get an XML encoding" steps quoted from the HTML
+     *   specification in the comments below.
+     *
+     * @param string $data The byte stream to examine
+     * @param int $endAfter The number of bytes of the string to stop after
+     * @return string|null The result of static::fromCharset() for the declared label (with UTF-16BE/LE replaced by UTF-8), or null if no usable declaration is found
+     */
+    protected static function fromXMLDeclaration(string $data, int $endAfter): ?string {
+        # When the prescan a byte stream to determine its encoding algorithm
+        #   is aborted without returning an encoding, get an XML encoding
+        #   means doing this.
+        $s = substr($data, 0, $endAfter);
+
+        # Let encodingPosition be a pointer to the start of the stream.
+        $pos = 0;
+        # If encodingPosition does not point to the start of a byte sequence
+        #   0x3C, 0x3F, 0x78, 0x6D, 0x6C (`<?xml`), then return failure.
+        if (substr($s, $pos, 5) !== "<?xml") {
+            return null;
+        }
+        # Let xmlDeclarationEnd be a pointer to the next byte in the input
+        #   byte stream which is 0x3E (>). If there is no such byte,
+        #   then return failure.
+        $xmlDeclarationEnd = strpos($s, ">");
+        if ($xmlDeclarationEnd === false) {
+            return null;
+        }
+        # Set encodingPosition to the position of the first occurrence of the
+        #   subsequence of bytes 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E,
+        #   0x67 (`encoding`) at or after the current encodingPosition. If
+        #   there is no such sequence, then return failure.
+        // NOTE: This is buggy; see https://github.com/whatwg/html/issues/6939
+        //   We additionally require the keyword to appear before the end of
+        //   the declaration
+        $pos = strpos($s, "encoding");
+        if ($pos === false || $pos > $xmlDeclarationEnd) {
+            return null;
+        }
+        # Advance encodingPosition past the 0x67 (g) byte.
+        $pos = $pos + strlen("encoding");
+        // NOTE: From this point until the quote mark is consumed the position
+        //   can never move past xmlDeclarationEnd: the ">" byte found there is
+        //   greater than 0x20 and is neither "=" nor a quote mark, so it
+        //   terminates every loop and fails every check below. No explicit
+        //   bounds checks are therefore required
+        # While the byte at encodingPosition is less than or equal to 0x20
+        #   (i.e., it is either an ASCII space or control character),
+        #   advance encodingPosition to the next byte.
+        while (ord($s[$pos]) <= 0x20) {
+            $pos++;
+        }
+        # If the byte at encodingPosition is not 0x3D (=), then return failure.
+        if ($s[$pos] !== "=") {
+            return null;
+        }
+        # Advance encodingPosition to the next byte.
+        // NOTE: Without this step the position would remain on the "=" byte
+        //   and the quote mark check below could never succeed
+        $pos++;
+        # While the byte at encodingPosition is less than or equal to 0x20
+        #   (i.e., it is either an ASCII space or control character),
+        #   advance encodingPosition to the next byte.
+        while (ord($s[$pos]) <= 0x20) {
+            $pos++;
+        }
+        # Let quoteMark be the byte at encodingPosition.
+        $quoteMark = $s[$pos];
+        # If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
+        if ($quoteMark !== "'" && $quoteMark !== '"') {
+            return null;
+        }
+        # Advance encodingPosition to the next byte.
+        $pos++;
+        # Let encodingEndPosition be the position of the next occurrence of
+        #   quoteMark at or after encodingPosition. If quoteMark does not
+        #   occur again, then return failure.
+        $encodingEndPosition = strpos($s, $quoteMark, $pos);
+        if ($encodingEndPosition === false) {
+            return null;
+        }
+        # Let potentialEncoding be the sequence of the bytes between
+        #   encodingPosition (inclusive) and encodingEndPosition (exclusive).
+        $potentialEncoding = substr($s, $pos, $encodingEndPosition - $pos);
+        # If potentialEncoding contains one or more bytes whose byte value
+        #   is 0x20 or below, then return failure.
+        if (preg_match('/[\x{00}-\x{20}]/', $potentialEncoding)) {
+            return null;
+        }
+        # Let encoding be the result of getting an encoding given
+        #   potentialEncoding isomorphic decoded.
+        // NOTE: Isomorphic decoding is not necessary since all encoding labels are ASCII
+        $encoding = static::fromCharset($potentialEncoding);
+        # If the encoding is UTF-16BE/LE, then change it to UTF-8.
+        if ($encoding === "UTF-16LE" || $encoding === "UTF-16BE") {
+            $encoding = "UTF-8";
+        }
+        # Return encoding.
+        return $encoding;
    }

    /** Scans an attribute during the encoding detection pre-scan */
--- a/tests/cases/TestTreeConstructor.php
+++ b/tests/cases/TestTreeConstructor.php
@ -40,7 +40,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
    }

    /** @dataProvider provideStandardTreeTests */
-    public function xtestStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void {
+    public function testStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void {
        $config = new Config;
        $config->htmlNamespace = true;
        $this->runTreeTest($data, $exp, $errors, $fragment, $config);