Add standard charset detection tests

- Various new tests needed for full coverage, noted in comment - Byte Order Mark detection method added - Japanese encodings not yet supported, so tests marked incomplete - Tests requiring scripting suppressed
4 years ago · 164e5ff1e8
4 changed files with 88 additions and 7 deletions
--- a/lib/Charset.php
+++ b/lib/Charset.php
@ -5,7 +5,19 @@ namespace dW\HTML5;
 use MensBeam\Intl\Encoding;

 abstract class Charset {
-    /** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
+    /** Detects an encoding from a Unicode byte order mark at the start of a string.
+     *
+     * Returns "UTF-8", "UTF-16BE", or "UTF-16LE" when the string begins with
+     * the corresponding byte order mark, and null otherwise (including when
+     * the string is too short to contain one).
+     *
+     * @param string $data The string to inspect
+     */
+    public static function fromBOM(string $data): ?string {
+        // compare leading substrings rather than individual byte offsets so
+        //   that empty and one-byte input never triggers an out-of-range read
+        $head = substr($data, 0, 2);
+        // "\u{FEFF}" is the UTF-8 serialization of U+FEFF, i.e. the bytes EF BB BF
+        if (substr($data, 0, 3) === "\u{FEFF}") {
+            return "UTF-8";
+        } elseif ($head === "\xFE\xFF") {
+            return "UTF-16BE";
+        } elseif ($head === "\xFF\xFE") {
+            return "UTF-16LE";
+        }
+        return null;
+    }
+
+    /** Matches an encoding label (e.g. "utf-8") to its canonical name.
     * 
     * @param string $value The encoding label to match
     */
@ -18,7 +30,7 @@ abstract class Charset {
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
-     * and returns the class name of a compatible decoder.
+     * and returns the associated canonical encoding name.
     * 
     * @param string $contentType The value of a Content-Type header-field
     */
@ -80,6 +92,11 @@ abstract class Charset {
        return null;
    }

+    /** Inspects the head of an HTML string to guess its encoding
+     * 
+     * @param string $data The HTML string to scan
+     * @param int $endAfter The number of bytes of the string to stop after 
+     */
    public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
        # When an algorithm requires a user agent to prescan a byte stream to 
        #   determine its encoding, given some defined end condition, then it 
@ -93,6 +110,7 @@ abstract class Charset {
        #   abort the prescan a byte stream to determine its encoding 
        #   algorithm unsuccessfully.
        $s = substr($data, 0, $endAfter);
+        $endAfter = strlen($s);

        # Let position be a pointer to a byte in the input byte stream, 
        #   initially pointing at the first byte.
@ -187,7 +205,7 @@ abstract class Charset {
                        continue;
                    }
                    # If charset is a UTF-16 encoding, then set charset to UTF-8.
-                    elseif ($charset === "UTF-16") {
+                    elseif ($charset === "UTF-16" || $charset === "UTF-16LE" || $charset === "UTF-16BE") {
                        $charset = "UTF-8";
                    }
                    # If charset is x-user-defined, then set charset to windows-1252.
@ -203,7 +221,7 @@ abstract class Charset {
                elseif (($s[$pos] === "/" && ctype_alpha($s[$pos + 1])) || (ctype_alpha($s[$pos]))) {
                    # Advance the position pointer so that it points at the next 
                    #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
-                    while (!in_array(@$s[$pos++], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""]));
+                    while (!in_array(@$s[++$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""]));
                    # Repeatedly get an attribute until no further attributes can be found, 
                    #   then jump to the step below labeled next byte.
                    while(self::getAttribute($s, $pos));
@ -223,8 +241,10 @@ abstract class Charset {
                $pos++;
            }
        }
+        return null;
    }

+    /** Scans an attribute during the encoding detection pre-scan */
    protected static function getAttribute(string $s, &$pos): array {
        # When the prescan a byte stream to determine its encoding 
        #   algorithm says to get an attribute, it means doing this:
@ -378,6 +398,7 @@ abstract class Charset {
        }
    }

+    /** Interprets a quasi-Content-Type value during the encoding detection pre-scan */
    protected static function fromMeta(string $s): ?string {
        # The algorithm for extracting a character encoding from a meta element, 
        #   given a string s, is as follows.
--- a/lib/Data.php
+++ b/lib/Data.php
@ -44,7 +44,10 @@ class Data {
            $this->filePath = $filePath;
        }

-        if ($encoding = Charset::fromCharset($encodingOrContentType)) {
+        if ($encoding = Charset::fromBOM($data)) {
+            // encoding determined from Unicode byte order mark
+            $this->encodingCertain = true;
+        } elseif ($encoding = Charset::fromCharset($encodingOrContentType)) {
            $this->encodingCertain = true;
        } elseif ($encoding = Charset::fromTransport($encodingOrContentType)) {
            $this->encodingCertain = true;
--- a/tests/cases/TestCharset.php
+++ b/tests/cases/TestCharset.php
@ -2,6 +2,24 @@
 declare(strict_types=1);
 namespace dW\HTML5\TestCase;

+/* Missing tests:
+
+Pre-scan:
+
+- UTF-16LE and UTF-16BE BOM tests
+- Duplicate attributes
+- x-user-defined substitution
+- EOF after attribute name
+- Greater-than sign after equals sign
+- EOF after equals sign
+
+Meta parsing:
+
+- No equals sign after charset
+- EOF after equals sign
+
+*/
+
 use dW\HTML5\Charset;

 /** 
@ -44,4 +62,45 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
            ["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null],
        ];
    }
+    
+    /** Checks encoding detection against the cases supplied by the data provider
+     *
+     * Detection tries the byte order mark first, then the pre-scan over the
+     * whole input, and finally falls back to Windows-1252; encoding names
+     * are compared case-insensitively
+     *
+     * @dataProvider provideStandardEncodingTests
+     */
+    public function testStandardEncoderTests(string $input, string $exp) {
+        $expected = strtolower($exp);
+        $unsupported = ["euc-jp", "iso-2022-jp", "shift-jis"];
+        if (in_array($expected, $unsupported)) {
+            $this->markTestIncomplete("Japanese encodings are not yet implemented");
+        }
+        $detected = Charset::fromBOM($input);
+        if ($detected === null) {
+            $detected = Charset::fromPrescan($input, \PHP_INT_MAX);
+        }
+        if ($detected === null) {
+            $detected = "Windows-1252";
+        }
+        $this->assertSame($expected, strtolower($detected));
+    }
+
+    /** Supplies the test cases parsed from the html5lib-tests encoding files
+     *
+     * Every ".dat" file in the encoding directory is included unless its
+     * base name appears in the blacklist
+     */
+    public function provideStandardEncodingTests() {
+        $blacklist = [];
+        $pattern = \dW\HTML5\BASE."tests/html5lib-tests/encoding/*.dat";
+        $flags = \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME;
+        $files = [];
+        foreach (new \GlobIterator($pattern, $flags) as $path) {
+            if (in_array(basename($path), $blacklist)) {
+                continue;
+            }
+            $files[] = $path;
+        }
+        return $this->makeEncodingTests(...$files);
+    }
+
+    /** Parses encoding test files into data sets for a data provider
+     *
+     * Each test in a file consists of a "#data" line, the input lines, an
+     * "#encoding" line, and a line holding the expected encoding. Data sets
+     * are keyed "<file name> #<index>" and contain the input followed by
+     * the expected encoding
+     *
+     * @param string ...$file The paths of the files to parse
+     * @throws \RuntimeException If a file cannot be read or a test is cut short
+     */
+    protected function makeEncodingTests(string ...$file): iterable {
+        foreach ($file as $path) {
+            $f = basename($path);
+            $test = file($path);
+            if ($test === false) {
+                throw new \RuntimeException("Unable to read encoding test file $path");
+            }
+            $count = sizeof($test);
+            $l = 0;
+            $index = 0;
+            while ($l < $count) {
+                // find the next "#data" marker; lines trailing the last test
+                //   (e.g. blank lines) simply end the file
+                while ($l < $count && !preg_match("/^#data\s+$/", $test[$l])) {
+                    $l++;
+                }
+                if ($l >= $count) {
+                    break;
+                }
+                $l++;
+                $testId = "$f #".$index++;
+                // everything up to the "#encoding" marker is the test input
+                $data = "";
+                while ($l < $count && !preg_match("/^#encoding\s+$/", $test[$l])) {
+                    $data .= $test[$l++];
+                }
+                // both the marker and the line after it must exist
+                if ($l + 1 >= $count) {
+                    throw new \RuntimeException("Test $testId is missing its expected encoding");
+                }
+                $expected = trim($test[$l + 1]);
+                $l += 2;
+                if (in_array($testId, ["tests1.dat #54", "tests1.dat #55"])) {
+                    continue;
+                }
+                yield $testId => [$data, $expected];
+            }
+        }
+    }
 }
--- a/tests/cases/TestTokenizer.php
+++ b/tests/cases/TestTokenizer.php
@ -66,8 +66,6 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
                }
            } while (!($t instanceof EOFToken));
        } finally {
-            //$expErrors = $expErrors ? array_column($expErrors, "code") : [];
-            //$errors = $errors ? array_column($errors, "code") : [];
            $actual = $this->combineCharacterTokens($actual);
            $this->assertEquals($expected, $actual, $tokenizer->debugLog);
            $this->assertEquals($expErrors, $errors, $tokenizer->debugLog);