From e26af87ee4d02e2b923e7c5d34f49f9bf9ee0806 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Sun, 10 Oct 2021 09:36:38 -0400
Subject: [PATCH] Tests for XML declaration-based encoding detection

Also adds a setting for controlling the number of pre-scan bytes
---
 RoboFile.php                        | 18 ++++++++---
 lib/Parser.php                      |  2 +-
 lib/Parser/Charset.php              |  3 +-
 lib/Parser/Config.php               |  2 ++
 lib/Parser/Data.php                 |  8 +++--
 lib/Parser/Tokenizer.php            |  2 +-
 tests/cases/TestCharset.php         | 48 +++++++++++++++++++++++++++++
 tests/cases/TestTokenizer.php       |  5 ++-
 tests/cases/TestTreeConstructor.php |  4 ++-
 tests/platform-tests                |  1 +
 10 files changed, 80 insertions(+), 13 deletions(-)
 create mode 160000 tests/platform-tests

diff --git a/RoboFile.php b/RoboFile.php
index cbf9742..19fd031 100644
--- a/RoboFile.php
+++ b/RoboFile.php
@@ -54,12 +54,20 @@ class RoboFile extends \Robo\Tasks {
 
     /** Manually updates the imported html5lib test suite */
     public function testUpdate(): Result {
-        $dir = BASE_TEST."html5lib-tests";
-        if (is_dir($dir)) {
-            return $this->taskGitStack()->dir($dir)->pull()->run();
-        } else {
-            return $this->taskGitStack()->cloneRepo("https://github.com/html5lib/html5lib-tests", $dir)->run();
+        $repos = [
+            'html5lib-tests' => "https://github.com/html5lib/html5lib-tests",
+            'platform-tests' => "https://github.com/web-platform-tests/wpt",
+        ];
+        $c = $this->collectionBuilder();
+        foreach ($repos as $dir => $url) {
+            $dir = BASE_TEST.$dir;
+            if (is_dir($dir)) {
+                $c->addTask($this->taskGitStack()->dir($dir)->pull());
+            } else {
+                $c->addTask($this->taskGitStack()->cloneRepo($url, $dir));
+            }
         }
+        return $c->run();
     }
 
     /** Produces a code coverage report
diff --git a/lib/Parser.php b/lib/Parser.php
index a56ac4a..adc275f 100644
--- a/lib/Parser.php
+++ b/lib/Parser.php
@@ -48,7 +48,7 @@ class Parser {
         // Initialize the various classes needed for parsing
         $document = $document ?? new \DOMDocument;
         $errorHandler = $config->errorCollection ? new ParseError : null;
-        $decoder = new Data($data, $encodingOrContentType, $errorHandler, $config->encodingFallback);
+        $decoder = new Data($data, $encodingOrContentType, $errorHandler, $config);
         $stack = new OpenElementsStack($htmlNamespace, $fragmentContext);
         $tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
         $tokenList = $tokenizer->tokenize();
diff --git a/lib/Parser/Charset.php b/lib/Parser/Charset.php
index 62275de..9e428bb 100644
--- a/lib/Parser/Charset.php
+++ b/lib/Parser/Charset.php
@@ -331,7 +331,8 @@ abstract class Charset {
             $pos++;
         }
         # If the byte at encodingPosition is not 0x3D (=), then return failure.
-        if ($s[$pos] !== "=") {
+        // NOTE: This is also buggy: see https://github.com/whatwg/html/issues/7193
+        if ($s[$pos++] !== "=") {
             return null;
         }
         # While the byte at encodingPosition is less than or equal to 0x20
diff --git a/lib/Parser/Config.php b/lib/Parser/Config.php
index c088ed8..979c8eb 100644
--- a/lib/Parser/Config.php
+++ b/lib/Parser/Config.php
@@ -9,6 +9,8 @@ namespace MensBeam\HTML\Parser;
 class Config {
     /** @var ?string The fallback encoding used when no encoding is provided or can be detected for the document. See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding:implementation-defined for guidance */
     public $encodingFallback = null;
+    /** @var ?int The number of bytes to examine during encoding pre-scan. 1024 is the default and recommended value */
+    public $encodingPrescanBytes = null;
     /** @var ?bool Whether parse errors should be recorded. Recording parse errors incurs a performance penalty. */
     public $errorCollection = null;
     /** @var ?bool Whether to use the HTML namespace rather than the null namespace for HTML elements. Using the HTML namespace is the correct behaviour, but this has performance and compatibility implications for PHP */
diff --git a/lib/Parser/Data.php b/lib/Parser/Data.php
index e7e439f..e2ee7c0 100644
--- a/lib/Parser/Data.php
+++ b/lib/Parser/Data.php
@@ -46,11 +46,13 @@ class Data {
     public const WHITESPACE_SAFE = "\t\x0C "; // "safe" excludes line breaks, as those require extra processing
 
 
-    public function __construct(string $data, ?string $encodingOrContentType, ?ParseError $errorHandler,  ?string $fallbackEncoding) {
+    public function __construct(string $data, ?string $encodingOrContentType, ?ParseError $errorHandler, ?Config $config) {
         $this->string = $data;
         $this->errorHandler = $errorHandler;
+        $config = $config ?? new Config;
         $encodingOrContentType = (string) $encodingOrContentType;
-        $fallbackEncoding = (string) $fallbackEncoding;
+        $prescanBytes = (int) ($config->encodingPrescanBytes ?? 1024);
+        $fallbackEncoding = (string) $config->encodingFallback;
         // don't track the current line/column position if error reporting has been suppressed
         $this->track = (bool) $this->errorHandler;
 
@@ -76,7 +78,7 @@ class Data {
             # If the transport layer specifies a character encoding, and it is
             #   supported, return that encoding with the confidence certain.
             $this->encodingCertain = true;
-        } elseif ($encoding = Charset::fromPrescan($data)) {
+        } elseif ($encoding = Charset::fromPrescan($data, $prescanBytes)) {
             # Optionally prescan the byte stream to determine its encoding.
             # The aforementioned algorithm either aborts unsuccessfully or
             #   returns a character encoding. If it returns a character
diff --git a/lib/Parser/Tokenizer.php b/lib/Parser/Tokenizer.php
index 06526b9..8307116 100644
--- a/lib/Parser/Tokenizer.php
+++ b/lib/Parser/Tokenizer.php
@@ -506,7 +506,7 @@ class Tokenizer {
                     # Create a comment token whose data is the empty string.
                     # Reconsume in the bogus comment state.
                     $this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
-                    $token = new ProcessingInstructionToken('');
+                    $token = new ProcessingInstructionToken("");
                     $this->state = self::BOGUS_COMMENT_STATE;
                     goto Reconsume;
                 }
diff --git a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php
index a0082f9..5acb515 100644
--- a/tests/cases/TestCharset.php
+++ b/tests/cases/TestCharset.php
@@ -6,7 +6,9 @@
 declare(strict_types=1);
 namespace MensBeam\HTML\TestCase;
 
+use MensBeam\HTML\Parser;
 use MensBeam\HTML\Parser\Charset;
+use MensBeam\HTML\Parser\Config;
 
 /** 
  * @covers \MensBeam\HTML\Parser\Charset
@@ -104,4 +106,50 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
             }
         }
     }
+
+    /** @dataProvider provideStandardDeclarationTests */
+    public function testStandardDeclarationTests(string $file, ?string $charset, string $exp): void {
+        // the provider supplies bare file names; the documents themselves are read from the "support" directory
+        $path = \MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/support/".$file;
+        $options = new Config;
+        $options->encodingPrescanBytes = 2048;
+        $result = Parser::parse(file_get_contents($path), $charset, null, null, null, $options);
+        $this->assertSame($exp, $result->encoding);
+    }
+
+    public function provideStandardDeclarationTests(): iterable {
+        $tests = [];
+        $blacklist = ["xmldecl-3.html"]; // index files which are excluded from the run
+        $files = new \AppendIterator();
+        $files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/*.htm*", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
+        foreach ($files as $file) {
+            if (!in_array(basename($file), $blacklist, true)) { // strict: names must match as strings, without type juggling
+                $tests[] = $file;
+            }
+        }
+        return $this->makeDeclarationTests(...$tests);
+    }
+
+    /** Yields [support file name, transport-layer charset or null, expected encoding] for each iframe of the given index files */
+    protected function makeDeclarationTests(string ...$file): iterable {
+        foreach ($file as $f) {
+            $d = new \DOMDocument;
+            @$d->loadHTMLFile($f);
+            foreach ($d->getElementsByTagName("div") as $div) {
+                $exp = $div->getAttribute("class"); // the class of the enclosing div is used as the expected encoding
+                foreach ($div->getElementsByTagName("iframe") as $frame) {
+                    $test = \MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/".$frame->getAttribute("src");
+                    // start from null for every frame so that a charset matched for an earlier frame can never leak into this one
+                    $charset = null;
+                    // a sibling ".headers" file, when present, must declare a charset; otherwise no charset is passed along
+                    if (file_exists($test.".headers")) {
+                        $h = file_get_contents($test.".headers");
+                        $charset = preg_match('/^Content-Type:\s*text\/html;\s*charset=(\S+)\s*$/Dis', $h, $m) ? $m[1] : null;
+                        assert(isset($charset), new \Exception("Header file associated with $test has no charset"));
+                    }
+                    yield [basename($test), $charset, $exp];
+                }
+            }
+        }
+    }
 }
diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php
index d99cd98..f76f1a6 100644
--- a/tests/cases/TestTokenizer.php
+++ b/tests/cases/TestTokenizer.php
@@ -13,6 +13,7 @@ use MensBeam\HTML\Parser\ParseError;
 use MensBeam\HTML\Parser\Tokenizer;
 use MensBeam\HTML\Parser\CharacterToken;
 use MensBeam\HTML\Parser\CommentToken;
+use MensBeam\HTML\Parser\Config;
 use MensBeam\HTML\Parser\DOCTYPEToken;
 use MensBeam\HTML\Parser\EndTagToken;
 use MensBeam\HTML\Parser\NullCharacterToken;
@@ -44,6 +45,8 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
 
     /** @dataProvider provideStandardTokenizerTests */
     public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) {
+        $config = new Config;
+        $config->encodingFallback = "UTF-8";
         $errorHandler = new ParseError;
         // initialize a stack of open elements, possibly with an open element
         $stack = new OpenElementsStack(null);
@@ -51,7 +54,7 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
             $stack[] = (new \DOMDocument)->createElement($open);
         }
         // initialize the data stream and tokenizer
-        $data = new Data($input, "UTF-8", $errorHandler, "UTF-8");
+        $data = new Data($input, "UTF-8", $errorHandler, $config);
         $tokenizer = new Tokenizer($data, $stack, $errorHandler);
         $tokenizer->state = $state;
         // perform the test
diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php
index f7c7c8f..df72d1e 100644
--- a/tests/cases/TestTreeConstructor.php
+++ b/tests/cases/TestTreeConstructor.php
@@ -66,6 +66,8 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
     }
 
     protected function runTreeTest(string $data, array $exp, array $errors, ?string $fragment, ?Config $config): void {
+        $config = $config ?? new Config;
+        $config->encodingFallback = "UTF-8";
         $this->ns = ($config && $config->htmlNamespace);
         $htmlNamespace = ($this->ns) ? Parser::HTML_NAMESPACE : null;
         // certain tests need to be patched to ignore unavoidable limitations of PHP's DOM
@@ -89,7 +91,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
         }
         // initialize the other classes we need
         $errorHandler = new ParseError;
-        $decoder = new Data($data, "UTF-8", $errorHandler, "UTF-8");
+        $decoder = new Data($data, "UTF-8", $errorHandler, $config);
         $stack = new OpenElementsStack($htmlNamespace, $fragmentContext);
         $tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
         $tokenList = $tokenizer->tokenize();
diff --git a/tests/platform-tests b/tests/platform-tests
new file mode 160000
index 0000000..b1147c3
--- /dev/null
+++ b/tests/platform-tests
@@ -0,0 +1 @@
+Subproject commit b1147c3f001cc8cbb0c9922779adc39f4326e23e