Browse Source

Tests for XML declaration-based encoding detection

Also adds a setting for controlling the number of pre-scan bytes
serialize
J. King 3 years ago
parent
commit
e26af87ee4
  1. 18
      RoboFile.php
  2. 2
      lib/Parser.php
  3. 3
      lib/Parser/Charset.php
  4. 2
      lib/Parser/Config.php
  5. 8
      lib/Parser/Data.php
  6. 2
      lib/Parser/Tokenizer.php
  7. 48
      tests/cases/TestCharset.php
  8. 5
      tests/cases/TestTokenizer.php
  9. 4
      tests/cases/TestTreeConstructor.php
  10. 1
      tests/platform-tests

18
RoboFile.php

@ -54,12 +54,20 @@ class RoboFile extends \Robo\Tasks {
/** Manually updates the imported html5lib test suite */
public function testUpdate(): Result {
$dir = BASE_TEST."html5lib-tests";
if (is_dir($dir)) {
return $this->taskGitStack()->dir($dir)->pull()->run();
} else {
return $this->taskGitStack()->cloneRepo("https://github.com/html5lib/html5lib-tests", $dir)->run();
$repos = [
'html5lib-tests' => "https://github.com/html5lib/html5lib-tests",
'platform-tests' => "https://github.com/web-platform-tests/wpt",
];
$c = $this->collectionBuilder();
foreach ($repos as $dir => $url) {
$dir = BASE_TEST.$dir;
if (is_dir($dir)) {
$c->addTask($this->taskGitStack()->dir($dir)->pull());
} else {
$c->addTask($this->taskGitStack()->cloneRepo($url, $dir));
}
}
return $c->run();
}
/** Produces a code coverage report

2
lib/Parser.php

@ -48,7 +48,7 @@ class Parser {
// Initialize the various classes needed for parsing
$document = $document ?? new \DOMDocument;
$errorHandler = $config->errorCollection ? new ParseError : null;
$decoder = new Data($data, $encodingOrContentType, $errorHandler, $config->encodingFallback);
$decoder = new Data($data, $encodingOrContentType, $errorHandler, $config);
$stack = new OpenElementsStack($htmlNamespace, $fragmentContext);
$tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
$tokenList = $tokenizer->tokenize();

3
lib/Parser/Charset.php

@ -331,7 +331,8 @@ abstract class Charset {
$pos++;
}
# If the byte at encodingPosition is not 0x3D (=), then return failure.
if ($s[$pos] !== "=") {
// NOTE: This is also buggy: see https://github.com/whatwg/html/issues/7193
if ($s[$pos++] !== "=") {
return null;
}
# While the byte at encodingPosition is less than or equal to 0x20

2
lib/Parser/Config.php

@ -9,6 +9,8 @@ namespace MensBeam\HTML\Parser;
class Config {
/** @var ?string The fallback encoding used when no encoding is provided or can be detected for the document. See https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding:implementation-defined for guidance */
public $encodingFallback = null;
/** @var ?int The number of bytes to examine during encoding pre-scan. 1024 is the default and recommended value */
public $encodingPrescanBytes = null;
/** @var ?bool Whether parse errors should be recorded. Recording parse errors incurs a performance penalty. */
public $errorCollection = null;
/** @var ?bool Whether to use the HTML namespace rather than the null namespace for HTML elements. Using the HTML namespace is the correct behaviour, but this has performance and compatibility implications for PHP */

8
lib/Parser/Data.php

@ -46,11 +46,13 @@ class Data {
public const WHITESPACE_SAFE = "\t\x0C "; // "safe" excludes line breaks, as those require extra processing
public function __construct(string $data, ?string $encodingOrContentType, ?ParseError $errorHandler, ?string $fallbackEncoding) {
public function __construct(string $data, ?string $encodingOrContentType, ?ParseError $errorHandler, ?Config $config) {
$this->string = $data;
$this->errorHandler = $errorHandler;
$config = $config ?? new Config;
$encodingOrContentType = (string) $encodingOrContentType;
$fallbackEncoding = (string) $fallbackEncoding;
$prescanBytes = (int) ($config->encodingPrescanBytes ?? 1024);
$fallbackEncoding = (string) $config->encodingFallback;
// don't track the current line/column position if error reporting has been suppressed
$this->track = (bool) $this->errorHandler;
@ -76,7 +78,7 @@ class Data {
# If the transport layer specifies a character encoding, and it is
# supported, return that encoding with the confidence certain.
$this->encodingCertain = true;
} elseif ($encoding = Charset::fromPrescan($data)) {
} elseif ($encoding = Charset::fromPrescan($data, $prescanBytes)) {
# Optionally prescan the byte stream to determine its encoding.
# The aforementioned algorithm either aborts unsuccessfully or
# returns a character encoding. If it returns a character

2
lib/Parser/Tokenizer.php

@ -506,7 +506,7 @@ class Tokenizer {
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
$token = new ProcessingInstructionToken('');
$token = new ProcessingInstructionToken("");
$this->state = self::BOGUS_COMMENT_STATE;
goto Reconsume;
}

48
tests/cases/TestCharset.php

@ -6,7 +6,9 @@
declare(strict_types=1);
namespace MensBeam\HTML\TestCase;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\Charset;
use MensBeam\HTML\Parser\Config;
/**
* @covers \MensBeam\HTML\Parser\Charset
@ -104,4 +106,50 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
}
}
}
/**
 * Parses one document from the imported platform tests and verifies the
 * encoding reported for the resulting document.
 *
 * @dataProvider provideStandardDeclarationTests
 *
 * @param string $file The file name of the test document, relative to the "xmldecl/support/" directory
 * @param ?string $charset The encoding passed to the parser as transport-layer information, or null if none
 * @param string $exp The encoding the parsed document is expected to report
 */
public function testStandardDeclarationTests(string $file, ?string $charset, string $exp): void {
    $path = \MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/support/".$file;
    // examine more than the default 1024 bytes during the encoding pre-scan
    // NOTE(review): presumably some test documents place their declaration past the first 1024 bytes; confirm
    $settings = new Config;
    $settings->encodingPrescanBytes = 2048;
    $result = Parser::parse(file_get_contents($path), $charset, null, null, null, $settings);
    $this->assertSame($exp, $result->encoding);
}
/**
 * Provides data sets for the XML declaration encoding tests.
 *
 * Collects every index document matching "*.htm*" in the "xmldecl"
 * directory of the imported platform tests, leaves out any which are
 * blacklisted, and hands the remainder to makeDeclarationTests().
 *
 * @return iterable The data sets produced by makeDeclarationTests()
 */
public function provideStandardDeclarationTests() {
    // index documents which are excluded from testing, listed by base name
    $blacklist = ["xmldecl-3.html"];
    $files = new \AppendIterator();
    $files->append(new \GlobIterator(\MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/*.htm*", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME));
    $tests = [];
    foreach ($files as $file) {
        // compare strictly so that file names are never subject to loose type juggling
        if (in_array(basename($file), $blacklist, true)) {
            continue;
        }
        $tests[] = $file;
    }
    return $this->makeDeclarationTests(...$tests);
}
/**
 * Generates data sets from one or more platform-test index documents.
 *
 * Each index document is loaded as HTML. For every "iframe" element
 * found inside a "div" element one data set is produced, made up of:
 *
 * - the base name of the file referenced by the iframe's "src" attribute
 * - the charset declared in the accompanying ".headers" file, or null if
 *   there is no such file
 * - the value of the enclosing div's "class" attribute, which is used as
 *   the expected encoding
 *
 * @param string ...$file The paths of the index documents to examine
 * @return iterable A series of [file name, charset, expected encoding] arrays
 * @throws \Exception If a ".headers" file exists for a test document but no charset can be read from it
 */
protected function makeDeclarationTests(string ...$file): iterable {
    $base = \MensBeam\HTML\Parser\BASE."tests/platform-tests/html/syntax/xmldecl/";
    foreach ($file as $f) {
        $d = new \DOMDocument;
        // NOTE(review): warnings are suppressed presumably because libxml complains about the index documents' markup; confirm
        @$d->loadHTMLFile($f);
        foreach ($d->getElementsByTagName("div") as $div) {
            $exp = $div->getAttribute("class");
            foreach ($div->getElementsByTagName("iframe") as $frame) {
                $test = $base.$frame->getAttribute("src");
                // start every test from a clean slate so that a charset
                // found for an earlier test can never leak into this one
                $charset = null;
                if (file_exists($test.".headers")) {
                    $h = file_get_contents($test.".headers");
                    // fail explicitly rather than via assert(), which does nothing when assertions are disabled
                    if (!preg_match('/^Content-Type:\s*text\/html;\s*charset=(\S+)\s*$/Dis', $h, $m)) {
                        throw new \Exception("Header file associated with $test has no charset");
                    }
                    $charset = $m[1];
                }
                yield [basename($test), $charset, $exp];
            }
        }
    }
}
}

5
tests/cases/TestTokenizer.php

@ -13,6 +13,7 @@ use MensBeam\HTML\Parser\ParseError;
use MensBeam\HTML\Parser\Tokenizer;
use MensBeam\HTML\Parser\CharacterToken;
use MensBeam\HTML\Parser\CommentToken;
use MensBeam\HTML\Parser\Config;
use MensBeam\HTML\Parser\DOCTYPEToken;
use MensBeam\HTML\Parser\EndTagToken;
use MensBeam\HTML\Parser\NullCharacterToken;
@ -44,6 +45,8 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
/** @dataProvider provideStandardTokenizerTests */
public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) {
$config = new Config;
$config->encodingFallback = "UTF-8";
$errorHandler = new ParseError;
// initialize a stack of open elements, possibly with an open element
$stack = new OpenElementsStack(null);
@ -51,7 +54,7 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
$stack[] = (new \DOMDocument)->createElement($open);
}
// initialize the data stream and tokenizer
$data = new Data($input, "UTF-8", $errorHandler, "UTF-8");
$data = new Data($input, "UTF-8", $errorHandler, $config);
$tokenizer = new Tokenizer($data, $stack, $errorHandler);
$tokenizer->state = $state;
// perform the test

4
tests/cases/TestTreeConstructor.php

@ -66,6 +66,8 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
protected function runTreeTest(string $data, array $exp, array $errors, ?string $fragment, ?Config $config): void {
$config = $config ?? new Config;
$config->encodingFallback = "UTF-8";
$this->ns = ($config && $config->htmlNamespace);
$htmlNamespace = ($this->ns) ? Parser::HTML_NAMESPACE : null;
// certain tests need to be patched to ignore unavoidable limitations of PHP's DOM
@ -89,7 +91,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
// initialize the other classes we need
$errorHandler = new ParseError;
$decoder = new Data($data, "UTF-8", $errorHandler, "UTF-8");
$decoder = new Data($data, "UTF-8", $errorHandler, $config);
$stack = new OpenElementsStack($htmlNamespace, $fragmentContext);
$tokenizer = new Tokenizer($decoder, $stack, $errorHandler);
$tokenList = $tokenizer->tokenize();

1
tests/platform-tests

@ -0,0 +1 @@
Subproject commit b1147c3f001cc8cbb0c9922779adc39f4326e23e
Loading…
Cancel
Save