Browse Source

Add standard charset detection tests

- Various new tests needed for full coverage, noted in comment
- Byte Order Mark detection methopd added
- Japanese encodings nt yet supported, so tests marked incomplete
- Tests requiring scripting suppressed
ns
J. King 4 years ago
parent
commit
164e5ff1e8
  1. 29
      lib/Charset.php
  2. 5
      lib/Data.php
  3. 59
      tests/cases/TestCharset.php
  4. 2
      tests/cases/TestTokenizer.php

29
lib/Charset.php

@ -5,7 +5,19 @@ namespace dW\HTML5;
use MensBeam\Intl\Encoding;
abstract class Charset {
/** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
public static function fromBOM(string $data): ?string {
if (substr($data, 0, 3 ) === "\u{FEFF}") {
return "UTF-8";
} elseif ($data[0] === "\xFE" && $data[1] === "\xFF") {
return "UTF-6BE";
} elseif ($data[0] === "\xFF" && $data[1] === "\xFE") {
return "UTF-6LE";
} else {
return null;
}
}
/** Matches an encoding label (e.g. "utf-8") to its canonical name.
*
* @param string $value The encoding label to match
*/
@ -18,7 +30,7 @@ abstract class Charset {
}
/** Extracts an encoding from an HTTP Content-Type header-field
* and returns the class name of a compatible decoder.
* and returns the associated canonical encoding name.
*
* @param string $contentType The value of a Content-Type header-field
*/
@ -80,6 +92,11 @@ abstract class Charset {
return null;
}
/** Inspects the head of an HTML string to guess its encoding
*
* @param string $data The HTML string to scan
* @param int $endAfter The number of bytes of the string to stop after
*/
public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
# When an algorithm requires a user agent to prescan a byte stream to
# determine its encoding, given some defined end condition, then it
@ -93,6 +110,7 @@ abstract class Charset {
# abort the prescan a byte stream to determine its encoding
# algorithm unsuccessfully.
$s = substr($data, 0, $endAfter);
$endAfter = strlen($s);
# Let position be a pointer to a byte in the input byte stream,
# initially pointing at the first byte.
@ -187,7 +205,7 @@ abstract class Charset {
continue;
}
# If charset is a UTF-16 encoding, then set charset to UTF-8.
elseif ($charset === "UTF-16") {
elseif ($charset === "UTF-16" || $charset === "UTF-16LE" || $charset === "UTF-16BE") {
$charset = "UTF-8";
}
# If charset is x-user-defined, then set charset to windows-1252.
@ -203,7 +221,7 @@ abstract class Charset {
elseif (($s[$pos] === "/" && ctype_alpha($s[$pos + 1])) || (ctype_alpha($s[$pos]))) {
# Advance the position pointer so that it points at the next
# 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
while (!in_array(@$s[$pos++], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""]));
while (!in_array(@$s[++$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""]));
# Repeatedly get an attribute until no further attributes can be found,
# then jump to the step below labeled next byte.
while(self::getAttribute($s, $pos));
@ -223,8 +241,10 @@ abstract class Charset {
$pos++;
}
}
return null;
}
/** Scans an attribute during the encoding detection pre-scan */
protected static function getAttribute(string $s, &$pos): array {
# When the prescan a byte stream to determine its encoding
# algorithm says to get an attribute, it means doing this:
@ -378,6 +398,7 @@ abstract class Charset {
}
}
/** Interprets a quasi-Content-Type value during the encoding detection pre-scan */
protected static function fromMeta(string $s): ?string {
# The algorithm for extracting a character encoding from a meta element,
# given a string s, is as follows.

5
lib/Data.php

@ -44,7 +44,10 @@ class Data {
$this->filePath = $filePath;
}
if ($encoding = Charset::fromCharset($encodingOrContentType)) {
if ($encoding = Charset::fromBOM($data)) {
// encoding determined from Unicode byte order mark
$this->encodingCertain = true;
} elseif ($encoding = Charset::fromCharset($encodingOrContentType)) {
$this->encodingCertain = true;
} elseif ($encoding = Charset::fromTransport($encodingOrContentType)) {
$this->encodingCertain = true;

59
tests/cases/TestCharset.php

@ -2,6 +2,24 @@
declare(strict_types=1);
namespace dW\HTML5\TestCase;
/* Missing tests:
Pre-scan:
- UTF-16LE and UTF-16BE BOM tests
- Duplicate attributes
- x-user-defined substitution
- EOF after attribute name
- Greater-than sign after equals sign
- EOF after equals sign
Meta parsing:
- No equals sign after charset
- EOF after equals sign
*/
use dW\HTML5\Charset;
/**
@ -44,4 +62,45 @@ class TestCharset extends \PHPUnit\Framework\TestCase {
["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null],
];
}
/** @dataProvider provideStandardEncodingTests */
public function testStandardEncoderTests(string $input, string $exp) {
$exp = strtolower($exp);
if (in_array($exp, ["euc-jp", "iso-2022-jp", "shift-jis"])) {
$this->markTestIncomplete("Japanese encodings are not yet implemented");
}
$this->assertSame(strtolower($exp), strtolower(Charset::fromBOM($input)?? Charset::fromPrescan($input, \PHP_INT_MAX) ?? "Windows-1252"));
}
public function provideStandardEncodingTests() {
$tests = [];
$blacklist = [];
foreach (new \GlobIterator(\dW\HTML5\BASE."tests/html5lib-tests/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
if (!in_array(basename($file), $blacklist)) {
$tests[] = $file;
}
}
return $this->makeEncodingTests(...$tests);
}
protected function makeEncodingTests(string ...$file): iterable {
foreach ($file as $path) {
$f = basename($path);
$test = file($path);
$l = 0;
$index = 0;
while ($l < sizeof($test)) {
$testId = "$f #".$index++;
$data = "";
while (!preg_match("/^#data\s+$/", $test[$l++]));
while (!preg_match("/^#encoding\s+$/", ($line = $test[$l++]))) {
$data .= $line;
}
if (in_array($testId,["tests1.dat #54", "tests1.dat #55"])) {
continue;
}
yield $testId => [$data, trim($test[$l++])];
}
}
}
}

2
tests/cases/TestTokenizer.php

@ -66,8 +66,6 @@ class TestTokenizer extends \PHPUnit\Framework\TestCase {
}
} while (!($t instanceof EOFToken));
} finally {
//$expErrors = $expErrors ? array_column($expErrors, "code") : [];
//$errors = $errors ? array_column($errors, "code") : [];
$actual = $this->combineCharacterTokens($actual);
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
$this->assertEquals($expErrors, $errors, $tokenizer->debugLog);

Loading…
Cancel
Save