Start on character encoding detection

This commit is contained in:
J. King 2019-12-21 14:53:51 -05:00
parent 318d7bd7ad
commit 49f31015ac
7 changed files with 177 additions and 32 deletions

6
composer.lock generated
View file

@ -8,11 +8,11 @@
"packages": [
{
"name": "mensbeam/intl",
"version": "0.6.0",
"version": "0.7.0",
"source": {
"type": "git",
"url": "https://code.mensbeam.com/MensBeam/intl",
"reference": "200a310f72b086a63b84c4ca2af3b402852e3cce"
"reference": "c4a2ae17142a1846c841c5b5b9246e67d56c568e"
},
"require": {
"php": "^7.0"
@ -49,7 +49,7 @@
"utf-8",
"utf8"
],
"time": "2019-12-18T19:57:54+00:00"
"time": "2019-12-21T01:56:59+00:00"
}
],
"packages-dev": [

86
lib/Charset.php Normal file
View file

@ -0,0 +1,86 @@
<?php
declare(strict_types=1);
namespace dW\HTML5;
use MensBeam\Intl\Encoding;
abstract class Charset {
    /** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
     *
     * @param string $value The encoding label to match
     * @return string|null The class name of the matching decoder, or null if the label was not matched
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        if ($encoding) {
            return $encoding['class'];
        }
        return null;
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
     * and returns the class name of a compatible decoder.
     *
     * Only the first "charset" parameter is considered: if its value is not
     * a recognized encoding label, null is returned even when a later
     * "charset" parameter would have matched. Parameters are only looked for
     * after a "type/subtype" pair followed by a semicolon.
     *
     * The header is lower-cased and runs of whitespace are collapsed to a
     * single space before parsing (this also applies inside quoted values).
     *
     * @param string $contentType The value of a Content-Type header-field
     * @return string|null The class name of the matching decoder, or null if no usable charset parameter was found
     */
    public static function fromTransport(string $contentType): ?string {
        // Try to sniff out a charset from a Content-Type header-field.
        // This does cut some corners, but should be sufficient for practical use
        $s = preg_replace("/\s+/", " ", strtolower($contentType)) ?? "";
        $end = strlen($s);
        // skip the type: without a slash there is no valid media type
        $pos = strpos($s, "/");
        if ($pos === false) {
            return null;
        }
        // skip the subtype: without a semicolon there are no parameters
        $pos = strpos($s, ";", $pos + 1);
        if ($pos === false) {
            return null;
        }
        $pos++;
        // check parameters in sequence
        while ($pos < $end) {
            // skip any leading whitespace (runs were collapsed to one space above)
            if ($s[$pos] === " ") {
                $pos++;
            }
            if ($pos >= $end) {
                break;
            }
            // collect characters for the parameter name; a semicolon also ends
            // the name so that a value-less parameter cannot swallow the next one
            $length = strcspn($s, "=;", $pos);
            $param = substr($s, $pos, $length);
            $pos += $length;
            if ($pos >= $end) {
                // the header ended before any value was supplied
                break;
            }
            if ($s[$pos] === ";") {
                // parameter without a value: ignore it and move on
                $pos++;
                continue;
            }
            // skip the equals sign
            $pos++;
            $value = "";
            if ($pos < $end && $s[$pos] === '"') {
                // Value is a quoted-string
                $pos++;
                $closed = false;
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === '"') {
                        $closed = true;
                        break;
                    }
                    if ($c === "\\") {
                        // a backslash escapes the next character, if there is one
                        if ($pos < $end) {
                            $value .= $s[$pos++];
                        }
                    } else {
                        $value .= $c;
                    }
                }
                // only interpret the value if a closing quotation mark was seen
                if (!$closed) {
                    $value = "";
                }
            } elseif ($pos < $end) {
                // Value is a bare token, ended by a semicolon, a space, or the end of input
                $length = strcspn($s, "; ", $pos);
                $value = substr($s, $pos, $length);
                $pos += $length;
            }
            // if the parameter was the character set, interpret its value and return
            if ($param === "charset") {
                return self::fromCharset($value);
            }
            // discard anything left over up to and including the next semicolon,
            // so the next parameter name starts cleanly
            if ($pos < $end) {
                $pos += strcspn($s, ";", $pos) + 1;
            }
        }
        return null;
    }

    /** Placeholder for detecting an encoding by pre-scanning the document data.
     *
     * Not yet implemented: currently always returns null.
     *
     * @param string $data The document data to examine
     * @return string|null Always null at present
     */
    public static function fromPrescan(string $data): ?string {
        return null;
    }
}

View file

@ -2,11 +2,14 @@
declare(strict_types=1);
namespace dW\HTML5;
use MensBeam\Intl\Encoding;
class Data {
use ParseErrorEmitter;
// Used to get the file path for error reporting.
public $filePath;
public $encodingCertain = false;
// Internal storage for the Intl data object.
protected $data;
@ -31,7 +34,7 @@ class Data {
const WHITESPACE = "\t\n\x0c\x0d ";
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null) {
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null, string $encodingOrContentType = '') {
$this->errorHandler = $errorHandler ?? new ParseError;
if ($filePath !== 'STDIN') {
$this->filePath = realpath($filePath);
@ -40,13 +43,20 @@ class Data {
$this->filePath = $filePath;
}
// DEVIATION: The spec has steps for parsing and determining the character
// encoding. At this moment this implementation won't determine a character
// encoding and will just assume UTF-8.
$this->data = new \MensBeam\Intl\Encoding\UTF8($data, false, true);
if ($encoding = Charset::fromCharset($encodingOrContentType)) {
$this->encodingCertain = true;
} elseif ($encoding = Charset::fromTransport($encodingOrContentType)) {
$this->encodingCertain = true;
} elseif ($encoding = Charset::fromPrescan($data)) {
// Encoding is tentative
} else {
// Encoding is tentative; fall back to Windows-1252
$encoding = \MensBeam\Intl\Encoding\Windows1252::class;
}
$this->data = new $encoding($data, false, true);
}
public function consume(int $length = 1, $advancePointer = true): string {
assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

View file

@ -0,0 +1,48 @@
<?php
declare(strict_types=1);
namespace dW\HTML5\TestCase;
use dW\HTML5\Charset;
use MensBeam\Intl\Encoding\UTF8;
use MensBeam\Intl\Encoding\Windows1252;
/**
 * Checks the decoder class names that Charset reports for encoding labels
 * and for Content-Type header values.
 *
 * @covers \dW\HTML5\Charset
 */
class TestCharset extends \PHPUnit\Framework\TestCase {
    /**
     * An encoding label should yield the expected decoder class, or null
     * when the input is not an encoding label.
     *
     * @dataProvider provideCharsets
     */
    public function testDetermineEncodingFromEncodingLabel(string $in, ?string $exp) {
        $actual = Charset::fromCharset($in);
        self::assertSame($exp, $actual);
    }

    /** Supplies pairs of encoding label and expected decoder class (or null). */
    public function provideCharsets() {
        yield ["UTF-8", UTF8::class];
        yield [" utf8 ", UTF8::class];
        yield ["ISO-8859-1", Windows1252::class];
        // a full Content-Type value is not an encoding label
        yield ["text/html; charset=utf8", null];
    }

    /**
     * A Content-Type value should yield the decoder class named by its
     * first charset parameter, or null when none can be determined.
     *
     * @dataProvider provideContentTypes
     */
    public function testDetermineEncodingFromContentType(string $in, ?string $exp) {
        $actual = Charset::fromTransport($in);
        self::assertSame($exp, $actual);
    }

    /** Supplies pairs of Content-Type value and expected decoder class (or null). */
    public function provideContentTypes() {
        // inputs without a usable "type/subtype;" prefix
        yield ["UTF-8", null];
        yield ["charset=utf8", null];
        yield ["text/html", null];
        yield ["text/html charset=utf8", null];
        // well-formed charset parameters
        yield ["text/html; charset=utf8", UTF8::class];
        yield ["text/html;charset=utf8", UTF8::class];
        yield ["text/html; charset=\"utf8\"", UTF8::class];
        yield ["image/svg+xml; param=value; charset=utf8", UTF8::class];
        // only the first charset parameter counts
        yield ["image/svg+xml; charset=utf8; charset=big5", UTF8::class];
        yield ["image/svg+xml; charset=utf8;charset=big5", UTF8::class];
        yield ["text/html; charset=not-valid; charset=big5", null];
        yield ["text/html; charset=not-valid", null];
    }
}

View file

@ -17,6 +17,7 @@
<testsuites>
<testsuite name="Tokenizer">
<file>cases/TestCharset.php</file>
<file>cases/TestTokenizer.php</file>
</testsuite>
</testsuites>

View file

@ -266,16 +266,16 @@
},
{
"name": "phpdocumentor/reflection-docblock",
"version": "4.3.2",
"version": "4.3.3",
"source": {
"type": "git",
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
"reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e"
"reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/b83ff7cfcfee7827e1e78b637a5904fe6a96698e",
"reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e",
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62",
"reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62",
"shasum": ""
},
"require": {
@ -313,7 +313,7 @@
}
],
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
"time": "2019-09-12T14:27:41+00:00"
"time": "2019-12-20T13:40:23+00:00"
},
{
"name": "phpdocumentor/type-resolver",

View file

@ -763,16 +763,16 @@
},
{
"name": "symfony/console",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/console.git",
"reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201"
"reference": "82437719dab1e6bdd28726af14cb345c2ec816d0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/console/zipball/f0aea3df20d15635b3cb9730ca5eea1c65b7f201",
"reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201",
"url": "https://api.github.com/repos/symfony/console/zipball/82437719dab1e6bdd28726af14cb345c2ec816d0",
"reference": "82437719dab1e6bdd28726af14cb345c2ec816d0",
"shasum": ""
},
"require": {
@ -835,11 +835,11 @@
],
"description": "Symfony Console Component",
"homepage": "https://symfony.com",
"time": "2019-12-01T10:06:17+00:00"
"time": "2019-12-17T10:32:23+00:00"
},
{
"name": "symfony/event-dispatcher",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/event-dispatcher.git",
@ -967,7 +967,7 @@
},
{
"name": "symfony/filesystem",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/filesystem.git",
@ -1017,7 +1017,7 @@
},
{
"name": "symfony/finder",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/finder.git",
@ -1241,16 +1241,16 @@
},
{
"name": "symfony/process",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/process.git",
"reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726"
"reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/process/zipball/51c0135ef3f44c5803b33dc60e96bf4f77752726",
"reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726",
"url": "https://api.github.com/repos/symfony/process/zipball/b84501ad50adb72a94fb460a5b5c91f693e99c9b",
"reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b",
"shasum": ""
},
"require": {
@ -1286,7 +1286,7 @@
],
"description": "Symfony Process Component",
"homepage": "https://symfony.com",
"time": "2019-11-28T13:33:56+00:00"
"time": "2019-12-06T10:06:46+00:00"
},
{
"name": "symfony/service-contracts",
@ -1348,16 +1348,16 @@
},
{
"name": "symfony/yaml",
"version": "v4.4.1",
"version": "v4.4.2",
"source": {
"type": "git",
"url": "https://github.com/symfony/yaml.git",
"reference": "76de473358fe802578a415d5bb43c296cf09d211"
"reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/yaml/zipball/76de473358fe802578a415d5bb43c296cf09d211",
"reference": "76de473358fe802578a415d5bb43c296cf09d211",
"url": "https://api.github.com/repos/symfony/yaml/zipball/a08832b974dd5fafe3085a66d41fe4c84bb2628c",
"reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c",
"shasum": ""
},
"require": {
@ -1403,7 +1403,7 @@
],
"description": "Symfony Yaml Component",
"homepage": "https://symfony.com",
"time": "2019-11-12T14:51:11+00:00"
"time": "2019-12-10T10:33:21+00:00"
}
],
"packages-dev": [],