Start on character encoding detection
This commit is contained in:
parent
318d7bd7ad
commit
49f31015ac
7 changed files with 177 additions and 32 deletions
6
composer.lock
generated
6
composer.lock
generated
|
@ -8,11 +8,11 @@
|
||||||
"packages": [
|
"packages": [
|
||||||
{
|
{
|
||||||
"name": "mensbeam/intl",
|
"name": "mensbeam/intl",
|
||||||
"version": "0.6.0",
|
"version": "0.7.0",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://code.mensbeam.com/MensBeam/intl",
|
"url": "https://code.mensbeam.com/MensBeam/intl",
|
||||||
"reference": "200a310f72b086a63b84c4ca2af3b402852e3cce"
|
"reference": "c4a2ae17142a1846c841c5b5b9246e67d56c568e"
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
"php": "^7.0"
|
"php": "^7.0"
|
||||||
|
@ -49,7 +49,7 @@
|
||||||
"utf-8",
|
"utf-8",
|
||||||
"utf8"
|
"utf8"
|
||||||
],
|
],
|
||||||
"time": "2019-12-18T19:57:54+00:00"
|
"time": "2019-12-21T01:56:59+00:00"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"packages-dev": [
|
"packages-dev": [
|
||||||
|
|
86
lib/Charset.php
Normal file
86
lib/Charset.php
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
<?php
|
||||||
|
declare(strict_types=1);
|
||||||
|
namespace dW\HTML5;
|
||||||
|
|
||||||
|
use MensBeam\Intl\Encoding;
|
||||||
|
|
||||||
|
/** Helpers which determine the decoder class to use for a document, based on
 * an encoding label or on the value of an HTTP Content-Type header-field.
 */
abstract class Charset {
    /** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
     *
     * @param string $value The encoding label to match
     * @return string|null The class name of a compatible decoder, or null if the label was not matched
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        if ($encoding) {
            return $encoding['class'];
        }
        return null;
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
     * and returns the class name of a compatible decoder.
     *
     * Only the first "charset" parameter is considered: if its value is not
     * a recognized encoding label, null is returned even if a later "charset"
     * parameter would have matched.
     *
     * @param string $contentType The value of a Content-Type header-field
     * @return string|null The class name of a compatible decoder, or null if no usable charset parameter was found
     */
    public static function fromTransport(string $contentType): ?string {
        // Try to sniff out a charset from a Content-Type header-field.
        // This does cut some corners, but should be sufficient for practical use

        // lowercase the input and collapse whitespace runs, so that at most
        //   one space character ever needs to be skipped at a time
        $s = preg_replace("/\s+/", " ", strtolower($contentType));
        $pos = 0;
        $end = strlen($s);
        // skip the type
        while ($pos < $end && $s[$pos++] !== "/");
        // skip the subtype
        while ($pos < $end && $s[$pos++] !== ";");
        // check parameters in sequence
        while ($pos < $end) {
            // skip any leading whitespace
            if ($s[$pos] === " ") {
                $pos++;
            }
            // collect characters for the parameter name
            $param = "";
            while ($pos < $end && $s[$pos] !== "=") {
                $param .= $s[$pos++];
            }
            // skip the equals sign
            $pos++;
            $value = "";
            // the input may end right after the name or the equals sign, so
            //   every read from here on must be checked against the end
            if ($pos < $end && $s[$pos] === '"') {
                // Value is a quoted-string
                $pos++;
                $terminator = "";
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === '"') {
                        $terminator = $c;
                        break;
                    } elseif ($c === "\\") {
                        // a backslash escapes the following character, if there is one
                        if ($pos < $end) {
                            $value .= $s[$pos++];
                        }
                    } else {
                        $value .= $c;
                    }
                }
                // only interpret the value if a closing quotation mark was seen
                if ($terminator !== '"') {
                    $value = "";
                }
            } else {
                // Value is a bare token, ended by a semicolon, a space, or the end of input
                $terminator = "";
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === ";" || $c === " ") {
                        $terminator = $c;
                        break;
                    }
                    $value .= $c;
                }
            }
            // if the parameter was the character set, interpret its value and return
            if ($param === "charset") {
                return self::fromCharset($value);
            }
            // step over the separator before the next parameter unless the value
            //   already ended on it; otherwise a semicolon left behind (e.g. after
            //   a quoted-string) would become part of the next parameter's name
            if ($terminator !== ";") {
                if ($pos < $end && $s[$pos] === " ") {
                    $pos++;
                }
                if ($pos < $end && $s[$pos] === ";") {
                    $pos++;
                }
            }
        }
        return null;
    }

    /** Placeholder for determining an encoding by examining the document's own bytes.
     *
     * Not implemented yet: currently always returns null.
     *
     * @param string $data The document data
     * @return string|null Always null at present
     */
    public static function fromPrescan(string $data): ?string {
        return null;
    }
}
|
22
lib/Data.php
22
lib/Data.php
|
@ -2,11 +2,14 @@
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
namespace dW\HTML5;
|
namespace dW\HTML5;
|
||||||
|
|
||||||
|
use MensBeam\Intl\Encoding;
|
||||||
|
|
||||||
class Data {
|
class Data {
|
||||||
use ParseErrorEmitter;
|
use ParseErrorEmitter;
|
||||||
|
|
||||||
// Used to get the file path for error reporting.
|
// Used to get the file path for error reporting.
|
||||||
public $filePath;
|
public $filePath;
|
||||||
|
public $encodingCertain = false;
|
||||||
|
|
||||||
// Internal storage for the Intl data object.
|
// Internal storage for the Intl data object.
|
||||||
protected $data;
|
protected $data;
|
||||||
|
@ -31,7 +34,7 @@ class Data {
|
||||||
const WHITESPACE = "\t\n\x0c\x0d ";
|
const WHITESPACE = "\t\n\x0c\x0d ";
|
||||||
|
|
||||||
|
|
||||||
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null) {
|
public function __construct(string $data, string $filePath = 'STDIN', ParseError $errorHandler = null, string $encodingOrContentType = '') {
|
||||||
$this->errorHandler = $errorHandler ?? new ParseError;
|
$this->errorHandler = $errorHandler ?? new ParseError;
|
||||||
if ($filePath !== 'STDIN') {
|
if ($filePath !== 'STDIN') {
|
||||||
$this->filePath = realpath($filePath);
|
$this->filePath = realpath($filePath);
|
||||||
|
@ -40,13 +43,20 @@ class Data {
|
||||||
$this->filePath = $filePath;
|
$this->filePath = $filePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
// DEVIATION: The spec has steps for parsing and determining the character
|
if ($encoding = Charset::fromCharset($encodingOrContentType)) {
|
||||||
// encoding. At this moment this implementation won't determine a character
|
$this->encodingCertain = true;
|
||||||
// encoding and will just assume UTF-8.
|
} elseif ($encoding = Charset::fromTransport($encodingOrContentType)) {
|
||||||
|
$this->encodingCertain = true;
|
||||||
$this->data = new \MensBeam\Intl\Encoding\UTF8($data, false, true);
|
} elseif ($encoding = Charset::fromPrescan($data)) {
|
||||||
|
// Encoding is tentative
|
||||||
|
} else {
|
||||||
|
// Encoding is tentative; fall back to windows 1252
|
||||||
|
$encoding = \MensBeam\Intl\Encoding\Windows1252::class;
|
||||||
|
}
|
||||||
|
$this->data = new $encoding($data, false, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public function consume(int $length = 1, $advancePointer = true): string {
|
public function consume(int $length = 1, $advancePointer = true): string {
|
||||||
assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));
|
assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));
|
||||||
|
|
||||||
|
|
48
tests/cases/TestCharset.php
Normal file
48
tests/cases/TestCharset.php
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
<?php
|
||||||
|
declare(strict_types=1);
|
||||||
|
namespace dW\HTML5\TestCase;
|
||||||
|
|
||||||
|
use dW\HTML5\Charset;
|
||||||
|
use MensBeam\Intl\Encoding\UTF8;
|
||||||
|
use MensBeam\Intl\Encoding\Windows1252;
|
||||||
|
|
||||||
|
/**
 * Exercises the encoding look-ups offered by the Charset class.
 *
 * @covers \dW\HTML5\Charset
 */
class TestCharset extends \PHPUnit\Framework\TestCase {
    /**
     * A bare encoding label should resolve to the expected decoder class,
     * and anything which is not a label should resolve to null.
     *
     * @dataProvider provideCharsets
     */
    public function testDetermineEncodingFromEncodingLabel(string $in, ?string $exp) {
        $actual = Charset::fromCharset($in);
        self::assertSame($exp, $actual);
    }

    /** Supplies pairs of encoding label and expected decoder class (or null). */
    public function provideCharsets() {
        yield ['UTF-8', UTF8::class];
        yield [' utf8 ', UTF8::class];
        yield ['ISO-8859-1', Windows1252::class];
        yield ['text/html; charset=utf8', null];
    }

    /**
     * A Content-Type header-field value should resolve to the decoder class
     * named by its first charset parameter, or to null when there is none.
     *
     * @dataProvider provideContentTypes
     */
    public function testDetermineEncodingFromContentType(string $in, ?string $exp) {
        $actual = Charset::fromTransport($in);
        self::assertSame($exp, $actual);
    }

    /** Supplies pairs of Content-Type value and expected decoder class (or null). */
    public function provideContentTypes() {
        yield ['UTF-8', null];
        yield ['charset=utf8', null];
        yield ['text/html', null];
        yield ['text/html charset=utf8', null];
        yield ['text/html; charset=utf8', UTF8::class];
        yield ['text/html;charset=utf8', UTF8::class];
        yield ['text/html; charset="utf8"', UTF8::class];
        yield ['image/svg+xml; param=value; charset=utf8', UTF8::class];
        yield ['image/svg+xml; charset=utf8; charset=big5', UTF8::class];
        yield ['image/svg+xml; charset=utf8;charset=big5', UTF8::class];
        yield ['text/html; charset=not-valid; charset=big5', null];
        yield ['text/html; charset=not-valid', null];
    }
}
|
|
@ -17,6 +17,7 @@
|
||||||
|
|
||||||
<testsuites>
|
<testsuites>
|
||||||
<testsuite name="Tokenizer">
|
<testsuite name="Tokenizer">
|
||||||
|
<file>cases/TestCharset.php</file>
|
||||||
<file>cases/TestTokenizer.php</file>
|
<file>cases/TestTokenizer.php</file>
|
||||||
</testsuite>
|
</testsuite>
|
||||||
</testsuites>
|
</testsuites>
|
||||||
|
|
10
vendor-bin/phpunit/composer.lock
generated
10
vendor-bin/phpunit/composer.lock
generated
|
@ -266,16 +266,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "phpdocumentor/reflection-docblock",
|
"name": "phpdocumentor/reflection-docblock",
|
||||||
"version": "4.3.2",
|
"version": "4.3.3",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
|
"url": "https://github.com/phpDocumentor/ReflectionDocBlock.git",
|
||||||
"reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e"
|
"reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/b83ff7cfcfee7827e1e78b637a5904fe6a96698e",
|
"url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62",
|
||||||
"reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e",
|
"reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
@ -313,7 +313,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
|
"description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.",
|
||||||
"time": "2019-09-12T14:27:41+00:00"
|
"time": "2019-12-20T13:40:23+00:00"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "phpdocumentor/type-resolver",
|
"name": "phpdocumentor/type-resolver",
|
||||||
|
|
36
vendor-bin/robo/composer.lock
generated
36
vendor-bin/robo/composer.lock
generated
|
@ -763,16 +763,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/console",
|
"name": "symfony/console",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/console.git",
|
"url": "https://github.com/symfony/console.git",
|
||||||
"reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201"
|
"reference": "82437719dab1e6bdd28726af14cb345c2ec816d0"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/symfony/console/zipball/f0aea3df20d15635b3cb9730ca5eea1c65b7f201",
|
"url": "https://api.github.com/repos/symfony/console/zipball/82437719dab1e6bdd28726af14cb345c2ec816d0",
|
||||||
"reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201",
|
"reference": "82437719dab1e6bdd28726af14cb345c2ec816d0",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
@ -835,11 +835,11 @@
|
||||||
],
|
],
|
||||||
"description": "Symfony Console Component",
|
"description": "Symfony Console Component",
|
||||||
"homepage": "https://symfony.com",
|
"homepage": "https://symfony.com",
|
||||||
"time": "2019-12-01T10:06:17+00:00"
|
"time": "2019-12-17T10:32:23+00:00"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/event-dispatcher",
|
"name": "symfony/event-dispatcher",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/event-dispatcher.git",
|
"url": "https://github.com/symfony/event-dispatcher.git",
|
||||||
|
@ -967,7 +967,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/filesystem",
|
"name": "symfony/filesystem",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/filesystem.git",
|
"url": "https://github.com/symfony/filesystem.git",
|
||||||
|
@ -1017,7 +1017,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/finder",
|
"name": "symfony/finder",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/finder.git",
|
"url": "https://github.com/symfony/finder.git",
|
||||||
|
@ -1241,16 +1241,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/process",
|
"name": "symfony/process",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/process.git",
|
"url": "https://github.com/symfony/process.git",
|
||||||
"reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726"
|
"reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/symfony/process/zipball/51c0135ef3f44c5803b33dc60e96bf4f77752726",
|
"url": "https://api.github.com/repos/symfony/process/zipball/b84501ad50adb72a94fb460a5b5c91f693e99c9b",
|
||||||
"reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726",
|
"reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
@ -1286,7 +1286,7 @@
|
||||||
],
|
],
|
||||||
"description": "Symfony Process Component",
|
"description": "Symfony Process Component",
|
||||||
"homepage": "https://symfony.com",
|
"homepage": "https://symfony.com",
|
||||||
"time": "2019-11-28T13:33:56+00:00"
|
"time": "2019-12-06T10:06:46+00:00"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/service-contracts",
|
"name": "symfony/service-contracts",
|
||||||
|
@ -1348,16 +1348,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symfony/yaml",
|
"name": "symfony/yaml",
|
||||||
"version": "v4.4.1",
|
"version": "v4.4.2",
|
||||||
"source": {
|
"source": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/symfony/yaml.git",
|
"url": "https://github.com/symfony/yaml.git",
|
||||||
"reference": "76de473358fe802578a415d5bb43c296cf09d211"
|
"reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c"
|
||||||
},
|
},
|
||||||
"dist": {
|
"dist": {
|
||||||
"type": "zip",
|
"type": "zip",
|
||||||
"url": "https://api.github.com/repos/symfony/yaml/zipball/76de473358fe802578a415d5bb43c296cf09d211",
|
"url": "https://api.github.com/repos/symfony/yaml/zipball/a08832b974dd5fafe3085a66d41fe4c84bb2628c",
|
||||||
"reference": "76de473358fe802578a415d5bb43c296cf09d211",
|
"reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c",
|
||||||
"shasum": ""
|
"shasum": ""
|
||||||
},
|
},
|
||||||
"require": {
|
"require": {
|
||||||
|
@ -1403,7 +1403,7 @@
|
||||||
],
|
],
|
||||||
"description": "Symfony Yaml Component",
|
"description": "Symfony Yaml Component",
|
||||||
"homepage": "https://symfony.com",
|
"homepage": "https://symfony.com",
|
||||||
"time": "2019-11-12T14:51:11+00:00"
|
"time": "2019-12-10T10:33:21+00:00"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"packages-dev": [],
|
"packages-dev": [],
|
||||||
|
|
Loading…
Reference in a new issue