From 49f31015ac6a088261d8730f4a8557e342a059ce Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 21 Dec 2019 14:53:51 -0500 Subject: [PATCH] Start on character encoding detection --- composer.lock | 6 +-- lib/Charset.php | 86 ++++++++++++++++++++++++++++++++ lib/Data.php | 22 +++++--- tests/cases/TestCharset.php | 48 ++++++++++++++++++ tests/phpunit.dist.xml | 1 + vendor-bin/phpunit/composer.lock | 10 ++-- vendor-bin/robo/composer.lock | 36 ++++++------- 7 files changed, 177 insertions(+), 32 deletions(-) create mode 100644 lib/Charset.php create mode 100644 tests/cases/TestCharset.php diff --git a/composer.lock b/composer.lock index 44fdaac..33bd1eb 100644 --- a/composer.lock +++ b/composer.lock @@ -8,11 +8,11 @@ "packages": [ { "name": "mensbeam/intl", - "version": "0.6.0", + "version": "0.7.0", "source": { "type": "git", "url": "https://code.mensbeam.com/MensBeam/intl", - "reference": "200a310f72b086a63b84c4ca2af3b402852e3cce" + "reference": "c4a2ae17142a1846c841c5b5b9246e67d56c568e" }, "require": { "php": "^7.0" @@ -49,7 +49,7 @@ "utf-8", "utf8" ], - "time": "2019-12-18T19:57:54+00:00" + "time": "2019-12-21T01:56:59+00:00" } ], "packages-dev": [ diff --git a/lib/Charset.php b/lib/Charset.php new file mode 100644 index 0000000..0ca26ad --- /dev/null +++ b/lib/Charset.php @@ -0,0 +1,86 @@ +errorHandler = $errorHandler ?? new ParseError; if ($filePath !== 'STDIN') { $this->filePath = realpath($filePath); @@ -40,13 +43,20 @@ class Data { $this->filePath = $filePath; } - // DEVIATION: The spec has steps for parsing and determining the character - // encoding. At this moment this implementation won't determine a character - // encoding and will just assume UTF-8. - - $this->data = new \MensBeam\Intl\Encoding\UTF8($data, false, true); + if ($encoding = Charset::fromCharset($encodingOrContentType)) { + $this->encodingCertain = true; + } elseif ($encoding = Charset::fromTransport($encodingOrContentType)) { + $this->encodingCertain = true; + } elseif ($encoding = Charset::fromPrescan($data)) { + // Encoding is tentative + } else { + // Encoding is tentative; fall back to windows 1252 + $encoding = \MensBeam\Intl\Encoding\Windows1252::class; + } + $this->data = new $encoding($data, false, true); } + public function consume(int $length = 1, $advancePointer = true): string { assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length)); diff --git a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php new file mode 100644 index 0000000..3aa9d4f --- /dev/null +++ b/tests/cases/TestCharset.php @@ -0,0 +1,48 @@ +assertSame($exp, Charset::fromCharset($in)); + } + + public function provideCharsets() { + return [ + ["UTF-8", UTF8::class], + [" utf8 ", UTF8::class], + ["ISO-8859-1", Windows1252::class], + ["text/html; charset=utf8", null], + ]; + } + + /** @dataProvider provideContentTypes */ + public function testDetermineEncodingFromContentType(string $in, ?string $exp) { + $this->assertSame($exp, Charset::fromTransport($in)); + } + + public function provideContentTypes() { + return [ + ["UTF-8", null], + ["charset=utf8", null], + ["text/html", null], + ["text/html charset=utf8", null], + ["text/html; charset=utf8", UTF8::class], + ["text/html;charset=utf8", UTF8::class], + ["text/html; charset=\"utf8\"", UTF8::class], + ["image/svg+xml; param=value; charset=utf8", UTF8::class], + ["image/svg+xml; charset=utf8; charset=big5", UTF8::class], + ["image/svg+xml; charset=utf8;charset=big5", UTF8::class], + ["text/html; charset=not-valid; charset=big5", null], + ["text/html; charset=not-valid", null], + ]; + } +} diff --git a/tests/phpunit.dist.xml b/tests/phpunit.dist.xml index 74352ed..4befd79 100644 --- a/tests/phpunit.dist.xml +++ b/tests/phpunit.dist.xml @@ -17,6 +17,7 @@ + cases/TestCharset.php cases/TestTokenizer.php diff --git a/vendor-bin/phpunit/composer.lock b/vendor-bin/phpunit/composer.lock index e45897e..25d6484 100644 --- a/vendor-bin/phpunit/composer.lock +++ b/vendor-bin/phpunit/composer.lock @@ -266,16 +266,16 @@ }, { "name": "phpdocumentor/reflection-docblock", - "version": "4.3.2", + "version": "4.3.3", "source": { "type": "git", "url": "https://github.com/phpDocumentor/ReflectionDocBlock.git", - "reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e" + "reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/b83ff7cfcfee7827e1e78b637a5904fe6a96698e", - "reference": "b83ff7cfcfee7827e1e78b637a5904fe6a96698e", + "url": "https://api.github.com/repos/phpDocumentor/ReflectionDocBlock/zipball/2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62", + "reference": "2ecaa9fef01634c83bfa8dc1fe35fb5cef223a62", "shasum": "" }, "require": { @@ -313,7 +313,7 @@ } ], "description": "With this component, a library can provide support for annotations via DocBlocks or otherwise retrieve information that is embedded in a DocBlock.", - "time": "2019-09-12T14:27:41+00:00" + "time": "2019-12-20T13:40:23+00:00" }, { "name": "phpdocumentor/type-resolver", diff --git a/vendor-bin/robo/composer.lock b/vendor-bin/robo/composer.lock index 00ebcfe..d87acb6 100644 --- a/vendor-bin/robo/composer.lock +++ b/vendor-bin/robo/composer.lock @@ -763,16 +763,16 @@ }, { "name": "symfony/console", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/console.git", - "reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201" + "reference": "82437719dab1e6bdd28726af14cb345c2ec816d0" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/console/zipball/f0aea3df20d15635b3cb9730ca5eea1c65b7f201", - "reference": "f0aea3df20d15635b3cb9730ca5eea1c65b7f201", + "url": "https://api.github.com/repos/symfony/console/zipball/82437719dab1e6bdd28726af14cb345c2ec816d0", + "reference": "82437719dab1e6bdd28726af14cb345c2ec816d0", "shasum": "" }, "require": { @@ -835,11 +835,11 @@ ], "description": "Symfony Console Component", "homepage": "https://symfony.com", - "time": "2019-12-01T10:06:17+00:00" + "time": "2019-12-17T10:32:23+00:00" }, { "name": "symfony/event-dispatcher", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/event-dispatcher.git", @@ -967,7 +967,7 @@ }, { "name": "symfony/filesystem", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/filesystem.git", @@ -1017,7 +1017,7 @@ }, { "name": "symfony/finder", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/finder.git", @@ -1241,16 +1241,16 @@ }, { "name": "symfony/process", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/process.git", - "reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726" + "reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/process/zipball/51c0135ef3f44c5803b33dc60e96bf4f77752726", - "reference": "51c0135ef3f44c5803b33dc60e96bf4f77752726", + "url": "https://api.github.com/repos/symfony/process/zipball/b84501ad50adb72a94fb460a5b5c91f693e99c9b", + "reference": "b84501ad50adb72a94fb460a5b5c91f693e99c9b", "shasum": "" }, "require": { @@ -1286,7 +1286,7 @@ ], "description": "Symfony Process Component", "homepage": "https://symfony.com", - "time": "2019-11-28T13:33:56+00:00" + "time": "2019-12-06T10:06:46+00:00" }, { "name": "symfony/service-contracts", @@ -1348,16 +1348,16 @@ }, { "name": "symfony/yaml", - "version": "v4.4.1", + "version": "v4.4.2", "source": { "type": "git", "url": "https://github.com/symfony/yaml.git", - "reference": "76de473358fe802578a415d5bb43c296cf09d211" + "reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/symfony/yaml/zipball/76de473358fe802578a415d5bb43c296cf09d211", - "reference": "76de473358fe802578a415d5bb43c296cf09d211", + "url": "https://api.github.com/repos/symfony/yaml/zipball/a08832b974dd5fafe3085a66d41fe4c84bb2628c", + "reference": "a08832b974dd5fafe3085a66d41fe4c84bb2628c", "shasum": "" }, "require": { @@ -1403,7 +1403,7 @@ ], "description": "Symfony Yaml Component", "homepage": "https://symfony.com", - "time": "2019-11-12T14:51:11+00:00" + "time": "2019-12-10T10:33:21+00:00" } ], "packages-dev": [],