Browse Source

Merge branch 'master' into multi-byte

multi-byte
J. King 4 years ago
parent
commit
f49d632642
  1. 1
      .gitignore
  2. 33
      CHANGELOG
  3. 86
      RoboFile.php
  4. 14
      composer.lock
  5. 36
      lib/Encoding.php
  6. 22
      lib/Encoding/Big5.php
  7. 25
      lib/Encoding/EUCKR.php
  8. 30
      lib/Encoding/Encoding.php
  9. 27
      lib/Encoding/GBCommon.php
  10. 55
      lib/Encoding/GenericEncoding.php
  11. 33
      lib/Encoding/SingleByteEncoding.php
  12. 27
      lib/Encoding/UTF16.php
  13. 16
      lib/Encoding/UTF8.php
  14. 7
      lib/Encoding/XUserDefined.php
  15. 16
      robo
  16. 6
      tests/bootstrap.php
  17. 12
      tests/cases/Encoding/TestBig5.php
  18. 12
      tests/cases/Encoding/TestEUCKR.php
  19. 14
      tests/cases/Encoding/TestGB18030.php
  20. 13
      tests/cases/Encoding/TestSingleByte.php
  21. 8
      tests/cases/Encoding/TestUTF16BE.php
  22. 21
      tests/cases/Encoding/TestUTF16LE.php
  23. 19
      tests/cases/Encoding/TestUTF8.php
  24. 12
      tests/cases/Encoding/TestXUserDefined.php
  25. 56
      tests/cases/TestEncoding.php
  26. 23
      tests/lib/DecoderTest.php
  27. 3
      tests/phpunit.xml
  28. 3
      tools/mkindex.php
  29. 40
      tools/mklabels.php
  30. 633
      vendor-bin/csfixer/composer.lock
  31. 2
      vendor-bin/phpunit/composer.json
  32. 590
      vendor-bin/phpunit/composer.lock
  33. 676
      vendor-bin/robo/composer.lock

1
.gitignore

@ -1,5 +1,6 @@
vendor/ vendor/
tests/coverage/ tests/coverage/
/tests/.phpunit.result.cache
perf/docs/ perf/docs/
.php_cs.cache .php_cs.cache

33
CHANGELOG

@ -1,3 +1,36 @@
Version 0.7.0 (2019-12-20)
==========================
New features:
- Added \MensBeam\Intl\Encoding abstract class with createDecoder() and
matchLabel() static methods
Version 0.6.0 (2019-12-18)
==========================
New features:
- Added $allowSurrogates parameter to Encoding constructor
- Added posErr public instance property to Encoding
Version 0.5.0 (2019-12-13)
==========================
Breaking changes:
- Rename Encoding::len() to Encoding::lenChar()
New features:
- Add Encoding::lenByte() method
- Add Encoding::eof() method
Version 0.4.0 (2018-09-15)
==========================
New features:
- Implementation of UTF-16 encoding
- Implementation of Big5 encoding
- Implementation of EUC-KR encoding
- Implementation of x-user-defined encoding
Version 0.3.0 (2018-08-29) Version 0.3.0 (2018-08-29)
========================== ==========================

86
RoboFile.php

@ -3,12 +3,21 @@ declare(strict_types=1);
use Robo\Result; use Robo\Result;
class RoboFile extends \Robo\Tasks { const BASE = __DIR__.\DIRECTORY_SEPARATOR;
const BASE = __DIR__.\DIRECTORY_SEPARATOR; const BASE_TEST = BASE."tests".\DIRECTORY_SEPARATOR;
const BASE_TEST = self::BASE."tests".\DIRECTORY_SEPARATOR; define("IS_WIN", defined("PHP_WINDOWS_VERSION_MAJOR"));
define("IS_MAC", php_uname("s") === "Darwin");
/** function norm(string $path): string {
* Runs the typical test suite $out = realpath($path);
if (!$out) {
$out = str_replace(["/", "\\"], \DIRECTORY_SEPARATOR, $path);
}
return $out;
}
class RoboFile extends \Robo\Tasks {
/** Runs the typical test suite
* *
* Arguments passed to the task are passed on to PHPUnit. Thus one may, for * Arguments passed to the task are passed on to PHPUnit. Thus one may, for
* example, run the following command and get the expected results: * example, run the following command and get the expected results:
@ -18,17 +27,16 @@ class RoboFile extends \Robo\Tasks {
* Please see the PHPUnit documentation for available options. * Please see the PHPUnit documentation for available options.
*/ */
public function test(array $args): Result { public function test(array $args): Result {
return $this->runTests("php", "typical", $args); return $this->runTests(escapeshellarg(\PHP_BINARY), "typical", $args);
} }
/** /** Runs the full test suite
* Runs the full test suite
* *
* This includes pedantic tests which may help to identify problems. * This includes pedantic tests which may help to identify problems.
* See help for the "test" task for more details. * See help for the "test" task for more details.
*/ */
public function testFull(array $args): Result { public function testFull(array $args): Result {
return $this->runTests("php", "full", $args); return $this->runTests(escapeshellarg(\PHP_BINARY), "full", $args);
} }
/** /**
@ -37,7 +45,7 @@ class RoboFile extends \Robo\Tasks {
* See help for the "test" task for more details. * See help for the "test" task for more details.
*/ */
public function testQuick(array $args): Result { public function testQuick(array $args): Result {
return $this->runTests("php", "quick", $args); return $this->runTests(escapeshellarg(\PHP_BINARY), "quick", $args);
} }
/** Produces a code coverage report /** Produces a code coverage report
@ -53,37 +61,56 @@ class RoboFile extends \Robo\Tasks {
public function coverage(array $args): Result { public function coverage(array $args): Result {
// run tests with code coverage reporting enabled // run tests with code coverage reporting enabled
$exec = $this->findCoverageEngine(); $exec = $this->findCoverageEngine();
return $this->runTests($exec, "typical", array_merge(["--coverage-html", self::BASE_TEST."coverage"], $args)); return $this->runTests($exec, "coverage", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
} }
/** Runs a performance evaluation. /** Produces a code coverage report, with redundant tests
* *
* The performance of the library's basic functionality is tested against * Depending on the environment, some tests that normally provide
* the IntlCodePointBreakIterator class * coverage may be skipped, while working alternatives are normally
* suppressed for reasons of time. This coverage report will try to
* run all tests which may cover code.
*
* See also help for the "coverage" task for more details.
*/ */
public function perf(array $args): Result { public function coverageFull(array $args): Result {
$execpath = realpath(self::BASE."perf/perf.php"); // run tests with code coverage reporting enabled
return $this->taskExec("php")->arg($execpath)->args($args)->run(); $exec = $this->findCoverageEngine();
return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
} }
/** Runs the coding standards fixer */ /** Runs the coding standards fixer */
public function clean($opts = ['demo|d' => false]): Result { public function clean($opts = ['demo|d' => false]): Result {
$t = $this->taskExec(realpath(self::BASE."vendor/bin/php-cs-fixer")); $t = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
$t->arg("fix")->arg("--allow-risky=yes"); $t->arg("fix");
if ($opts['demo']) { if ($opts['demo']) {
$t->args("--dry-run", "--diff")->option("--diff-format", "udiff"); $t->args("--dry-run", "--diff")->option("--diff-format", "udiff");
} }
return $t->run(); return $t->run();
} }
/** Runs a performance evaluation.
*
* The performance of the library's basic functionality is tested against
* the IntlCodePointBreakIterator class
*/
public function perf(array $args): Result {
$execpath = realpath(norm(BASE."perf/perf.php"));
return $this->taskExec("php")->arg($execpath)->args($args)->run();
}
protected function findCoverageEngine(): string { protected function findCoverageEngine(): string {
$null = null; if (IS_WIN) {
$code = 0; $dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
exec("phpdbg --version", $null, $code); $dbg = file_exists($dbg) ? $dbg : "";
if (!$code) {
return "phpdbg -qrr";
} else { } else {
return "php"; $dbg = trim(`which phpdbg 2>/dev/null`);
}
if ($dbg) {
return escapeshellarg($dbg)." -qrr";
} else {
$ext = IS_WIN ? "dll" : (IS_MAC ? "dylib" : "so");
return escapeshellarg(\PHP_BINARY)." -d zend_extension=xdebug.$ext";
} }
} }
@ -95,14 +122,17 @@ class RoboFile extends \Robo\Tasks {
case "quick": case "quick":
$set = ["--exclude-group", "optional,slow"]; $set = ["--exclude-group", "optional,slow"];
break; break;
case "coverage":
$set = ["--exclude-group", "optional,coverageOptional"];
break;
case "full": case "full":
$set = []; $set = [];
break; break;
default: default:
throw new \Exception; throw new \Exception;
} }
$execpath = realpath(self::BASE."vendor-bin/phpunit/vendor/phpunit/phpunit/phpunit"); $execpath = norm(BASE."vendor-bin/phpunit/vendor/phpunit/phpunit/phpunit");
$confpath = realpath(self::BASE_TEST."phpunit.xml"); $confpath = realpath(BASE_TEST."phpunit.dist.xml") ?: norm(BASE_TEST."phpunit.xml");
return $this->taskExec($executor)->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run(); return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
} }
} }

14
composer.lock

@ -1,7 +1,7 @@
{ {
"_readme": [ "_readme": [
"This file locks the dependencies of your project to a known state", "This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "ba27aa72527421b04188393db2c8510b", "content-hash": "ba27aa72527421b04188393db2c8510b",
@ -9,16 +9,16 @@
"packages-dev": [ "packages-dev": [
{ {
"name": "bamarni/composer-bin-plugin", "name": "bamarni/composer-bin-plugin",
"version": "v1.2.0", "version": "v1.3.0",
"source": { "source": {
"type": "git", "type": "git",
"url": "https://github.com/bamarni/composer-bin-plugin.git", "url": "https://github.com/bamarni/composer-bin-plugin.git",
"reference": "62fef740245a85f00665e81ea8f0aa0b72afe6e7" "reference": "67f9d314dc7ecf7245b8637906e151ccc62b8d24"
}, },
"dist": { "dist": {
"type": "zip", "type": "zip",
"url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/62fef740245a85f00665e81ea8f0aa0b72afe6e7", "url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/67f9d314dc7ecf7245b8637906e151ccc62b8d24",
"reference": "62fef740245a85f00665e81ea8f0aa0b72afe6e7", "reference": "67f9d314dc7ecf7245b8637906e151ccc62b8d24",
"shasum": "" "shasum": ""
}, },
"require": { "require": {
@ -26,7 +26,7 @@
}, },
"require-dev": { "require-dev": {
"composer/composer": "dev-master", "composer/composer": "dev-master",
"symfony/console": "^2.5 || ^3.0" "symfony/console": "^2.5 || ^3.0 || ^4.0"
}, },
"type": "composer-plugin", "type": "composer-plugin",
"extra": { "extra": {
@ -44,7 +44,7 @@
"license": [ "license": [
"MIT" "MIT"
], ],
"time": "2017-09-11T13:13:58+00:00" "time": "2019-03-17T12:38:04+00:00"
} }
], ],
"aliases": [], "aliases": [],

36
lib/Encoding.php

File diff suppressed because one or more lines are too long

22
lib/Encoding/Big5.php

File diff suppressed because one or more lines are too long

25
lib/Encoding/EUCKR.php

File diff suppressed because one or more lines are too long

30
lib/Encoding/Encoding.php

@ -19,10 +19,10 @@ interface Encoding {
const E_UNAVAILABLE_CODE_POINT = 4; const E_UNAVAILABLE_CODE_POINT = 4;
/** Constructs a new decoder /** Constructs a new decoder
* * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted * @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/ */
public function __construct(string $string, bool $fatal = false); public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
/** Returns the current byte position of the decoder */ /** Returns the current byte position of the decoder */
public function posByte(): int; public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
* *
* If the end of the string has been reached, false is returned * If the end of the string has been reached, false is returned
* *
* @return int|bool * @return int|false
*/ */
public function nextCode(); public function nextCode();
/** Advance $distance characters through the string /** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
* *
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
*/ */
public function seek(int $distance): int; public function seek(int $distance): int;
@ -58,17 +58,29 @@ interface Encoding {
*/ */
public function rewind(); public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */ /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The number of characters to retrieve
*/
public function peekChar(int $num = 1): string; public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer */ /** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The number of code points to retrieve
*/
public function peekCode(int $num = 1): array; public function peekCode(int $num = 1): array;
/** Calculates the length of the string in bytes */
public function lenByte(): int;
/** Calculates the length of the string in code points /** Calculates the length of the string in code points
* *
* Note that this may involve processing to the end of the string * Note that this may involve processing to the end of the string
*/ */
public function len(): int; public function lenChar(): int;
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool;
/** Generates an iterator which steps through each character in the string */ /** Generates an iterator which steps through each character in the string */
public function chars(): \Generator; public function chars(): \Generator;

27
lib/Encoding/GBCommon.php

@ -13,14 +13,6 @@ abstract class GBCommon implements StatelessEncoding {
const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576]; const TABLE_RANGES = 
[0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112]; const TABLE_OFFSETS = 
[128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$first = 0; $first = 0;
$second = 0; $second = 0;
@ -37,6 +29,7 @@ abstract class GBCommon implements StatelessEncoding {
$first = $b; $first = $b;
continue; continue;
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} elseif ($second === 0) { } elseif ($second === 0) {
@ -49,8 +42,10 @@ abstract class GBCommon implements StatelessEncoding {
$pointer = ($first - 0x81) * 190 + ($b - $offset); $pointer = ($first - 0x81) * 190 + ($b - $offset);
return self::TABLE_GBK[$pointer]; return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) { } elseif ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} }
@ -60,6 +55,7 @@ abstract class GBCommon implements StatelessEncoding {
continue; continue;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} else { } else {
@ -79,10 +75,12 @@ abstract class GBCommon implements StatelessEncoding {
if (isset($codePointOffset)) { if (isset($codePointOffset)) {
return $codePointOffset + $pointer - $offset; return $codePointOffset + $pointer - $offset;
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} else { } else {
$this->posByte -= 3; $this->posByte -= 3;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} }
@ -95,16 +93,11 @@ abstract class GBCommon implements StatelessEncoding {
} else { } else {
// dirty EOF; note how many bytes the last character had // dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -148,12 +141,6 @@ abstract class GBCommon implements StatelessEncoding {
/** Implements backward seeking $distance characters */ /** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int { protected function seekBack(int $distance): int {
if ($this->posByte == $this->lenByte && $this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$distance--;
$this->posChar--;
}
while ($distance > 0 && $this->posByte > 0) { while ($distance > 0 && $this->posByte > 0) {
$distance--; $distance--;
$this->posChar--; $this->posChar--;

55
lib/Encoding/GenericEncoding.php

@ -12,41 +12,32 @@ trait GenericEncoding {
protected $posChar = 0; protected $posChar = 0;
protected $lenByte = null; protected $lenByte = null;
protected $lenChar = null; protected $lenChar = null;
protected $dirtyEOF = 0;
protected $errMode = self::MODE_REPLACE; protected $errMode = self::MODE_REPLACE;
protected $allowSurrogates = false;
/** Constructs a new decoder public $posErr = 0;
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
*/
public function __construct(string $string, bool $fatal = false) {
$this->string = $string; $this->string = $string;
$this->lenByte = strlen($string); $this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates;
} }
/** Returns the current byte position of the decoder */
public function posByte(): int { public function posByte(): int {
return $this->posByte; return $this->posByte;
} }
/** Returns the current character position of the decoder */
public function posChar(): int { public function posChar(): int {
return $this->posChar; return $this->posChar;
} }
/** Seeks to the start of the string
*
* This is usually faster than using the seek method for the same purpose
*/
public function rewind() { public function rewind() {
$this->posByte = 0; $this->posByte = 0;
$this->posChar = 0; $this->posChar = 0;
} }
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posByte]; $b = @$this->string[$this->posByte];
@ -64,12 +55,6 @@ trait GenericEncoding {
} }
} }
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int { public function seek(int $distance): int {
if ($distance > 0) { if ($distance > 0) {
if ($this->posByte == strlen($this->string)) { if ($this->posByte == strlen($this->string)) {
@ -84,6 +69,13 @@ trait GenericEncoding {
if (!$this->posChar) { if (!$this->posChar) {
return $distance; return $distance;
} }
if ($this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$this->dirtyEOF = 0;
$distance--;
$this->posChar--;
}
$mode = $this->errMode; $mode = $this->errMode;
$this->errMode = self::MODE_NULL; $this->errMode = self::MODE_NULL;
$out = $this->seekBack($distance); $out = $this->seekBack($distance);
@ -94,7 +86,6 @@ trait GenericEncoding {
} }
} }
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
public function peekChar(int $num = 1): string { public function peekChar(int $num = 1): string {
$out = ""; $out = "";
$state = $this->stateSave(); $state = $this->stateSave();
@ -108,7 +99,6 @@ trait GenericEncoding {
return $out; return $out;
} }
/** Retrieves the next $num code points from the string, without advancing the character pointer */
public function peekCode(int $num = 1): array { public function peekCode(int $num = 1): array {
$out = []; $out = [];
$state = $this->stateSave(); $state = $this->stateSave();
@ -122,11 +112,11 @@ trait GenericEncoding {
return $out; return $out;
} }
/** Calculates the length of the string in code points public function lenByte(): int {
* return $this->lenByte;
* Note that this may involve processing to the end of the string }
*/
public function len(): int { public function lenChar(): int {
return $this->lenChar ?? (function() { return $this->lenChar ?? (function() {
$state = $this->stateSave(); $state = $this->stateSave();
while ($this->nextCode() !== false); while ($this->nextCode() !== false);
@ -136,14 +126,16 @@ trait GenericEncoding {
})(); })();
} }
/** Generates an iterator which steps through each character in the string */ public function eof(): bool {
return $this->posByte >= $this->lenByte;
}
public function chars(): \Generator { public function chars(): \Generator {
while (($c = $this->nextChar()) !== "") { while (($c = $this->nextChar()) !== "") {
yield ($this->posChar - 1) => $c; yield ($this->posChar - 1) => $c;
} }
} }
/** Generates an iterator which steps through each code point in the string */
public function codes(): \Generator { public function codes(): \Generator {
while (($c = $this->nextCode()) !== false) { while (($c = $this->nextCode()) !== false) {
yield ($this->posChar - 1) => $c; yield ($this->posChar - 1) => $c;
@ -155,6 +147,7 @@ trait GenericEncoding {
return [ return [
'posChar' => $this->posChar, 'posChar' => $this->posChar,
'posByte' => $this->posByte, 'posByte' => $this->posByte,
'posErr' => $this->posErr,
]; ];
} }
@ -181,7 +174,7 @@ trait GenericEncoding {
// fatal replacement mode for decoders // fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC: case self::MODE_FATAL_ENC:
// fatal replacement mode for decoders; not applicable to Unicode transformation formats // fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
default: default:
// indicative of internal bug; should never be triggered // indicative of internal bug; should never be triggered

33
lib/Encoding/SingleByteEncoding.php

@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
abstract class SingleByteEncoding implements StatelessEncoding { abstract class SingleByteEncoding implements StatelessEncoding {
use GenericEncoding; use GenericEncoding;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posChar]; $b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posChar]; $b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int { public function seek(int $distance): int {
if ($distance > 0) { if ($distance > 0) {
while ($this->posChar < $this->lenByte && $distance > 0) { while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,16 +70,15 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Returns the current byte position of the decoder */
public function posByte(): int { public function posByte(): int {
return $this->posChar; return $this->posChar;
} }
/** Calculates the length of the string in code points public function lenChar(): int {
*
* Note that this may involve processing to the end of the string
*/
public function len(): int {
return $this->lenByte; return $this->lenByte;
} }
public function eof(): bool {
return $this->posChar >= $this->lenByte;
}
} }

27
lib/Encoding/UTF16.php

@ -8,15 +8,7 @@ namespace MensBeam\Intl\Encoding;
abstract class UTF16 implements Encoding { abstract class UTF16 implements Encoding {
use GenericEncoding; use GenericEncoding;
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$lead_b = null; $lead_b = null;
$lead_s = null; $lead_s = null;
@ -36,6 +28,9 @@ abstract class UTF16 implements Encoding {
if (!is_null($lead_s)) { if (!is_null($lead_s)) {
if ($code >= 0xDC00 && $code <= 0xDFFF) { if ($code >= 0xDC00 && $code <= 0xDFFF) {
return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00); return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
} elseif ($this->allowSurrogates) {
$this->posByte -= 2;
return $lead_s;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
@ -45,7 +40,11 @@ abstract class UTF16 implements Encoding {
$lead_s = $code; $lead_s = $code;
continue; continue;
} elseif ($code >= 0xDC00 && $code <= 0xDFFF) { } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); if ($this->allowSurrogates) {
return $code;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
}
} else { } else {
return $code; return $code;
} }
@ -65,10 +64,6 @@ abstract class UTF16 implements Encoding {
} }
} }
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posByte]; $b = @$this->string[$this->posByte];
@ -83,12 +78,6 @@ abstract class UTF16 implements Encoding {
/** Implements backward seeking $distance characters */ /** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int { protected function seekBack(int $distance): int {
if ($this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$distance--;
$this->posChar--;
}
while ($distance > 0 && $this->posByte > 0) { while ($distance > 0 && $this->posByte > 0) {
$distance--; $distance--;
$this->posChar--; $this->posChar--;

16
lib/Encoding/UTF8.php

@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
const NAME = "UTF-8"; const NAME = "UTF-8";
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"]; const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// optimization for ASCII characters // optimization for ASCII characters
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
if ($b==0xE0) { if ($b==0xE0) {
$lower = 0xA0; $lower = 0xA0;
} elseif ($b==0xED) { } elseif ($b==0xED) {
$upper = 0x9F; $upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
} }
$point = $b & 0xF; $point = $b & 0xF;
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
} }
$point = $b & 0x7; $point = $b & 0x7;
} else { // invalid byte } else { // invalid byte
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]); return self::err($this->errMode, [$this->posChar, $this->posByte]);
} }
} elseif ($b < $lower || $b > $upper) { } elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]); return self::err($this->errMode, [$this->posChar, $this->posByte--]);
} else { } else {
$lower = 0x80; $lower = 0x80;
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
return $point; return $point;
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {

7
lib/Encoding/XUserDefined.php

@ -88,7 +88,12 @@ class XUserDefined implements Encoding {
* *
* Note that this may involve processing to the end of the string * Note that this may involve processing to the end of the string
*/ */
public function len(): int { public function lenChar(): int {
return $this->lenByte; return $this->lenByte;
} }
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool {
return $this->posChar >= $this->lenByte;
}
} }

16
robo

@ -1,10 +1,14 @@
#! /bin/sh #! /bin/sh
base=`dirname "$0"` base=`dirname "$0"`
roboCommand="$1" roboCommand="$1"
if [ $# -eq 0 ]; then
shift "$base/vendor/bin/robo"
if [ "$1" == "clean" ]; then
"$base/vendor/bin/robo" "$roboCommand" $*
else else
"$base/vendor/bin/robo" "$roboCommand" -- $* shift
fi ulimit -n 2048
if [ "$1" = "clean" ]; then
"$base/vendor/bin/robo" "$roboCommand" "$@"
else
"$base/vendor/bin/robo" "$roboCommand" -- "$@"
fi
fi

6
tests/bootstrap.php

@ -4,10 +4,14 @@
* See LICENSE and AUTHORS files for details */ * See LICENSE and AUTHORS files for details */
declare(strict_types=1); declare(strict_types=1);
namespace MensBeam\UTF8; namespace MensBeam\Intl;
const NS_BASE = __NAMESPACE__."\\"; const NS_BASE = __NAMESPACE__."\\";
define(NS_BASE."BASE", dirname(__DIR__).DIRECTORY_SEPARATOR); define(NS_BASE."BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
ini_set("memory_limit", "-1"); ini_set("memory_limit", "-1");
error_reporting(\E_ALL); error_reporting(\E_ALL);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php"; require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";
if (function_exists("xdebug_set_filter")) {
xdebug_set_filter(\XDEBUG_FILTER_CODE_COVERAGE, \XDEBUG_PATH_WHITELIST, [BASE."lib/"]);
}

12
tests/cases/Encoding/TestBig5.php

@ -76,6 +76,7 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\Big5::posChar * @covers MensBeam\Intl\Encoding\Big5::posChar
* @covers MensBeam\Intl\Encoding\Big5::posByte * @covers MensBeam\Intl\Encoding\Big5::posByte
* @covers MensBeam\Intl\Encoding\Big5::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\Big5::len * @covers MensBeam\Intl\Encoding\Big5::lenChar
* @covers MensBeam\Intl\Encoding\Big5::lenByte
* @covers MensBeam\Intl\Encoding\Big5::stateSave * @covers MensBeam\Intl\Encoding\Big5::stateSave
* @covers MensBeam\Intl\Encoding\Big5::stateApply * @covers MensBeam\Intl\Encoding\Big5::stateApply
*/ */
@ -126,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+0064 (HTML)' => [false, 0x64, "64"], 'U+0064 (HTML)' => [false, 0x64, "64"],

12
tests/cases/Encoding/TestEUCKR.php

@ -76,6 +76,7 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\EUCKR::posChar * @covers MensBeam\Intl\Encoding\EUCKR::posChar
* @covers MensBeam\Intl\Encoding\EUCKR::posByte * @covers MensBeam\Intl\Encoding\EUCKR::posByte
* @covers MensBeam\Intl\Encoding\EUCKR::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\EUCKR::len * @covers MensBeam\Intl\Encoding\EUCKR::lenChar
* @covers MensBeam\Intl\Encoding\EUCKR::lenByte
* @covers MensBeam\Intl\Encoding\EUCKR::stateSave * @covers MensBeam\Intl\Encoding\EUCKR::stateSave
* @covers MensBeam\Intl\Encoding\EUCKR::stateApply * @covers MensBeam\Intl\Encoding\EUCKR::stateApply
*/ */
@ -126,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+0064 (HTML)' => [false, 0x64, "64"], 'U+0064 (HTML)' => [false, 0x64, "64"],

14
tests/cases/Encoding/TestGB18030.php

@ -29,7 +29,7 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/* This string contains an invalid character sequence sandwiched between two null characters */ /* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00"; protected $brokenChar = "00 FF 00";
public function tearDown() { public function tearDown(): void {
$this->testedClass = GB18030::class; $this->testedClass = GB18030::class;
} }
@ -84,6 +84,7 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\GB18030::posChar * @covers MensBeam\Intl\Encoding\GB18030::posChar
* @covers MensBeam\Intl\Encoding\GB18030::posByte * @covers MensBeam\Intl\Encoding\GB18030::posByte
* @covers MensBeam\Intl\Encoding\GB18030::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -109,7 +110,8 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::len * @covers MensBeam\Intl\Encoding\GB18030::lenChar
* @covers MensBeam\Intl\Encoding\GB18030::lenByte
* @covers MensBeam\Intl\Encoding\GB18030::stateSave * @covers MensBeam\Intl\Encoding\GB18030::stateSave
* @covers MensBeam\Intl\Encoding\GB18030::stateApply * @covers MensBeam\Intl\Encoding\GB18030::stateApply
*/ */
@ -134,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
// bytes confirmed using Firefox // bytes confirmed using Firefox
$series_gb18030 = [ $series_gb18030 = [

13
tests/cases/Encoding/TestSingleByte.php

@ -139,6 +139,7 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
* @dataProvider provideClasses * @dataProvider provideClasses
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::posChar * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posChar
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::posByte * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posByte
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::eof
*/ */
public function testTraversePastTheEndOfAString(string $class = SingleByteEncoding::class) { public function testTraversePastTheEndOfAString(string $class = SingleByteEncoding::class) {
$this->testedClass = $class; $this->testedClass = $class;
@ -173,7 +174,8 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::len * @covers MensBeam\Intl\Encoding\SingleByteEncoding::lenChar
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::lenByte
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateSave * @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateSave
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateApply * @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateApply
*/ */
@ -203,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
$this->testedClass = $class;
return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
}
public function provideClasses() { public function provideClasses() {
foreach (self::$classes as $name => $class) { foreach (self::$classes as $name => $class) {
yield $name => [$class]; yield $name => [$class];

8
tests/cases/Encoding/TestUTF16BE.php

@ -6,7 +6,6 @@
declare(strict_types=1); declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding; namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE; use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16BE extends TestUTF16LE { class TestUTF16BE extends TestUTF16LE {
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {
public function provideStrings() { public function provideStrings() {
foreach (parent::provideStrings() as $name => $test) { foreach (parent::provideStrings() as $name => $test) {
list($string, $codes) = $test; if (sizeof($test) == 2) {
$test[] = null;
}
list($string, $codes, $altCodes) = $test;
$words = explode(" ", $string); $words = explode(" ", $string);
foreach ($words as $a => $word) { foreach ($words as $a => $word) {
if (strlen($word) == 4) { if (strlen($word) == 4) {
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
} }
} }
$string = implode(" ", $words); $string = implode(" ", $words);
yield $name => [$string, $codes]; yield $name => [$string, $codes, $altCodes];
} }
} }
} }

21
tests/cases/Encoding/TestUTF16LE.php

@ -7,7 +7,6 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding; namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE; use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
protected $testedClass = UTF16LE::class; protected $testedClass = UTF16LE::class;
@ -67,6 +66,7 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\UTF16::posChar * @covers MensBeam\Intl\Encoding\UTF16::posChar
* @covers MensBeam\Intl\Encoding\UTF16::posByte * @covers MensBeam\Intl\Encoding\UTF16::posByte
* @covers MensBeam\Intl\Encoding\UTF16::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -92,7 +92,8 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::len * @covers MensBeam\Intl\Encoding\UTF16::lenChar
* @covers MensBeam\Intl\Encoding\UTF16::lenByte
* @covers MensBeam\Intl\Encoding\UTF16::stateSave * @covers MensBeam\Intl\Encoding\UTF16::stateSave
* @covers MensBeam\Intl\Encoding\UTF16::stateApply * @covers MensBeam\Intl\Encoding\UTF16::stateApply
*/ */
@ -117,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() { public function provideStrings() {
return [ return [
// control samples // control samples
@ -128,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
'EOF after lead surrogate' => ["0000 34D8", [0, 65533]], 'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]], 'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
// invalid UTF-16 surrogates // invalid UTF-16 surrogates
'lead surrogate without trail' => ["34D8 0000", [65533, 0]], 'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]], 'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]], 'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]], 'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
]; ];
} }
} }

19
tests/cases/Encoding/TestUTF8.php

@ -76,6 +76,7 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\UTF8::posChar * @covers MensBeam\Intl\Encoding\UTF8::posChar
* @covers MensBeam\Intl\Encoding\UTF8::posByte * @covers MensBeam\Intl\Encoding\UTF8::posByte
* @covers MensBeam\Intl\Encoding\UTF8::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::len * @covers MensBeam\Intl\Encoding\UTF8::lenChar
* @covers MensBeam\Intl\Encoding\UTF8::lenByte
* @covers MensBeam\Intl\Encoding\UTF8::stateSave * @covers MensBeam\Intl\Encoding\UTF8::stateSave
* @covers MensBeam\Intl\Encoding\UTF8::stateApply * @covers MensBeam\Intl\Encoding\UTF8::stateApply
*/ */
@ -126,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+007A (HTML)' => [false, 0x7A, "7A"], 'U+007A (HTML)' => [false, 0x7A, "7A"],
@ -188,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]], 'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]], 'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates // UTF-16 surrogates
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]], // surrogates have alternate outputs for when surrogates are being allowed
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]], 'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]], 'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
// self-sync edge cases // self-sync edge cases
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]], 'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]], 'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],

12
tests/cases/Encoding/TestXUserDefined.php

@ -57,6 +57,7 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
/** /**
* @covers MensBeam\Intl\Encoding\XUserDefined::posChar * @covers MensBeam\Intl\Encoding\XUserDefined::posChar
* @covers MensBeam\Intl\Encoding\XUserDefined::posByte * @covers MensBeam\Intl\Encoding\XUserDefined::posByte
* @covers MensBeam\Intl\Encoding\XUserDefined::eof
*/ */
public function testTraversePastTheEndOfAString() { public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString(); return parent::testTraversePastTheEndOfAString();
@ -82,7 +83,8 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\XUserDefined::len * @covers MensBeam\Intl\Encoding\XUserDefined::lenChar
* @covers MensBeam\Intl\Encoding\XUserDefined::lenByte
* @covers MensBeam\Intl\Encoding\XUserDefined::stateSave * @covers MensBeam\Intl\Encoding\XUserDefined::stateSave
* @covers MensBeam\Intl\Encoding\XUserDefined::stateApply * @covers MensBeam\Intl\Encoding\XUserDefined::stateApply
*/ */
@ -107,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() { public function provideStrings() {
$a_bytes = []; $a_bytes = [];
$a_codes = []; $a_codes = [];

56
tests/cases/TestEncoding.php

@ -0,0 +1,56 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\TestCase;
use MensBeam\Intl\Encoding;
/** Exercises the static matchLabel() and createDecoder() helpers of \MensBeam\Intl\Encoding */
class TestEncoding extends \PHPUnit\Framework\TestCase {
    /** A label must yield the expected data verbatim, upper-cased, and padded with whitespace
     *
     * @dataProvider provideLabelData
     */
    public function testMatchALabelToAnEncoding(string $label, array $exp) {
        $variants = [$label, strtoupper($label), " $label\n\n\r\t"];
        foreach ($variants as $candidate) {
            $this->assertSame($exp, Encoding::matchLabel($candidate));
        }
    }

    /** A string which is not a label of any encoding must yield null */
    public function testFailToMatchALabelToAnEncoding() {
        $result = Encoding::matchLabel("Not a label");
        $this->assertNull($result);
    }

    /** A label must yield a decoder of the expected class verbatim, upper-cased, and padded with whitespace
     *
     * @dataProvider provideLabelData
     */
    public function testCreateADecoderFromALabel(string $label, array $data) {
        $expectedClass = $data['class'];
        $variants = [$label, strtoupper($label), " $label\n\n\r\t"];
        foreach ($variants as $candidate) {
            $this->assertInstanceOf($expectedClass, Encoding::createDecoder($candidate, ""));
        }
    }

    /** A string which is not a label of any encoding must not produce a decoder */
    public function testFailToCreateADecoderFromALabel() {
        $result = Encoding::createDecoder("Not a label", "");
        $this->assertNull($result);
    }

    /** Builds one data set per label by reflecting on the class files found in lib/Encoding
     *
     * Each set consists of the label itself followed by an array holding
     * the label, the name of its encoding, and the implementing class
     */
    public function provideLabelData() {
        $prefix = "MensBeam\\Intl\\Encoding\\";
        $nameByLabel = [];
        $classByName = [];
        $sources = new \GlobIterator(\MensBeam\Intl\BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME);
        foreach ($sources as $path) {
            // the class name is derived from the file name
            $fqcn = $prefix.basename($path, ".php");
            $reflector = new \ReflectionClass($fqcn);
            // only instantiable implementations of the Encoding interface are of interest
            if (!$reflector->implementsInterface(\MensBeam\Intl\Encoding\Encoding::class) || !$reflector->isInstantiable()) {
                continue;
            }
            $encodingName = $reflector->getConstant("NAME");
            $classByName[$encodingName] = $fqcn;
            foreach ($reflector->getConstant("LABELS") as $encodingLabel) {
                $nameByLabel[$encodingLabel] = $encodingName;
            }
        }
        $sets = [];
        foreach ($nameByLabel as $encodingLabel => $encodingName) {
            // PHP converts integer-like string keys to integers, so the label is cast back to a string
            $text = (string) $encodingLabel;
            $sets[] = [$text, ['label' => $text, 'name' => $encodingName, 'class' => $classByName[$encodingName]]];
        }
        return $sets;
    }
}

23
tests/lib/DecoderTest.php

@ -120,26 +120,32 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$l = strlen($this->lowerA); $l = strlen($this->lowerA);
$this->assertSame(0, $s->posChar()); $this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte()); $this->assertSame(0, $s->posByte());
$this->assertFalse($s->eof());
$this->assertSame("a", $s->nextChar()); $this->assertSame("a", $s->nextChar());
$this->assertSame(1, $s->posChar()); $this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte()); $this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$this->assertSame("", $s->nextChar()); $this->assertSame("", $s->nextChar());
$this->assertSame(1, $s->posChar()); $this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte()); $this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$s = new $class($this->lowerA); $s = new $class($this->lowerA);
$this->assertSame(0, $s->posChar()); $this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte()); $this->assertSame(0, $s->posByte());
$this->assertFalse($s->eof());
$this->assertSame(ord("a"), $s->nextCode()); $this->assertSame(ord("a"), $s->nextCode());
$this->assertSame(1, $s->posChar()); $this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte()); $this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$this->assertSame(false, $s->nextCode()); $this->assertSame(false, $s->nextCode());
$this->assertSame(1, $s->posChar()); $this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte()); $this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
} }
public function testPeekAtCharacters() { public function testPeekAtCharacters() {
@ -220,7 +226,10 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$posChar = $s->posChar(); $posChar = $s->posChar();
$posByte = $s->posByte(); $posByte = $s->posByte();
$this->assertSame(sizeof($points), $s->len()); $this->assertSame(sizeof($points), $s->lenChar());
$this->assertSame($posChar, $s->posChar());
$this->assertSame($posByte, $s->posByte());
$this->assertSame(strlen($input), $s->lenByte());
$this->assertSame($posChar, $s->posChar()); $this->assertSame($posChar, $s->posChar());
$this->assertSame($posByte, $s->posByte()); $this->assertSame($posByte, $s->posByte());
} }
@ -272,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
} }
public function testIterateThroughAString(string $input, array $exp) { public function testIterateThroughAString(string $input, array $exp) {
$this->iterateThroughAString($input, $exp, false);
}
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
$exp = $relaxedExp ?? $strictExp;
$this->iterateThroughAString($input, $exp, true);
}
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
$class = $this->testedClass; $class = $this->testedClass;
$input = $this->prepString($input); $input = $this->prepString($input);
$s = new $class($input); $s = new $class($input, false, $allowSurrogates);
$out = [];
$a = 0; $a = 0;
$this->assertTrue(true); // prevent risky test of empty string $this->assertTrue(true); // prevent risky test of empty string
foreach ($s->codes() as $index => $p) { foreach ($s->codes() as $index => $p) {

3
tests/phpunit.xml

@ -7,7 +7,6 @@
convertWarningsToExceptions="false" convertWarningsToExceptions="false"
beStrictAboutTestsThatDoNotTestAnything="true" beStrictAboutTestsThatDoNotTestAnything="true"
beStrictAboutOutputDuringTests="true" beStrictAboutOutputDuringTests="true"
beStrictAboutTestSize="true"
stopOnError="true"> stopOnError="true">
<filter> <filter>
@ -27,6 +26,8 @@
<file>cases/Encoding/TestGB18030.php</file> <file>cases/Encoding/TestGB18030.php</file>
<file>cases/Encoding/TestBig5.php</file> <file>cases/Encoding/TestBig5.php</file>
<file>cases/Encoding/TestEUCKR.php</file> <file>cases/Encoding/TestEUCKR.php</file>
<file>cases/TestEncoding.php</file>
</testsuite> </testsuite>
</testsuites> </testsuites>
</phpunit> </phpunit>

3
tools/mkindex.php

@ -1,5 +1,8 @@
<?php <?php
declare(strict_types=1); declare(strict_types=1);
// This script produces the index lookup tables
// for a given encoding from the source data at WHATWG
$labels = [ $labels = [
'big5' => "big5", 'big5' => "big5",
'euc-jp' => "eucjp", 'euc-jp' => "eucjp",

40
tools/mklabels.php

@ -0,0 +1,40 @@
<?php
// this script reads the names and labels from each concrete
// class in the Encoding set and generates tables mapping labels
// to names and names to classes
use MensBeam\Intl\Encoding\Encoding;
define("BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";
// namespace prefix prepended to each file's base name to form a class name
$prefix = "\\MensBeam\\Intl\\Encoding\\";
// label => encoding name
$labelToName = [];
// encoding name => class name
$nameToClass = [];
$sources = new \GlobIterator(BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME);
foreach ($sources as $path) {
    $fqcn = $prefix.basename($path, ".php");
    $reflector = new \ReflectionClass($fqcn);
    // skip anything which is not an instantiable implementation of the Encoding interface
    if (!$reflector->implementsInterface(Encoding::class) || !$reflector->isInstantiable()) {
        continue;
    }
    $encodingName = $reflector->getConstant("NAME");
    $nameToClass[$encodingName] = $fqcn;
    foreach ($reflector->getConstant("LABELS") as $label) {
        $labelToName[$label] = $encodingName;
    }
}

// render each table as the source text of a PHP constant declaration
$labelEntries = [];
foreach ($labelToName as $label => $encodingName) {
    $labelEntries[] = "'$label'=>\"$encodingName\"";
}
$nameEntries = [];
foreach ($nameToClass as $encodingName => $fqcn) {
    $nameEntries[] = "'$encodingName'=>$fqcn::class";
}

// print the label table first, then the name table, one per line
echo "const LABEL_MAP = [".implode(",", $labelEntries)."];\n";
echo "const NAME_MAP = [".implode(",", $nameEntries)."];\n";

633
vendor-bin/csfixer/composer.lock

File diff suppressed because it is too large

2
vendor-bin/phpunit/composer.json

@ -1,5 +1,5 @@
{ {
"require": { "require": {
"phpunit/phpunit": "^6.5" "phpunit/phpunit": "^8.5"
} }
} }

590
vendor-bin/phpunit/composer.lock

File diff suppressed because it is too large

676
vendor-bin/robo/composer.lock

File diff suppressed because it is too large
Loading…
Cancel
Save