Browse Source

Merge branch 'master' into multi-byte

multi-byte
J. King 4 years ago
parent
commit
f49d632642
  1. 1
      .gitignore
  2. 33
      CHANGELOG
  3. 86
      RoboFile.php
  4. 14
      composer.lock
  5. 36
      lib/Encoding.php
  6. 22
      lib/Encoding/Big5.php
  7. 25
      lib/Encoding/EUCKR.php
  8. 30
      lib/Encoding/Encoding.php
  9. 27
      lib/Encoding/GBCommon.php
  10. 55
      lib/Encoding/GenericEncoding.php
  11. 33
      lib/Encoding/SingleByteEncoding.php
  12. 27
      lib/Encoding/UTF16.php
  13. 16
      lib/Encoding/UTF8.php
  14. 7
      lib/Encoding/XUserDefined.php
  15. 16
      robo
  16. 6
      tests/bootstrap.php
  17. 12
      tests/cases/Encoding/TestBig5.php
  18. 12
      tests/cases/Encoding/TestEUCKR.php
  19. 14
      tests/cases/Encoding/TestGB18030.php
  20. 13
      tests/cases/Encoding/TestSingleByte.php
  21. 8
      tests/cases/Encoding/TestUTF16BE.php
  22. 21
      tests/cases/Encoding/TestUTF16LE.php
  23. 19
      tests/cases/Encoding/TestUTF8.php
  24. 12
      tests/cases/Encoding/TestXUserDefined.php
  25. 56
      tests/cases/TestEncoding.php
  26. 23
      tests/lib/DecoderTest.php
  27. 3
      tests/phpunit.xml
  28. 3
      tools/mkindex.php
  29. 40
      tools/mklabels.php
  30. 633
      vendor-bin/csfixer/composer.lock
  31. 2
      vendor-bin/phpunit/composer.json
  32. 590
      vendor-bin/phpunit/composer.lock
  33. 676
      vendor-bin/robo/composer.lock

1
.gitignore

@ -1,5 +1,6 @@
vendor/
tests/coverage/
/tests/.phpunit.result.cache
perf/docs/
.php_cs.cache

33
CHANGELOG

@ -1,3 +1,36 @@
Version 0.7.0 (2019-12-20)
==========================
New features:
- Added \MensBeam\Intl\Encoding abstract class with createDecoder() and
matchLabel() static methods
Version 0.6.0 (2019-12-18)
==========================
New features:
- Added $allowSurrogates parameter to Encoding constructor
- Added posErr public instance property to Encoding
Version 0.5.0 (2019-12-13)
==========================
Breaking changes:
- Rename Encoding::len() to Encoding::lenChar()
New features:
- Add Encoding::lenByte() method
- Add Encoding::eof() method
Version 0.4.0 (2018-09-15)
==========================
New features:
- Implementation of UTF-16 encoding
- Implementation of Big5 encoding
- Implementation of EUC-KR encoding
- Implementation of x-user-defined encoding
Version 0.3.0 (2018-08-29)
==========================

86
RoboFile.php

@ -3,12 +3,21 @@ declare(strict_types=1);
use Robo\Result;
class RoboFile extends \Robo\Tasks {
const BASE = __DIR__.\DIRECTORY_SEPARATOR;
const BASE_TEST = self::BASE."tests".\DIRECTORY_SEPARATOR;
const BASE = __DIR__.\DIRECTORY_SEPARATOR;
const BASE_TEST = BASE."tests".\DIRECTORY_SEPARATOR;
define("IS_WIN", defined("PHP_WINDOWS_VERSION_MAJOR"));
define("IS_MAC", php_uname("s") === "Darwin");
/**
* Runs the typical test suite
/** Resolves a path to its canonical form where possible
 *
 * If realpath() cannot resolve the path, the input is returned with both
 * forward slashes and backslashes replaced by the platform's directory
 * separator instead.
 */
function norm(string $path): string {
    $resolved = realpath($path);
    // realpath() yields false on failure; fall back to separator normalization
    return $resolved ?: str_replace(["/", "\\"], \DIRECTORY_SEPARATOR, $path);
}
class RoboFile extends \Robo\Tasks {
/** Runs the typical test suite
*
* Arguments passed to the task are passed on to PHPUnit. Thus one may, for
* example, run the following command and get the expected results:
@ -18,17 +27,16 @@ class RoboFile extends \Robo\Tasks {
* Please see the PHPUnit documentation for available options.
*/
public function test(array $args): Result {
return $this->runTests("php", "typical", $args);
return $this->runTests(escapeshellarg(\PHP_BINARY), "typical", $args);
}
/**
* Runs the full test suite
/** Runs the full test suite
*
* This includes pedantic tests which may help to identify problems.
* See help for the "test" task for more details.
*/
public function testFull(array $args): Result {
return $this->runTests("php", "full", $args);
return $this->runTests(escapeshellarg(\PHP_BINARY), "full", $args);
}
/**
@ -37,7 +45,7 @@ class RoboFile extends \Robo\Tasks {
* See help for the "test" task for more details.
*/
public function testQuick(array $args): Result {
return $this->runTests("php", "quick", $args);
return $this->runTests(escapeshellarg(\PHP_BINARY), "quick", $args);
}
/** Produces a code coverage report
@ -53,37 +61,56 @@ class RoboFile extends \Robo\Tasks {
public function coverage(array $args): Result {
// run tests with code coverage reporting enabled
$exec = $this->findCoverageEngine();
return $this->runTests($exec, "typical", array_merge(["--coverage-html", self::BASE_TEST."coverage"], $args));
return $this->runTests($exec, "coverage", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
}
/** Runs a performance evaluation.
/** Produces a code coverage report, with redundant tests
*
* The performance of the library's basic functionality is tested against
* the IntlCodePointBreakIterator class
* Depending on the environment, some tests that normally provide
* coverage may be skipped, while working alternatives are normally
* suppressed for reasons of time. This coverage report will try to
* run all tests which may cover code.
*
* See also help for the "coverage" task for more details.
*/
public function perf(array $args): Result {
$execpath = realpath(self::BASE."perf/perf.php");
return $this->taskExec("php")->arg($execpath)->args($args)->run();
public function coverageFull(array $args): Result {
// run tests with code coverage reporting enabled
$exec = $this->findCoverageEngine();
return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
}
/** Runs the coding standards fixer */
public function clean($opts = ['demo|d' => false]): Result {
$t = $this->taskExec(realpath(self::BASE."vendor/bin/php-cs-fixer"));
$t->arg("fix")->arg("--allow-risky=yes");
$t = $this->taskExec(norm(BASE."vendor/bin/php-cs-fixer"));
$t->arg("fix");
if ($opts['demo']) {
$t->args("--dry-run", "--diff")->option("--diff-format", "udiff");
}
return $t->run();
}
/** Runs a performance evaluation.
 *
 * The performance of the library's basic functionality is tested against
 * the IntlCodePointBreakIterator class
 *
 * The script is executed with the PHP binary that is running this task,
 * in the same way the test tasks invoke PHPUnit.
 *
 * @param array $args Extra command-line arguments passed on to perf/perf.php
 */
public function perf(array $args): Result {
    // norm() already tries realpath() and falls back to a separator-normalized
    // path; wrapping it in realpath() again would turn a missing file into false
    $execpath = norm(BASE."perf/perf.php");
    return $this->taskExec(escapeshellarg(\PHP_BINARY))->arg($execpath)->args($args)->run();
}
protected function findCoverageEngine(): string {
$null = null;
$code = 0;
exec("phpdbg --version", $null, $code);
if (!$code) {
return "phpdbg -qrr";
if (IS_WIN) {
$dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
$dbg = file_exists($dbg) ? $dbg : "";
} else {
return "php";
$dbg = trim(`which phpdbg 2>/dev/null`);
}
if ($dbg) {
return escapeshellarg($dbg)." -qrr";
} else {
$ext = IS_WIN ? "dll" : (IS_MAC ? "dylib" : "so");
return escapeshellarg(\PHP_BINARY)." -d zend_extension=xdebug.$ext";
}
}
@ -95,14 +122,17 @@ class RoboFile extends \Robo\Tasks {
case "quick":
$set = ["--exclude-group", "optional,slow"];
break;
case "coverage":
$set = ["--exclude-group", "optional,coverageOptional"];
break;
case "full":
$set = [];
break;
default:
throw new \Exception;
}
$execpath = realpath(self::BASE."vendor-bin/phpunit/vendor/phpunit/phpunit/phpunit");
$confpath = realpath(self::BASE_TEST."phpunit.xml");
return $this->taskExec($executor)->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
$execpath = norm(BASE."vendor-bin/phpunit/vendor/phpunit/phpunit/phpunit");
$confpath = realpath(BASE_TEST."phpunit.dist.xml") ?: norm(BASE_TEST."phpunit.xml");
return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($set, $args))->run();
}
}

14
composer.lock

@ -1,7 +1,7 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "ba27aa72527421b04188393db2c8510b",
@ -9,16 +9,16 @@
"packages-dev": [
{
"name": "bamarni/composer-bin-plugin",
"version": "v1.2.0",
"version": "v1.3.0",
"source": {
"type": "git",
"url": "https://github.com/bamarni/composer-bin-plugin.git",
"reference": "62fef740245a85f00665e81ea8f0aa0b72afe6e7"
"reference": "67f9d314dc7ecf7245b8637906e151ccc62b8d24"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/62fef740245a85f00665e81ea8f0aa0b72afe6e7",
"reference": "62fef740245a85f00665e81ea8f0aa0b72afe6e7",
"url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/67f9d314dc7ecf7245b8637906e151ccc62b8d24",
"reference": "67f9d314dc7ecf7245b8637906e151ccc62b8d24",
"shasum": ""
},
"require": {
@ -26,7 +26,7 @@
},
"require-dev": {
"composer/composer": "dev-master",
"symfony/console": "^2.5 || ^3.0"
"symfony/console": "^2.5 || ^3.0 || ^4.0"
},
"type": "composer-plugin",
"extra": {
@ -44,7 +44,7 @@
"license": [
"MIT"
],
"time": "2017-09-11T13:13:58+00:00"
"time": "2019-03-17T12:38:04+00:00"
}
],
"aliases": [],

36
lib/Encoding.php

File diff suppressed because one or more lines are too long

22
lib/Encoding/Big5.php

File diff suppressed because one or more lines are too long

25
lib/Encoding/EUCKR.php

File diff suppressed because one or more lines are too long

30
lib/Encoding/Encoding.php

@ -19,10 +19,10 @@ interface Encoding {
const E_UNAVAILABLE_CODE_POINT = 4;
/** Constructs a new decoder
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/
public function __construct(string $string, bool $fatal = false);
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
/** Returns the current byte position of the decoder */
public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
* @return int|false
*/
public function nextCode();
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
*/
public function seek(int $distance): int;
@ -58,17 +58,29 @@ interface Encoding {
*/
public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The number of characters to retrieve
*/
public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer */
/** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The number of code points to retrieve
*/
public function peekCode(int $num = 1): array;
/** Calculates the length of the string in bytes */
public function lenByte(): int;
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function len(): int;
public function lenChar(): int;
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool;
/** Generates an iterator which steps through each character in the string */
public function chars(): \Generator;

27
lib/Encoding/GBCommon.php

@ -13,14 +13,6 @@ abstract class GBCommon implements StatelessEncoding {
const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$first = 0;
$second = 0;
@ -37,6 +29,7 @@ abstract class GBCommon implements StatelessEncoding {
$first = $b;
continue;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} elseif ($second === 0) {
@ -49,8 +42,10 @@ abstract class GBCommon implements StatelessEncoding {
$pointer = ($first - 0x81) * 190 + ($b - $offset);
return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
}
@ -60,6 +55,7 @@ abstract class GBCommon implements StatelessEncoding {
continue;
} else {
$this->posByte -= 2;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} else {
@ -79,10 +75,12 @@ abstract class GBCommon implements StatelessEncoding {
if (isset($codePointOffset)) {
return $codePointOffset + $pointer - $offset;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} else {
$this->posByte -= 3;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
}
@ -95,16 +93,11 @@ abstract class GBCommon implements StatelessEncoding {
} else {
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -148,12 +141,6 @@ abstract class GBCommon implements StatelessEncoding {
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
if ($this->posByte == $this->lenByte && $this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$distance--;
$this->posChar--;
}
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;

55
lib/Encoding/GenericEncoding.php

@ -12,41 +12,32 @@ trait GenericEncoding {
protected $posChar = 0;
protected $lenByte = null;
protected $lenChar = null;
protected $dirtyEOF = 0;
protected $errMode = self::MODE_REPLACE;
protected $allowSurrogates = false;
/** Constructs a new decoder
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
*/
public function __construct(string $string, bool $fatal = false) {
public $posErr = 0;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
$this->string = $string;
$this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates;
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posByte;
}
/** Returns the current character position of the decoder */
public function posChar(): int {
return $this->posChar;
}
/** Seeks to the start of the string
*
* This is usually faster than using the seek method for the same purpose
*/
public function rewind() {
$this->posByte = 0;
$this->posChar = 0;
}
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];
@ -64,12 +55,6 @@ trait GenericEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
if ($distance > 0) {
if ($this->posByte == strlen($this->string)) {
@ -84,6 +69,13 @@ trait GenericEncoding {
if (!$this->posChar) {
return $distance;
}
if ($this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$this->dirtyEOF = 0;
$distance--;
$this->posChar--;
}
$mode = $this->errMode;
$this->errMode = self::MODE_NULL;
$out = $this->seekBack($distance);
@ -94,7 +86,6 @@ trait GenericEncoding {
}
}
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
public function peekChar(int $num = 1): string {
$out = "";
$state = $this->stateSave();
@ -108,7 +99,6 @@ trait GenericEncoding {
return $out;
}
/** Retrieves the next $num code points from the string, without advancing the character pointer */
public function peekCode(int $num = 1): array {
$out = [];
$state = $this->stateSave();
@ -122,11 +112,11 @@ trait GenericEncoding {
return $out;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function len(): int {
public function lenByte(): int {
return $this->lenByte;
}
public function lenChar(): int {
return $this->lenChar ?? (function() {
$state = $this->stateSave();
while ($this->nextCode() !== false);
@ -136,14 +126,16 @@ trait GenericEncoding {
})();
}
/** Generates an iterator which steps through each character in the string */
public function eof(): bool {
return $this->posByte >= $this->lenByte;
}
public function chars(): \Generator {
while (($c = $this->nextChar()) !== "") {
yield ($this->posChar - 1) => $c;
}
}
/** Generates an iterator which steps through each code point in the string */
public function codes(): \Generator {
while (($c = $this->nextCode()) !== false) {
yield ($this->posChar - 1) => $c;
@ -155,6 +147,7 @@ trait GenericEncoding {
return [
'posChar' => $this->posChar,
'posByte' => $this->posByte,
'posErr' => $this->posErr,
];
}
@ -181,7 +174,7 @@ trait GenericEncoding {
// fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC:
// fatal replacement mode for decoders; not applicable to Unicode transformation formats
// fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
default:
// indicative of internal bug; should never be triggered

33
lib/Encoding/SingleByteEncoding.php

@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
abstract class SingleByteEncoding implements StatelessEncoding {
use GenericEncoding;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
if ($distance > 0) {
while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,16 +70,15 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posChar;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function len(): int {
public function lenChar(): int {
return $this->lenByte;
}
public function eof(): bool {
return $this->posChar >= $this->lenByte;
}
}

27
lib/Encoding/UTF16.php

@ -8,15 +8,7 @@ namespace MensBeam\Intl\Encoding;
abstract class UTF16 implements Encoding {
use GenericEncoding;
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$lead_b = null;
$lead_s = null;
@ -36,6 +28,9 @@ abstract class UTF16 implements Encoding {
if (!is_null($lead_s)) {
if ($code >= 0xDC00 && $code <= 0xDFFF) {
return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
} elseif ($this->allowSurrogates) {
$this->posByte -= 2;
return $lead_s;
} else {
$this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
@ -45,7 +40,11 @@ abstract class UTF16 implements Encoding {
$lead_s = $code;
continue;
} elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
if ($this->allowSurrogates) {
return $code;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
}
} else {
return $code;
}
@ -65,10 +64,6 @@ abstract class UTF16 implements Encoding {
}
}
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];
@ -83,12 +78,6 @@ abstract class UTF16 implements Encoding {
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
if ($this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
// if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
$this->posByte -= $this->dirtyEOF;
$distance--;
$this->posChar--;
}
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;

16
lib/Encoding/UTF8.php

@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
const NAME = "UTF-8";
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// optimization for ASCII characters
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
if ($b==0xE0) {
$lower = 0xA0;
} elseif ($b==0xED) {
$upper = 0x9F;
$upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
}
$point = $b & 0xF;
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
}
$point = $b & 0x7;
} else { // invalid byte
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]);
}
} elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]);
} else {
$lower = 0x80;
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
return $point;
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
*/
public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
if ($codePoint < 0 || $codePoint > 0x10FFFF) {

7
lib/Encoding/XUserDefined.php

@ -88,7 +88,12 @@ class XUserDefined implements Encoding {
*
* Note that this may involve processing to the end of the string
*/
public function len(): int {
public function lenChar(): int {
return $this->lenByte;
}
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool {
return $this->posChar >= $this->lenByte;
}
}

16
robo

@ -1,10 +1,14 @@
#! /bin/sh
base=`dirname "$0"`
roboCommand="$1"
shift
if [ "$1" == "clean" ]; then
"$base/vendor/bin/robo" "$roboCommand" $*
if [ $# -eq 0 ]; then
"$base/vendor/bin/robo"
else
"$base/vendor/bin/robo" "$roboCommand" -- $*
fi
shift
ulimit -n 2048
if [ "$1" = "clean" ]; then
"$base/vendor/bin/robo" "$roboCommand" "$@"
else
"$base/vendor/bin/robo" "$roboCommand" -- "$@"
fi
fi

6
tests/bootstrap.php

@ -4,10 +4,14 @@
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\UTF8;
namespace MensBeam\Intl;
const NS_BASE = __NAMESPACE__."\\";
define(NS_BASE."BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
ini_set("memory_limit", "-1");
error_reporting(\E_ALL);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";
if (function_exists("xdebug_set_filter")) {
xdebug_set_filter(\XDEBUG_FILTER_CODE_COVERAGE, \XDEBUG_PATH_WHITELIST, [BASE."lib/"]);
}

12
tests/cases/Encoding/TestBig5.php

@ -76,6 +76,7 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @covers MensBeam\Intl\Encoding\Big5::posChar
* @covers MensBeam\Intl\Encoding\Big5::posByte
* @covers MensBeam\Intl\Encoding\Big5::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\Big5::len
* @covers MensBeam\Intl\Encoding\Big5::lenChar
* @covers MensBeam\Intl\Encoding\Big5::lenByte
* @covers MensBeam\Intl\Encoding\Big5::stateSave
* @covers MensBeam\Intl\Encoding\Big5::stateApply
*/
@ -126,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent test of the same name, fed by this class's
 * provideStrings data sets
 *
 * @dataProvider provideStrings
 * @coversNothing
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // explicit ?array: implicit nullability via a null default is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+0064 (HTML)' => [false, 0x64, "64"],

12
tests/cases/Encoding/TestEUCKR.php

@ -76,6 +76,7 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @covers MensBeam\Intl\Encoding\EUCKR::posChar
* @covers MensBeam\Intl\Encoding\EUCKR::posByte
* @covers MensBeam\Intl\Encoding\EUCKR::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\EUCKR::len
* @covers MensBeam\Intl\Encoding\EUCKR::lenChar
* @covers MensBeam\Intl\Encoding\EUCKR::lenByte
* @covers MensBeam\Intl\Encoding\EUCKR::stateSave
* @covers MensBeam\Intl\Encoding\EUCKR::stateApply
*/
@ -126,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent test of the same name, fed by this class's
 * provideStrings data sets
 *
 * @dataProvider provideStrings
 * @coversNothing
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // explicit ?array: implicit nullability via a null default is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+0064 (HTML)' => [false, 0x64, "64"],

14
tests/cases/Encoding/TestGB18030.php

@ -29,7 +29,7 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
public function tearDown() {
public function tearDown(): void {
$this->testedClass = GB18030::class;
}
@ -84,6 +84,7 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @covers MensBeam\Intl\Encoding\GB18030::posChar
* @covers MensBeam\Intl\Encoding\GB18030::posByte
* @covers MensBeam\Intl\Encoding\GB18030::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -109,7 +110,8 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::len
* @covers MensBeam\Intl\Encoding\GB18030::lenChar
* @covers MensBeam\Intl\Encoding\GB18030::lenByte
* @covers MensBeam\Intl\Encoding\GB18030::stateSave
* @covers MensBeam\Intl\Encoding\GB18030::stateApply
*/
@ -134,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent implementation; overridden here only to attach
 * the data provider and coverage annotations below.
 *
 * @dataProvider provideStrings
 * @coversNothing
 *
 * @param string $input Passed through to the parent unchanged
 * @param array $strictExp Passed through to the parent unchanged
 * @param array|null $relaxedExp Passed through to the parent unchanged; may be omitted
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // The nullable type is spelled out: relying on "= null" to make the
    // parameter implicitly nullable is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
// bytes confirmed using Firefox
$series_gb18030 = [

13
tests/cases/Encoding/TestSingleByte.php

@ -139,6 +139,7 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
* @dataProvider provideClasses
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::posChar
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::posByte
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::eof
*/
public function testTraversePastTheEndOfAString(string $class = SingleByteEncoding::class) {
$this->testedClass = $class;
@ -173,7 +174,8 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::len
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::lenChar
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::lenByte
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateSave
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateApply
*/
@ -203,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Selects the class under test, then delegates to the parent implementation.
*
* The same expectation array is passed to the parent as both its second and
* third argument, so the parent receives identical strict and relaxed
* expectations.
*
* NOTE(review): $class defaults to null and is assigned to testedClass
* as-is; presumably provideStrings always supplies a class name -- confirm.
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
$this->testedClass = $class;
return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
}
public function provideClasses() {
foreach (self::$classes as $name => $class) {
yield $name => [$class];

8
tests/cases/Encoding/TestUTF16BE.php

@ -6,7 +6,6 @@
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16BE extends TestUTF16LE {
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {
public function provideStrings() {
foreach (parent::provideStrings() as $name => $test) {
list($string, $codes) = $test;
if (sizeof($test) == 2) {
$test[] = null;
}
list($string, $codes, $altCodes) = $test;
$words = explode(" ", $string);
foreach ($words as $a => $word) {
if (strlen($word) == 4) {
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
}
}
$string = implode(" ", $words);
yield $name => [$string, $codes];
yield $name => [$string, $codes, $altCodes];
}
}
}

21
tests/cases/Encoding/TestUTF16LE.php

@ -7,7 +7,6 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
protected $testedClass = UTF16LE::class;
@ -67,6 +66,7 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
/**
* @covers MensBeam\Intl\Encoding\UTF16::posChar
* @covers MensBeam\Intl\Encoding\UTF16::posByte
* @covers MensBeam\Intl\Encoding\UTF16::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -92,7 +92,8 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::len
* @covers MensBeam\Intl\Encoding\UTF16::lenChar
* @covers MensBeam\Intl\Encoding\UTF16::lenByte
* @covers MensBeam\Intl\Encoding\UTF16::stateSave
* @covers MensBeam\Intl\Encoding\UTF16::stateApply
*/
@ -117,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent implementation; overridden here only to attach
 * the data provider and coverage annotations below.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\UTF16::nextCode
 *
 * @param string $input Passed through to the parent unchanged
 * @param array $strictExp Passed through to the parent unchanged
 * @param array|null $relaxedExp Passed through to the parent unchanged; may be omitted
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // The nullable type is spelled out: relying on "= null" to make the
    // parameter implicitly nullable is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() {
return [
// control samples
@ -128,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
// invalid UTF-16 surrogates
'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
];
}
}

19
tests/cases/Encoding/TestUTF8.php

@ -76,6 +76,7 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @covers MensBeam\Intl\Encoding\UTF8::posChar
* @covers MensBeam\Intl\Encoding\UTF8::posByte
* @covers MensBeam\Intl\Encoding\UTF8::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -101,7 +102,8 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::len
* @covers MensBeam\Intl\Encoding\UTF8::lenChar
* @covers MensBeam\Intl\Encoding\UTF8::lenByte
* @covers MensBeam\Intl\Encoding\UTF8::stateSave
* @covers MensBeam\Intl\Encoding\UTF8::stateApply
*/
@ -126,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent implementation; overridden here only to attach
 * the data provider and coverage annotations below.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\UTF8::nextCode
 *
 * @param string $input Passed through to the parent unchanged
 * @param array $strictExp Passed through to the parent unchanged
 * @param array|null $relaxedExp Passed through to the parent unchanged; may be omitted
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // The nullable type is spelled out: relying on "= null" to make the
    // parameter implicitly nullable is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+007A (HTML)' => [false, 0x7A, "7A"],
@ -188,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]],
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]],
// surrogates have alternate outputs for when surrogates are being allowed
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
// self-sync edge cases
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],

12
tests/cases/Encoding/TestXUserDefined.php

@ -57,6 +57,7 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
/**
* @covers MensBeam\Intl\Encoding\XUserDefined::posChar
* @covers MensBeam\Intl\Encoding\XUserDefined::posByte
* @covers MensBeam\Intl\Encoding\XUserDefined::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
@ -82,7 +83,8 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\XUserDefined::len
* @covers MensBeam\Intl\Encoding\XUserDefined::lenChar
* @covers MensBeam\Intl\Encoding\XUserDefined::lenByte
* @covers MensBeam\Intl\Encoding\XUserDefined::stateSave
* @covers MensBeam\Intl\Encoding\XUserDefined::stateApply
*/
@ -107,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
 * Delegates to the parent implementation; overridden here only to attach
 * the data provider and coverage annotations below.
 *
 * @dataProvider provideStrings
 * @coversNothing
 *
 * @param string $input Passed through to the parent unchanged
 * @param array $strictExp Passed through to the parent unchanged
 * @param array|null $relaxedExp Passed through to the parent unchanged; may be omitted
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // The nullable type is spelled out: relying on "= null" to make the
    // parameter implicitly nullable is deprecated as of PHP 8.4
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() {
$a_bytes = [];
$a_codes = [];

56
tests/cases/TestEncoding.php

@ -0,0 +1,56 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\TestCase;
use MensBeam\Intl\Encoding;
class TestEncoding extends \PHPUnit\Framework\TestCase {
    /**
     * A label must resolve to the expected record as-is, upper-cased, and
     * when wrapped in whitespace.
     *
     * @dataProvider provideLabelData
     */
    public function testMatchALabelToAnEncoding(string $label, array $exp) {
        $variants = [$label, strtoupper($label), " $label\n\n\r\t"];
        foreach ($variants as $variant) {
            $this->assertSame($exp, Encoding::matchLabel($variant));
        }
    }

    /** A string which is not a label yields null rather than a record */
    public function testFailToMatchALabelToAnEncoding() {
        $this->assertNull(Encoding::matchLabel("Not a label"));
    }

    /**
     * A decoder of the expected class must be produced for a label as-is,
     * upper-cased, and when wrapped in whitespace.
     *
     * @dataProvider provideLabelData
     */
    public function testCreateADecoderFromALabel(string $label, array $data) {
        $variants = [$label, strtoupper($label), " $label\n\n\r\t"];
        foreach ($variants as $variant) {
            $this->assertInstanceOf($data['class'], Encoding::createDecoder($variant, ""));
        }
    }

    /** A string which is not a label yields null rather than a decoder */
    public function testFailToCreateADecoderFromALabel() {
        $this->assertNull(Encoding::createDecoder("Not a label", ""));
    }

    /**
     * Builds one data set per label by reflecting on every PHP file in
     * lib/Encoding and reading the NAME and LABELS constants of each
     * instantiable class which implements the Encoding interface.
     *
     * @return array List of [label, ['label' => ..., 'name' => ..., 'class' => ...]]
     */
    public function provideLabelData() {
        $prefix = "MensBeam\\Intl\\Encoding\\";
        $labelToName = [];
        $nameToClass = [];
        $files = new \GlobIterator(\MensBeam\Intl\BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME);
        foreach ($files as $path) {
            $fqcn = $prefix.basename($path, ".php");
            $reflector = new \ReflectionClass($fqcn);
            // only concrete implementations of the interface carry usable constants
            if (!$reflector->implementsInterface(\MensBeam\Intl\Encoding\Encoding::class) || !$reflector->isInstantiable()) {
                continue;
            }
            $encodingName = $reflector->getConstant("NAME");
            $nameToClass[$encodingName] = $fqcn;
            foreach ($reflector->getConstant("LABELS") as $alias) {
                $labelToName[$alias] = $encodingName;
            }
        }
        $datasets = [];
        foreach ($labelToName as $alias => $encodingName) {
            // purely numeric labels become integer array keys, so cast back to string
            $alias = (string) $alias;
            $datasets[] = [
                $alias,
                ['label' => $alias, 'name' => $encodingName, 'class' => $nameToClass[$encodingName]],
            ];
        }
        return $datasets;
    }
}

23
tests/lib/DecoderTest.php

@ -120,26 +120,32 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$l = strlen($this->lowerA);
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertFalse($s->eof());
$this->assertSame("a", $s->nextChar());
$this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$this->assertSame("", $s->nextChar());
$this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$s = new $class($this->lowerA);
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertFalse($s->eof());
$this->assertSame(ord("a"), $s->nextCode());
$this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
$this->assertSame(false, $s->nextCode());
$this->assertSame(1, $s->posChar());
$this->assertSame($l, $s->posByte());
$this->assertTrue($s->eof());
}
public function testPeekAtCharacters() {
@ -220,7 +226,10 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$posChar = $s->posChar();
$posByte = $s->posByte();
$this->assertSame(sizeof($points), $s->len());
$this->assertSame(sizeof($points), $s->lenChar());
$this->assertSame($posChar, $s->posChar());
$this->assertSame($posByte, $s->posByte());
$this->assertSame(strlen($input), $s->lenByte());
$this->assertSame($posChar, $s->posChar());
$this->assertSame($posByte, $s->posByte());
}
@ -272,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
}
/**
* Runs the shared iteration check with false as its third argument
* (named $allowSurrogates by iterateThroughAString()).
*
* @param string $input Passed through to iterateThroughAString() unchanged
* @param array $exp Passed through to iterateThroughAString() unchanged
*/
public function testIterateThroughAString(string $input, array $exp) {
$this->iterateThroughAString($input, $exp, false);
}
/**
 * Runs the shared iteration check with true as its third argument
 * (named $allowSurrogates by iterateThroughAString()).
 *
 * @param string $input Passed through to iterateThroughAString() unchanged
 * @param array $strictExp Expectation used when no relaxed expectation is given
 * @param array|null $relaxedExp Expectation to use instead of $strictExp, or null to fall back to $strictExp
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
    // The nullable type is spelled out: relying on "= null" to make the
    // parameter implicitly nullable is deprecated as of PHP 8.4
    $exp = $relaxedExp ?? $strictExp;
    $this->iterateThroughAString($input, $exp, true);
}
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
$class = $this->testedClass;
$input = $this->prepString($input);
$s = new $class($input);
$out = [];
$s = new $class($input, false, $allowSurrogates);
$a = 0;
$this->assertTrue(true); // prevent risky test of empty string
foreach ($s->codes() as $index => $p) {

3
tests/phpunit.xml

@ -7,7 +7,6 @@
convertWarningsToExceptions="false"
beStrictAboutTestsThatDoNotTestAnything="true"
beStrictAboutOutputDuringTests="true"
beStrictAboutTestSize="true"
stopOnError="true">
<filter>
@ -27,6 +26,8 @@
<file>cases/Encoding/TestGB18030.php</file>
<file>cases/Encoding/TestBig5.php</file>
<file>cases/Encoding/TestEUCKR.php</file>
<file>cases/TestEncoding.php</file>
</testsuite>
</testsuites>
</phpunit>

3
tools/mkindex.php

@ -1,5 +1,8 @@
<?php
declare(strict_types=1);
// This script produces the index lookup tables
// for a given encoding from the source data at WHATWG
$labels = [
'big5' => "big5",
'euc-jp' => "eucjp",

40
tools/mklabels.php

@ -0,0 +1,40 @@
<?php
declare(strict_types=1);
// This script reads the names and labels from each concrete
// class in the Encoding set and generates tables mapping labels
// to names and names to classes

use MensBeam\Intl\Encoding\Encoding;

define("BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";

$ns = "\\MensBeam\\Intl\\Encoding\\";
$labels = [];
$names = [];
// BASE already ends with a directory separator, so the pattern does not start with one
foreach (new \GlobIterator(BASE."lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
    $file = basename($file, ".php");
    $className = $ns.$file;
    $class = new \ReflectionClass($className);
    // only instantiable classes implementing the interface are read for NAME and LABELS
    if ($class->implementsInterface(Encoding::class) && $class->isInstantiable()) {
        $name = $class->getConstant("NAME");
        $names[$name] = $className;
        foreach ($class->getConstant("LABELS") as $label) {
            $labels[$label] = $name;
        }
    }
}

// emit the label-to-name table as a PHP constant declaration
$labelList = [];
foreach ($labels as $k => $v) {
    $labelList[] = "'$k'=>\"$v\"";
}
$labelList = "const LABEL_MAP = [".implode(",", $labelList)."];";

// emit the name-to-class table as a PHP constant declaration
$nameList = [];
foreach ($names as $k => $v) {
    $nameList[] = "'$k'=>$v::class";
}
$nameList = "const NAME_MAP = [".implode(",", $nameList)."];";

echo "$labelList\n";
echo "$nameList\n";

633
vendor-bin/csfixer/composer.lock

File diff suppressed because it is too large

2
vendor-bin/phpunit/composer.json

@ -1,5 +1,5 @@
{
"require": {
"phpunit/phpunit": "^6.5"
"phpunit/phpunit": "^8.5"
}
}

590
vendor-bin/phpunit/composer.lock

File diff suppressed because it is too large

676
vendor-bin/robo/composer.lock

File diff suppressed because it is too large
Loading…
Cancel
Save