Compare commits

...

13 Commits

  1. 20
      CHANGELOG
  2. 1
      RoboFile.php
  3. 5
      composer.json
  4. 31
      composer.lock
  5. 20
      lib/Encoding.php
  6. 38
      lib/Encoding/AbstractEncoding.php
  7. 8
      lib/Encoding/Big5.php
  8. 22
      lib/Encoding/Decoder.php
  9. 8
      lib/Encoding/EUCJP.php
  10. 9
      lib/Encoding/EUCKR.php
  11. 1
      lib/Encoding/Encoder.php
  12. 12
      lib/Encoding/GBCommon.php
  13. 93
      lib/Encoding/ISO2022JP.php
  14. 8
      lib/Encoding/Replacement.php
  15. 10
      lib/Encoding/ShiftJIS.php
  16. 11
      lib/Encoding/SingleByteEncoding.php
  17. 69
      lib/Encoding/UTF16.php
  18. 25
      lib/Encoding/UTF8.php
  19. 12
      lib/Encoding/XUserDefined.php
  20. 16
      tests/cases/Encoding/TestBig5.php
  21. 16
      tests/cases/Encoding/TestEUCJP.php
  22. 16
      tests/cases/Encoding/TestEUCKR.php
  23. 18
      tests/cases/Encoding/TestGB18030.php
  24. 16
      tests/cases/Encoding/TestISO2022JP.php
  25. 20
      tests/cases/Encoding/TestReplacement.php
  26. 16
      tests/cases/Encoding/TestShiftJIS.php
  27. 20
      tests/cases/Encoding/TestSingleByte.php
  28. 23
      tests/cases/Encoding/TestUTF16BE.php
  29. 37
      tests/cases/Encoding/TestUTF16LE.php
  30. 37
      tests/cases/Encoding/TestUTF8.php
  31. 16
      tests/cases/Encoding/TestXUserDefined.php
  32. 18
      tests/cases/TestEncoding.php
  33. 56
      tests/lib/DecoderTest.php
  34. 713
      vendor-bin/csfixer/composer.lock
  35. 2
      vendor-bin/phpunit/composer.json
  36. 1156
      vendor-bin/phpunit/composer.lock
  37. 2
      vendor-bin/robo/composer.json
  38. 1302
      vendor-bin/robo/composer.lock

20
CHANGELOG

@ -1,3 +1,23 @@
Version 0.9.2 (2023-01-25)
==========================
Bug fixes
- Define properties which were accidentally created dynamically
- Avoid use of @ operator to play nice with custom error handlers
Version 0.9.1 (2021-10-24)
==========================
Bug fixes
- Correctly skip byte order marks
- Detect byte order marks in \MensBeam\Intl\Encoding::createEncoder()
Version 0.9.0 (2021-03-25)
==========================
New features:
- Add asciiSpan() and asciiSpanNot() methods to decoders
Version 0.8.1 (2021-03-06)
==========================

1
RoboFile.php

@ -128,6 +128,7 @@ class RoboFile extends \Robo\Tasks {
}
protected function runTests(string $executor, string $set, array $args): Result {
error_reporting(0);
switch ($set) {
case "typical":
$set = ["--exclude-group", "optional"];

5
composer.json

@ -32,5 +32,10 @@
"psr-4": {
"MensBeam\\Intl\\Test\\": "tests/lib/"
}
},
"config": {
"allow-plugins": {
"bamarni/composer-bin-plugin": true
}
}
}

31
composer.lock

@ -9,29 +9,36 @@
"packages-dev": [
{
"name": "bamarni/composer-bin-plugin",
"version": "1.4.1",
"version": "1.8.2",
"source": {
"type": "git",
"url": "https://github.com/bamarni/composer-bin-plugin.git",
"reference": "9329fb0fbe29e0e1b2db8f4639a193e4f5406225"
"reference": "92fd7b1e6e9cdae19b0d57369d8ad31a37b6a880"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/9329fb0fbe29e0e1b2db8f4639a193e4f5406225",
"reference": "9329fb0fbe29e0e1b2db8f4639a193e4f5406225",
"url": "https://api.github.com/repos/bamarni/composer-bin-plugin/zipball/92fd7b1e6e9cdae19b0d57369d8ad31a37b6a880",
"reference": "92fd7b1e6e9cdae19b0d57369d8ad31a37b6a880",
"shasum": ""
},
"require": {
"composer-plugin-api": "^1.0 || ^2.0",
"php": "^5.5.9 || ^7.0 || ^8.0"
"composer-plugin-api": "^2.0",
"php": "^7.2.5 || ^8.0"
},
"require-dev": {
"composer/composer": "^1.0 || ^2.0",
"symfony/console": "^2.5 || ^3.0 || ^4.0"
"composer/composer": "^2.0",
"ext-json": "*",
"phpstan/extension-installer": "^1.1",
"phpstan/phpstan": "^1.8",
"phpstan/phpstan-phpunit": "^1.1",
"phpunit/phpunit": "^8.5 || ^9.5",
"symfony/console": "^2.8.52 || ^3.4.35 || ^4.4 || ^5.0 || ^6.0",
"symfony/finder": "^2.8.52 || ^3.4.35 || ^4.4 || ^5.0 || ^6.0",
"symfony/process": "^2.8.52 || ^3.4.35 || ^4.4 || ^5.0 || ^6.0"
},
"type": "composer-plugin",
"extra": {
"class": "Bamarni\\Composer\\Bin\\Plugin"
"class": "Bamarni\\Composer\\Bin\\BamarniBinPlugin"
},
"autoload": {
"psr-4": {
@ -53,9 +60,9 @@
],
"support": {
"issues": "https://github.com/bamarni/composer-bin-plugin/issues",
"source": "https://github.com/bamarni/composer-bin-plugin/tree/master"
"source": "https://github.com/bamarni/composer-bin-plugin/tree/1.8.2"
},
"time": "2020-05-03T08:27:20+00:00"
"time": "2022-10-31T08:38:03+00:00"
}
],
"aliases": [],
@ -69,5 +76,5 @@
"platform-dev": {
"ext-intl": "*"
},
"plugin-api-version": "2.0.0"
"plugin-api-version": "2.3.0"
}

20
lib/Encoding.php

@ -16,6 +16,8 @@ abstract class Encoding {
protected const NAME_MAP = ['Big5'=>\MensBeam\Intl\Encoding\Big5::class,'EUC-JP'=>\MensBeam\Intl\Encoding\EUCJP::class,'EUC-KR'=>\MensBeam\Intl\Encoding\EUCKR::class,'gb18030'=>\MensBeam\Intl\Encoding\GB18030::class,'GBK'=>\MensBeam\Intl\Encoding\GBK::class,'IBM866'=>\MensBeam\Intl\Encoding\IBM866::class,'ISO-2022-JP'=>\MensBeam\Intl\Encoding\ISO2022JP::class,'ISO-8859-10'=>\MensBeam\Intl\Encoding\ISO885910::class,'ISO-8859-13'=>\MensBeam\Intl\Encoding\ISO885913::class,'ISO-8859-14'=>\MensBeam\Intl\Encoding\ISO885914::class,'ISO-8859-15'=>\MensBeam\Intl\Encoding\ISO885915::class,'ISO-8859-16'=>\MensBeam\Intl\Encoding\ISO885916::class,'ISO-8859-2'=>\MensBeam\Intl\Encoding\ISO88592::class,'ISO-8859-3'=>\MensBeam\Intl\Encoding\ISO88593::class,'ISO-8859-4'=>\MensBeam\Intl\Encoding\ISO88594::class,'ISO-8859-5'=>\MensBeam\Intl\Encoding\ISO88595::class,'ISO-8859-6'=>\MensBeam\Intl\Encoding\ISO88596::class,'ISO-8859-7'=>\MensBeam\Intl\Encoding\ISO88597::class,'ISO-8859-8'=>\MensBeam\Intl\Encoding\ISO88598::class,'ISO-8859-8-I'=>\MensBeam\Intl\Encoding\ISO88598I::class,'KOI8-R'=>\MensBeam\Intl\Encoding\KOI8R::class,'KOI8-U'=>\MensBeam\Intl\Encoding\KOI8U::class,'macintosh'=>\MensBeam\Intl\Encoding\Macintosh::class,'replacement'=>\MensBeam\Intl\Encoding\Replacement::class,'Shift_JIS'=>\MensBeam\Intl\Encoding\ShiftJIS::class,'UTF-16BE'=>\MensBeam\Intl\Encoding\UTF16BE::class,'UTF-16LE'=>\MensBeam\Intl\Encoding\UTF16LE::class,'UTF-8'=>\MensBeam\Intl\Encoding\UTF8::class,'windows-1250'=>\MensBeam\Intl\Encoding\Windows1250::class,'windows-1251'=>\MensBeam\Intl\Encoding\Windows1251::class,'windows-1252'=>\MensBeam\Intl\Encoding\Windows1252::class,'windows-1253'=>\MensBeam\Intl\Encoding\Windows1253::class,'windows-1254'=>\MensBeam\Intl\Encoding\Windows1254::class,'windows-1255'=>\MensBeam\Intl\Encoding\Windows1255::class,'windows-1256'=>\MensBeam\Intl\Encoding\Windows1256::class,'windows-1257'=>\MensBeam\Intl\Encoding\Windows1257::class,'windows-1258'=>\MensBeam\Intl\Encoding\Windo
ws1258::class,'windows-874'=>\MensBeam\Intl\Encoding\Windows874::class,'x-mac-cyrillic'=>\MensBeam\Intl\Encoding\XMacCyrillic::class,'x-user-defined'=>\MensBeam\Intl\Encoding\XUserDefined::class];
/** Returns a new decoder for the specified $encodingLabel operating on $data, or null if the label is not valid
*
* If $data includes a UTF-8 or UTF-16 byte order mark, this will take precedence over the specified encoding
*
* @param string $encodingLabel One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS"
* @param string $data The string to decode
@ -25,7 +27,7 @@ abstract class Encoding {
* @see https://encoding.spec.whatwg.org#names-and-labels
*/
public static function createDecoder(string $encodingLabel, string $data, bool $fatal = false, bool $allowSurrogates = false): ?Decoder {
$encoding = self::matchLabel($encodingLabel);
$encoding = self::matchLabel(self::sniffBOM($data) ?? $encodingLabel);
if ($encoding) {
$class = $encoding['class'];
return new $class($data, $fatal, $allowSurrogates);
@ -77,4 +79,20 @@ abstract class Encoding {
return null;
}
}
/** Looks for a Unicode byte order mark at the very start of a byte stream
 *
 * Returns the encoding name implied by the mark ("UTF-8", "UTF-16BE" or
 * "UTF-16LE"), or null when the data does not begin with any of them.
 *
 * @param string $data The string to examine
 */
public static function sniffBOM(string $data): ?string {
    // marks are tested in this order; none of them is a prefix of another
    $marks = [
        "\xEF\xBB\xBF" => "UTF-8",
        "\xFE\xFF"     => "UTF-16BE",
        "\xFF\xFE"     => "UTF-16LE",
    ];
    foreach ($marks as $mark => $name) {
        if (substr($data, 0, strlen($mark)) === $mark) {
            return $name;
        }
    }
    return null;
}
}

38
lib/Encoding/AbstractEncoding.php

@ -11,6 +11,8 @@ abstract class AbstractEncoding implements Decoder {
protected const MODE_REPLACE = 1;
protected const MODE_FATAL = 2;
protected const HIGH_BYTES = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
/** @var string $string The string being decoded */
protected $string;
/** @var int $posByte The current byte position in the string */
@ -70,7 +72,7 @@ abstract class AbstractEncoding implements Decoder {
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];
$b = $this->string[$this->posByte] ?? "";
if ($b === "") {
// if the byte is end of input, simply return it
return "";
@ -162,6 +164,40 @@ abstract class AbstractEncoding implements Decoder {
}
}
/** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
 *
 * Bytes 0x80-0xFF are removed from the mask before matching, so only ASCII
 * bytes can be consumed. Each consumed byte advances both the byte position
 * and the character position by one.
 *
 * @param string $mask The set of ASCII characters to match
 * @param int|null $length The length limit handed to strspn(), or null to scan without a limit
 */
public function asciiSpan(string $mask, ?int $length = null): string {
    // discard any non-ASCII bytes from the mask
    $mask = preg_replace('/[\x80-\xFF]/s', "", $mask);
    // the length argument is only supplied to strspn() when the caller provided one
    $len = ($length !== null)
        ? strspn($this->string, $mask, $this->posByte, $length)
        : strspn($this->string, $mask, $this->posByte);
    if (!$len) {
        // nothing matched at the current position
        return "";
    }
    $out = substr($this->string, $this->posByte, $len);
    $this->posByte += $len;
    $this->posChar += $len;
    return $out;
}
/** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
 *
 * All bytes 0x80-0xFF are appended to the mask, so the span always stops at
 * the first non-ASCII byte. Each consumed byte advances both the byte
 * position and the character position by one.
 *
 * @param string $mask The set of ASCII characters to not match
 * @param int|null $length The length limit handed to strcspn(), or null to scan without a limit
 */
public function asciiSpanNot(string $mask, ?int $length = null): string {
    // high bytes always terminate the span
    $mask .= self::HIGH_BYTES;
    // the length argument is only supplied to strcspn() when the caller provided one
    $len = ($length !== null)
        ? strcspn($this->string, $mask, $this->posByte, $length)
        : strcspn($this->string, $mask, $this->posByte);
    if (!$len) {
        // the byte at the current position is excluded, or there is no more input
        return "";
    }
    $out = substr($this->string, $this->posByte, $len);
    $this->posByte += $len;
    $this->posChar += $len;
    return $out;
}
/** Returns a copy of the decoder's state to keep in memory */
protected function stateSave(): array {
$out = ['errCount' => sizeof($this->errStack)];

8
lib/Encoding/Big5.php

@ -46,7 +46,7 @@ class Big5 extends AbstractEncoding implements Coder, Decoder {
return $code;
}
$lead = 0x00;
while (($b = @$this->string[$this->posByte++]) !== "") {
while (($b = $this->string[$this->posByte++] ?? "") !== "") {
$b = ord($b);
if ($lead == 0) {
if ($b < 0x80) {
@ -127,13 +127,13 @@ class Big5 extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
$b1 = ord($this->string[--$this->posByte] ?? "");
if ($b1 < 0x40 || $b1 == 0x7F || $this->posByte === $this->errMark || $this->posByte == 0) { // these bytes never appear in sequences, a byte coming after an error is necessarily its own character, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
$b2 = ord($this->string[--$this->posByte] ?? "");
if ($b2 < 0x80) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
@ -151,7 +151,7 @@ class Big5 extends AbstractEncoding implements Coder, Decoder {
$pos = $this->posByte;
// go back bytes until an error mark, an ASCII byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
$b = ord($this->string[--$pos] ?? "");
if ($b < 0x80) {
$pos++;
break;

22
lib/Encoding/Decoder.php

@ -80,4 +80,26 @@ interface Decoder {
/** Generates an iterator which steps through each code point in the string */
public function codes(): \Generator;
/** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
 *
 * The mask must consist only of ASCII characters.
 *
 * Note that if the empty string is returned, this does not necessarily signal the end of the string
 *
 * @param string $mask The set of ASCII characters to match
 * @param int|null $length The maximum number of characters to advance by, or null for no limit
 */
public function asciiSpan(string $mask, ?int $length = null): string;
/** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
 *
 * The mask must consist only of ASCII characters.
 *
 * Note that if the empty string is returned, this does not necessarily signal the end of the string
 *
 * @param string $mask The set of ASCII characters to not match
 * @param int|null $length The maximum number of characters to advance by, or null for no limit
 */
public function asciiSpanNot(string $mask, ?int $length = null): string;
}

8
lib/Encoding/EUCJP.php

@ -30,7 +30,7 @@ class EUCJP extends AbstractEncoding implements Coder, Decoder {
$this->posChar++;
$lead = 0x00;
$jis0212 = false;
while (($b = @$this->string[$this->posByte++]) !== "") {
while (($b = $this->string[$this->posByte++] ?? "") !== "") {
$b = ord($b);
if ($lead == 0) {
if ($b < 0x80) {
@ -116,14 +116,14 @@ class EUCJP extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
$b1 = ord($this->string[--$this->posByte] ?? "");
// if the byte is an ASCII byte or the first byte in the string, this is a character
if ($b1 < 0x80 || $this->posByte === 0) { // ASCII bytes are always isolate in EUC-JP
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
$b2 = ord($this->string[--$this->posByte] ?? "");
if ($b2 === 0x8E) { // JIS X 0201 character
// the two bytes form a character
continue;
@ -132,7 +132,7 @@ class EUCJP extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back a third byte
$b3 = ord(@$this->string[--$this->posByte]);
$b3 = ord($this->string[--$this->posByte] ?? "");
if ($b3 === 0x8F) { // JIS X 0212 character
// the three bytes form a character
continue;

9
lib/Encoding/EUCKR.php

@ -24,11 +24,12 @@ class EUCKR extends AbstractEncoding implements Coder, Decoder {
/** @var array $pointerCache A cached result of flipping the pointer-to-code-point table */
protected static $pointerCache;
protected $dirtyEOF = 0;
public function nextCode() {
$this->posChar++;
$lead = 0x00;
while (($b = @$this->string[$this->posByte++]) !== "") {
while (($b = $this->string[$this->posByte++] ?? "") !== "") {
$b = ord($b);
if ($lead == 0) {
if ($b < 0x80) {
@ -96,13 +97,13 @@ class EUCKR extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
$b1 = ord($this->string[--$this->posByte] ?? "");
if ($b1 < 0x41 || $this->posByte === $this->errMark || $this->posByte == 0) { // these bytes never appear in sequences, a byte coming after an error is necessarily its own character, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
$b2 = ord($this->string[--$this->posByte] ?? "");
if ($b2 < 0x80) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
@ -115,7 +116,7 @@ class EUCKR extends AbstractEncoding implements Coder, Decoder {
$pos = $this->posByte;
// go back bytes until an error mark, an ASCII byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
$b = ord($this->string[--$pos] ?? "");
if ($b < 0x80) {
$pos++;
break;

1
lib/Encoding/Encoder.php

@ -11,6 +11,7 @@ use MensBeam\Intl\Encoding as Matcher;
class Encoder {
protected $name;
protected $fatal = true;
protected $mode = null;
/** Constructs a new encoder for the specified $label
*

12
lib/Encoding/GBCommon.php

@ -20,7 +20,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
$second = 0;
$third = 0;
$this->posChar++;
while (($b = @$this->string[$this->posByte++]) !== "") {
while (($b = $this->string[$this->posByte++] ?? "") !== "") {
$b = ord($b);
if ($first === 0) {
if ($b < 0x80) {
@ -148,7 +148,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
$b1 = ord($this->string[--$this->posByte] ?? "");
if ($b1 > 0x80) { // only GBK characters end in high bytes
// the preceding byte starts the character
$this->posByte--;
@ -165,10 +165,10 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back a second byte
$b2 = ord(@$this->string[$this->posByte - 1]);
$b2 = ord($this->string[$this->posByte - 1] ?? "");
if ($b2 > 0x80) {
// go back a third byte
$b3 = ord(@$this->string[$this->posByte - 2]);
$b3 = ord($this->string[$this->posByte - 2] ?? "");
if ($b3 >= 0x30 && $b3 <= 0x39) {
// the next byte starts the character
$this->posByte -= 3;
@ -179,7 +179,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
continue;
} else { // this can either be the trail of a two-byte GBK character, or a single-byte character
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
$b2 = ord($this->string[--$this->posByte] ?? "");
if ($b2 < 0x81) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
@ -189,7 +189,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
$pos = $this->posByte;
// go back bytes until an error mark, an ASCII byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
$b = ord($this->string[--$pos] ?? "");
if ($b < 0x81) {
$pos++;
break;

93
lib/Encoding/ISO2022JP.php

@ -57,7 +57,7 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
$this->posChar++;
$state = $this->mode;
while (true) {
$b = @$this->string[$this->posByte++];
$b = $this->string[$this->posByte++] ?? "";
$eof = ($b === "");
$b = ord($b);
// unify handling of basic states where possible
@ -184,6 +184,97 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
return $distance;
}
/** Fast-forwards through ASCII characters matching $mask, returning the consumed characters
*
* Matching only happens while the decoder is in ASCII or Roman mode. When the
* span stops at an escape sequence which switches to ASCII or Roman mode and
* the byte after that sequence is wanted, the switch and that character are
* consumed via nextChar() and scanning resumes from the top.
*
* @param string $mask The set of ASCII characters to match
* @param int $length The maximum number of characters to advance by, or null for no limit
*/
public function asciiSpan(string $mask, int $length = null): string {
$out = "";
// -1 stands in for "no limit": it stays non-zero however often it is decremented
$left = ($length === null) ? -1 : $length;
Process:
if ($this->mode === self::KATAKANA_STATE || $this->mode === self::LEAD_BYTE_STATE) {
// these modes will always return an empty span
} else {
// build the pattern of bytes which may never be matched in the current mode:
// 0x0E, 0x0F, 0x1B and all high bytes are always excluded; Roman mode also excludes 0x5C and 0x7E
// NOTE(review): $exc stays unset if the mode is neither ASCII nor Roman here — presumably no other mode can be current between characters; confirm
if ($this->mode === self::ASCII_STATE) {
$exc = '/[\x0E\x0F\x1B\x80-\xFF]/s';
} elseif ($this->mode === self::ROMAN_STATE) {
$exc = '/[\x0E\x0F\x1B\x5C\x7E\x80-\xFF]/s';
}
$effectiveMask = preg_replace($exc, "", $mask);
// when a limit was requested, scan at most the remaining allowance
if ($length !== null) {
$len = strspn($this->string, $effectiveMask, $this->posByte, $left);
} else {
$len = strspn($this->string, $effectiveMask, $this->posByte);
}
if ($len) {
// every matched byte is one character, so both positions advance together
$out .= substr($this->string, $this->posByte, $len);
$this->posByte += $len;
$this->posChar += $len;
$left -= $len;
}
}
// check if the current position has a mode change to ASCII or Roman modes and is followed by a desired character
if ($left && ($this->string[$this->posByte] ?? "") === "\x1B") {
// the two bytes after the escape byte select the exclusions: 0x28 0x42 uses the ASCII set, 0x28 0x4A the Roman set
if (substr($this->string, $this->posByte + 1, 2) === "\x28\x42") {
$exc = '/[\x0E\x0F\x1B\x80-\xFF]/s';
} elseif (substr($this->string, $this->posByte + 1, 2) === "\x28\x4A") {
$exc = '/[\x0E\x0F\x1B\x5C\x7E\x80-\xFF]/s';
} else {
// any other escape sequence ends the span
return $out;
}
$effectiveMask = preg_replace($exc, "", $mask);
// if the byte after the mode switch is a wanted one, consume it and go back to the start
if (strspn($this->string[$this->posByte + 3] ?? "", $effectiveMask)) {
// nextChar() is defined outside this view; presumably it processes the escape sequence and updates the mode
$out .= $this->nextChar();
if (--$left) {
goto Process;
}
}
}
return $out;
}
/** Fast-forwards through ASCII characters not matching $mask, returning the consumed characters
*
* Matching only happens while the decoder is in ASCII or Roman mode. When the
* span stops at an escape sequence which switches to ASCII or Roman mode and
* the byte after that sequence is not excluded, the switch and that character
* are consumed via nextChar() and scanning resumes from the top.
*
* @param string $mask The set of ASCII characters to not match
* @param int $length The maximum number of characters to advance by, or null for no limit
*/
public function asciiSpanNot(string $mask, int $length = null): string {
// bytes 0x80-0xFF always end the span
$mask .= self::HIGH_BYTES;
$out = "";
// -1 stands in for "no limit": it stays non-zero however often it is decremented
$left = ($length === null) ? -1 : $length;
Process:
if ($this->mode === self::KATAKANA_STATE || $this->mode === self::LEAD_BYTE_STATE) {
// these modes will always return an empty span
} else {
// extend the mask with the bytes which must always stop the span in the current mode:
// 0x0E, 0x0F and 0x1B in both modes, plus 0x5C and 0x7E in Roman mode
// NOTE(review): $effectiveMask stays unset if the mode is neither ASCII nor Roman here — presumably no other mode can be current between characters; confirm
if ($this->mode === self::ASCII_STATE) {
$effectiveMask = $mask."\x0E\x0F\x1B";
} elseif ($this->mode === self::ROMAN_STATE) {
$effectiveMask = $mask."\x0E\x0F\x1B\x5C\x7E";
}
// when a limit was requested, scan at most the remaining allowance
if ($length !== null) {
$len = strcspn($this->string, $effectiveMask, $this->posByte, $left);
} else {
$len = strcspn($this->string, $effectiveMask, $this->posByte);
}
if ($len) {
// every consumed byte is one character, so both positions advance together
$out .= substr($this->string, $this->posByte, $len);
$this->posByte += $len;
$this->posChar += $len;
$left -= $len;
}
}
// check if the current position has a mode change to ASCII or Roman modes and is followed by a desired character
if ($left && ($this->string[$this->posByte] ?? "") === "\x1B") {
// the two bytes after the escape byte select the exclusions: 0x28 0x42 uses the ASCII set, 0x28 0x4A the Roman set
if (substr($this->string, $this->posByte + 1, 2) === "\x28\x42") {
$effectiveMask = $mask."\x0E\x0F\x1B";
} elseif (substr($this->string, $this->posByte + 1, 2) === "\x28\x4A") {
$effectiveMask = $mask."\x0E\x0F\x1B\x5C\x7E";
} else {
// any other escape sequence ends the span
return $out;
}
// if the byte after the mode switch is a wanted one, consume it and go back to the start
if (strcspn($this->string[$this->posByte + 3] ?? "", $effectiveMask)) {
// nextChar() is defined outside this view; presumably it processes the escape sequence and updates the mode
$out .= $this->nextChar();
if (--$left) {
goto Process;
}
}
}
return $out;
}
protected function stateSave(): array {
$out = parent::stateSave();
$out['modeCount'] = sizeof($this->modeStack);

8
lib/Encoding/Replacement.php

@ -122,4 +122,12 @@ class Replacement implements Decoder {
yield 0 => $this->nextCode();
}
}
/** Always returns the empty string: this decoder never consumes an ASCII span
 *
 * @param string $mask The set of ASCII characters to match (unused)
 * @param int|null $length The maximum number of characters to advance by (unused)
 */
public function asciiSpan(string $mask, ?int $length = null): string {
    return "";
}
/** Always returns the empty string: this decoder never consumes an ASCII span
 *
 * @param string $mask The set of ASCII characters to not match (unused)
 * @param int|null $length The maximum number of characters to advance by (unused)
 */
public function asciiSpanNot(string $mask, ?int $length = null): string {
    return "";
}
}

10
lib/Encoding/ShiftJIS.php

@ -25,7 +25,7 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
protected static $pointerCache;
public function nextCode() {
if (($b = @$this->string[$this->posByte++]) === "") {
if (($b = $this->string[$this->posByte++] ?? "") === "") {
// clean EOF
$this->posByte--;
return false;
@ -38,7 +38,7 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
return 0xFF61 - 0xA1 + $b;
} elseif (($b >= 0x81 && $b <= 0x9F) || ($b >= 0xE0 && $b <= 0xFC)) {
$lead = $b;
if (($b = @$this->string[$this->posByte++]) === "") {
if (($b = $this->string[$this->posByte++] ?? "") === "") {
// dirty EOF
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
}
@ -121,13 +121,13 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
$b1 = ord($this->string[--$this->posByte] ?? "");
if ($b1 < 0x40 || $b1 > 0xFC || $b1 === 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
$b2 = ord($this->string[--$this->posByte] ?? "");
if ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
@ -140,7 +140,7 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
$pos = $this->posByte;
// go back bytes until an error mark, a definite byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
$b = ord($this->string[--$pos] ?? "");
if ($b < 0x81 || ($b >= 0xA0 && $b <= 0xDF) || $b > 0xFC) {
$pos++;
break;

11
lib/Encoding/SingleByteEncoding.php

@ -11,11 +11,12 @@ abstract class SingleByteEncoding extends AbstractEncoding implements Coder, Dec
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posChar];
$b = $this->string[$this->posChar] ?? "";
if ($b === "") {
return "";
}
$this->posChar++;
$this->posByte++;
$p = ord($b);
if ($p < 0x80) {
// if the byte is an ASCII character or end of input, simply return it
@ -27,11 +28,12 @@ abstract class SingleByteEncoding extends AbstractEncoding implements Coder, Dec
public function nextCode() {
// get the byte at the current position
$b = @$this->string[$this->posChar];
$b = $this->string[$this->posChar] ?? "";
if ($b === "") {
return false;
}
$this->posChar++;
$this->posByte++;
$p = ord($b);
if ($p < 0x80) {
// if the byte is an ASCII character or end of input, simply return it
@ -62,6 +64,7 @@ abstract class SingleByteEncoding extends AbstractEncoding implements Coder, Dec
$distance = abs($distance);
while ($this->posChar > 0 && $distance > 0) {
$this->posChar--;
$this->posByte--;
$distance--;
}
return $distance;
@ -76,10 +79,6 @@ abstract class SingleByteEncoding extends AbstractEncoding implements Coder, Dec
return 0;
}
public function posByte(): int {
return $this->posChar;
}
public function lenChar(): int {
return $this->lenByte;
}

69
lib/Encoding/UTF16.php

@ -9,17 +9,28 @@ namespace MensBeam\Intl\Encoding;
abstract class UTF16 extends AbstractEncoding {
protected $selfSynchronizing = true;
protected $dirtyEOF = 0;
/** @var int The size of the string's byte order mark, if any */
protected $BOM = 0;
/** Sets up the decoder and positions it after a leading byte order mark, if there is one
 *
 * Only the mark belonging to this decoder's own byte order is recognized:
 * FE FF when static::BE is truthy, FF FE otherwise.
 */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
    // "dirtyEOF" is registered before the parent constructor runs
    $this->stateProps[] = "dirtyEOF";
    parent::__construct($string, $fatal, $allowSurrogates);
    $mark = static::BE ? "\xFE\xFF" : "\xFF\xFE";
    if (substr($string, 0, 2) !== $mark) {
        // no byte order mark: leave the position as the parent set it
        return;
    }
    // remember the size of the mark and start decoding just past it
    $this->BOM = 2;
    $this->posByte = 2;
}
/** Rewinds the decoder, then moves the byte position past the byte order mark (if any) recorded at construction */
public function rewind(): void {
parent::rewind();
// $this->BOM is 0 when the string has no byte order mark, otherwise the size of the mark in bytes
$this->posByte = $this->BOM;
}
public function nextCode() {
$lead_b = null;
$lead_s = null;
$this->posChar++;
while (($b = @$this->string[$this->posByte++]) !== "") {
while (($b = $this->string[$this->posByte++] ?? "") !== "") {
$b = ord($b);
if (is_null($lead_b)) {
$lead_b = $b;
@ -74,7 +85,7 @@ abstract class UTF16 extends AbstractEncoding {
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];
$b = $this->string[$this->posByte] ?? "";
if ($b === "") {
// if the byte is end of input, simply return it
return "";
@ -84,6 +95,58 @@ abstract class UTF16 extends AbstractEncoding {
}
}
/** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
 *
 * The returned string holds the matched ASCII bytes themselves (one byte per
 * character), not their two-byte UTF-16 form.
 *
 * @param string $mask The set of ASCII characters to match
 * @param int|null $length The maximum number of characters to advance by, or null for no limit
 */
public function asciiSpan(string $mask, ?int $length = null): string {
    // UTF-16 has no single-byte ASCII characters, so each two-byte code unit is examined in turn
    $out = "";
    // -1 stands in for "no limit": it stays non-zero however often it is decremented
    $left = ($length === null) ? -1 : $length;
    while ($left) {
        $c1 = $this->string[$this->posByte] ?? "";
        $c2 = $this->string[$this->posByte + 1] ?? "";
        if ($c1 === "" || $c2 === "") {
            // end of input or a truncated code unit; this is checked before
            // strpos() because an empty needle raises a warning prior to PHP 8
            break;
        }
        $high = static::BE ? $c1 : $c2;
        $low = static::BE ? $c2 : $c1;
        if (ord($high) !== 0 || ord($low) >= 0x80 || strpos($mask, $low) === false) {
            // not an ASCII character, or not one of the wanted characters
            break;
        }
        $out .= $low;
        $this->posByte += 2;
        $this->posChar++;
        $left--;
    }
    return $out;
}
/** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
 *
 * The returned string holds the consumed ASCII bytes themselves (one byte per
 * character), not their two-byte UTF-16 form.
 *
 * @param string $mask The set of ASCII characters to not match
 * @param int|null $length The maximum number of characters to advance by, or null for no limit
 */
public function asciiSpanNot(string $mask, ?int $length = null): string {
    // UTF-16 has no single-byte ASCII characters, so each two-byte code unit is examined in turn
    $out = "";
    // -1 stands in for "no limit": it stays non-zero however often it is decremented
    $left = ($length === null) ? -1 : $length;
    while ($left) {
        $c1 = $this->string[$this->posByte] ?? "";
        $c2 = $this->string[$this->posByte + 1] ?? "";
        if ($c1 === "" || $c2 === "") {
            // end of input or a truncated code unit; this is checked before
            // strpos() because an empty needle raises a warning prior to PHP 8
            break;
        }
        $high = static::BE ? $c1 : $c2;
        $low = static::BE ? $c2 : $c1;
        if (ord($high) !== 0 || ord($low) >= 0x80 || strpos($mask, $low) !== false) {
            // not an ASCII character, or one of the excluded characters
            break;
        }
        $out .= $low;
        $this->posByte += 2;
        $this->posChar++;
        $left--;
    }
    return $out;
}
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
if ($this->dirtyEOF && $distance) {
@ -92,7 +155,7 @@ abstract class UTF16 extends AbstractEncoding {
$this->posByte -= $this->dirtyEOF;
$this->dirtyEOF = 0;
}
while ($distance > 0 && $this->posByte > 0) {
while ($distance > 0 && $this->posChar > 0) {
$distance--;
$this->posChar--;
if ($this->posByte < 4) {

25
lib/Encoding/UTF8.php

@ -18,11 +18,26 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder {
];
protected $selfSynchronizing = true;
/** @var int The size of the string's byte order mark, if any */
protected $BOM = 0;
/** Sets up the decoder and positions it after a leading UTF-8 byte order mark (EF BB BF), if there is one */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
    parent::__construct($string, $fatal, $allowSurrogates);
    if (substr($string, 0, 3) !== "\xEF\xBB\xBF") {
        // no byte order mark: leave the position as the parent set it
        return;
    }
    // remember the size of the mark and start decoding just past it
    $this->BOM = 3;
    $this->posByte = 3;
}
/** Rewinds the decoder, then moves the byte position past the byte order mark (if any) recorded at construction */
public function rewind(): void {
parent::rewind();
// $this->BOM is 0 when the string has no byte order mark, otherwise the size of the mark in bytes
$this->posByte = $this->BOM;
}
public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// optimization for ASCII characters
$b = @$this->string[$this->posByte];
$b = $this->string[$this->posByte] ?? "";
if ($b === "") {
return false;
} elseif (($b = ord($b)) < 0x80) {
@ -37,7 +52,7 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder {
$lower = 0x80;
$upper = 0xBF;
while ($seen < $needed) {
$b = ord(@$this->string[$this->posByte++]);
$b = ord($this->string[$this->posByte++] ?? "");
if (!$seen) {
if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
$needed = 2;
@ -99,10 +114,10 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder {
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
while ($distance > 0 && $this->posByte > 0) {
while ($distance > 0 && $this->posChar > 0) {
$distance--;
$this->posChar--;
$b = ord(@$this->string[$this->posByte - 1]);
$b = ord($this->string[$this->posByte - 1] ?? "");
if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
$this->posByte--;
@ -110,7 +125,7 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder {
$s = $this->posByte;
$pos = $s - 1;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 4) { // go back at most four bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]);
$b = ord($this->string[--$pos] ?? "");
}
$this->posByte = $pos;
// decrement the character position because nextCode() increments it

12
lib/Encoding/XUserDefined.php

@ -16,11 +16,12 @@ class XUserDefined extends AbstractEncoding implements Coder, Decoder {
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posChar];
$b = $this->string[$this->posChar] ?? "";
if ($b === "") {
return "";
}
$this->posChar++;
$this->posByte++;
$p = ord($b);
if ($p < 0x80) {
// if the byte is an ASCII character or end of input, simply return it
@ -38,11 +39,12 @@ class XUserDefined extends AbstractEncoding implements Coder, Decoder {
*/
public function nextCode() {
// get the byte at the current position
$b = @$this->string[$this->posChar];
$b = $this->string[$this->posChar] ?? "";
if ($b === "") {
return false;
}
$this->posChar++;
$this->posByte++;
$p = ord($b);
if ($p < 0x80) {
// if the byte is an ASCII character or end of input, simply return it
@ -69,6 +71,7 @@ class XUserDefined extends AbstractEncoding implements Coder, Decoder {
$distance = abs($distance);
while ($this->posChar > 0 && $distance > 0) {
$this->posChar--;
$this->posByte--;
$distance--;
}
return $distance;
@ -95,11 +98,6 @@ class XUserDefined extends AbstractEncoding implements Coder, Decoder {
return 0;
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posChar;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string

16
tests/cases/Encoding/TestBig5.php

@ -27,6 +27,8 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 3, 5, 5, 7, 7, 9];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A D7AA A4F4 30 39";
public function provideCodePoints() {
return [
@ -195,6 +197,20 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\Big5::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\Big5::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

16
tests/cases/Encoding/TestEUCJP.php

@ -27,6 +27,8 @@ class TestEUCJP extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 3, 5, 7, 8, 10, 13];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A 8EDB 8FB0EF 30 39";
public function provideCodePoints() {
return [
@ -206,6 +208,20 @@ class TestEUCJP extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\EUCJP::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\EUCJP::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

16
tests/cases/Encoding/TestEUCKR.php

@ -27,6 +27,8 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 3, 4, 6, 8, 10, 11];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A E2A9 A5C1 30 39";
public function provideCodePoints() {
return [
@ -185,6 +187,20 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\EUCKR::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\EUCKR::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

18
tests/cases/Encoding/TestGB18030.php

@ -23,11 +23,13 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
Char 6 U+FFFE (4 bytes) Offset 19
End of string at char 7, offset 23
*/
protected $seekString = "7A 81 30 84 34 CB AE 94 32 BE 34 84 30 81 30 E3 32 9A 33 84 31 A4 38";
protected $seekString = "7A 81308434 CBAE 9432BE34 84308130 E3329A33 8431A438";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 1, 5, 7, 11, 15, 19, 23];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A 81308434 CBAE 30 39";
public function tearDown(): void {
$this->testedClass = GB18030::class;
@ -274,6 +276,20 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\GB18030::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\GB18030::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

16
tests/cases/Encoding/TestISO2022JP.php

@ -31,6 +31,8 @@ class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 5, 6, 11, 13, 15, 19];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "1B284A 41 5A 1B2849 5C 5F 1B2842 30 39";
public function provideCodePoints() {
return [
@ -243,6 +245,20 @@ class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

20
tests/cases/Encoding/TestReplacement.php

@ -198,4 +198,24 @@ class TestReplacement extends \MensBeam\Intl\Test\DecoderTest {
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\Replacement::asciiSpan
*/
public function testExtractAsciiSpans() {
$d = new Replacement("VVVVVV");
$this->assertSame("", $d->asciiSpan($this->allBytes()));
$d->nextChar();
$this->assertTrue($d->eof());
}
/**
* @covers MensBeam\Intl\Encoding\Replacement::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
$d = new Replacement("VVVVVV");
$this->assertSame("", $d->asciiSpanNot(""));
$d->nextChar();
$this->assertTrue($d->eof());
}
}

16
tests/cases/Encoding/TestShiftJIS.php

@ -27,6 +27,8 @@ class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 2, 4, 6, 7, 8, 10];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A D6 82E6 30 39";
public function provideCodePoints() {
return [
@ -198,6 +200,20 @@ class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\ShiftJIS::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\ShiftJIS::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
/**
* @group optional
*/

20
tests/cases/Encoding/TestSingleByte.php

@ -81,6 +81,8 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 2, 3, 4, 5, 6, 7];
/* This string is supposed to contain an invalid character sequence sandwiched between two null characters; this is different for each single-byte encoding (and many do not have invalid characters) */
protected $brokenChar = "";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A 80 FF 30 39";
/**
* @dataProvider provideCodePoints
@ -234,6 +236,24 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @dataProvider provideClasses
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::asciiSpan
*/
public function testExtractAsciiSpans($class = null) {
$this->testedClass = $class;
parent::testExtractAsciiSpans();
}
/**
* @dataProvider provideClasses
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans($class = null) {
$this->testedClass = $class;
parent::testExtractNegativeAsciiSpans();
}
public function provideClasses() {
foreach (self::$classes as $name => $class) {
yield $name => [$class];

23
tests/cases/Encoding/TestUTF16BE.php

@ -11,20 +11,23 @@ use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16BE extends TestUTF16LE {
protected $testedClass = UTF16BE::class;
/*
Char 0 U+007A (2 byte) Offset 0
Char 1 U+00A2 (2 bytes) Offset 2
Char 2 U+6C34 (2 bytes) Offset 4
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (2 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 12
Char 6 U+FFFE (2 bytes) Offset 16
End of string at char 7, offset 18
Byte Order Mark (2 bytes) Offset 0
Char 0 U+007A (2 bytes) Offset 2
Char 1 U+00A2 (2 bytes) Offset 4
Char 2 U+6C34 (2 bytes) Offset 6
Char 3 U+1D11E (4 bytes) Offset 8
Char 4 U+F8FF (2 bytes) Offset 12
Char 5 U+10FFFD (4 bytes) Offset 14
Char 6 U+FFFE (2 bytes) Offset 18
End of string at char 7, offset 20
*/
protected $seekString = "007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
protected $seekString = "FEFF 007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];
protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "0000 DC00 0000";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "0041 005A 6C34 D834DD1E 0030 0039";
protected $lowerA = "\x00a";
public function provideStrings() {

37
tests/cases/Encoding/TestUTF16LE.php

@ -11,20 +11,23 @@ use MensBeam\Intl\Encoding\UTF16LE;
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
protected $testedClass = UTF16LE::class;
/*
Char 0 U+007A (2 byte) Offset 0
Char 1 U+00A2 (2 bytes) Offset 2
Char 2 U+6C34 (2 bytes) Offset 4
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (2 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 12
Char 6 U+FFFE (2 bytes) Offset 16
End of string at char 7, offset 18
Byte Order Mark (2 bytes) Offset 0
Char 0 U+007A (2 bytes) Offset 2
Char 1 U+00A2 (2 bytes) Offset 4
Char 2 U+6C34 (2 bytes) Offset 6
Char 3 U+1D11E (4 bytes) Offset 8
Char 4 U+F8FF (2 bytes) Offset 12
Char 5 U+10FFFD (4 bytes) Offset 14
Char 6 U+FFFE (2 bytes) Offset 18
End of string at char 7, offset 20
*/
protected $seekString = "7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
protected $seekString = "FFFE 7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];
protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "0000 00DC 0000";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "4100 5A00 346C 34D81EDD 3000 3900";
protected $lowerA = "a\x00";
/**
@ -133,6 +136,20 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\UTF16::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\UTF16::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
public function provideStrings() {
return [
// control samples

37
tests/cases/Encoding/TestUTF8.php

@ -13,20 +13,23 @@ use MensBeam\Intl\Encoding\EncoderException;
class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $testedClass = UTF8::class;
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+00A2 (2 bytes) Offset 1
Char 2 U+6C34 (3 bytes) Offset 3
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (3 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 13
Char 6 U+FFFE (3 bytes) Offset 17
End of string at char 7, offset 20
Byte Order Mark (3 bytes) Offset 0
Char 0 U+007A (1 byte) Offset 3
Char 1 U+00A2 (2 bytes) Offset 4
Char 2 U+6C34 (3 bytes) Offset 6
Char 3 U+1D11E (4 bytes) Offset 9
Char 4 U+F8FF (3 bytes) Offset 13
Char 5 U+10FFFD (4 bytes) Offset 16
Char 6 U+FFFE (3 bytes) Offset 20
End of string at char 7, offset 23
*/
protected $seekString = "7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE";
protected $seekString = "EFBBBF 7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 1, 3, 6, 10, 13, 17, 20];
protected $seekOffsets = [3, 4, 6, 9, 13, 16, 20, 23];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A E6B0B4 F09D849E 30 39";
public function provideCodePoints() {
return [
@ -224,4 +227,18 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\UTF8::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\UTF8::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
}

16
tests/cases/Encoding/TestXUserDefined.php

@ -18,6 +18,8 @@ class TestXUserDefined extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $seekOffsets = [0, 1, 2, 3, 4, 5, 6, 7];
/* This string is supposed to contain an invalid character sequence sandwiched between two null characters, but x-user-defined has no invalid characters */
protected $brokenChar = "";
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
protected $spanString = "41 5A 80 FF 30 39";
public function provideCodePoints() {
return [
@ -183,4 +185,18 @@ class TestXUserDefined extends \MensBeam\Intl\Test\CoderDecoderTest {
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
/**
* @covers MensBeam\Intl\Encoding\XUserDefined::asciiSpan
*/
public function testExtractAsciiSpans() {
parent::testExtractAsciiSpans();
}
/**
* @covers MensBeam\Intl\Encoding\XUserDefined::asciiSpanNot
*/
public function testExtractNegativeAsciiSpans() {
parent::testExtractNegativeAsciiSpans();
}
}

18
tests/cases/TestEncoding.php

@ -8,6 +8,9 @@ namespace MensBeam\Intl\TestCase;
use MensBeam\Intl\Encoding;
use MensBeam\Intl\Encoding\Encoder;
use MensBeam\Intl\Encoding\UTF16BE;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF8;
class TestEncoding extends \PHPUnit\Framework\TestCase {
/** @dataProvider provideLabelData */
@ -28,6 +31,11 @@ class TestEncoding extends \PHPUnit\Framework\TestCase {
$this->assertInstanceOf($data['class'], Encoding::createDecoder(" $label\n\n\r\t", ""));
}
/** @dataProvider provideBOMSniffings */
public function testCreateADecoderWhileSniffingBOM(string $label, string $string, string $class) {
$this->assertInstanceOf($class, Encoding::createDecoder($label, $string));
}
public function testFailToCreateADecoderFromALabel() {
$this->assertNull(Encoding::createDecoder("Not a label", ""));
}
@ -71,4 +79,14 @@ class TestEncoding extends \PHPUnit\Framework\TestCase {
yield [(string) $label, ['label' => (string) $label, 'name' => $name, 'class' => $class, 'encoder' => $encoder]];
}
}
public function provideBOMSniffings() {
return [
'No BOM' => ["UTF-8", "Hello world!", UTF8::class],
'UTF-8 BOM' => ["Shift_JIS", "\xEF\xBB\xBFA", UTF8::class],
'UTF-16BE BOM' => ["UTF-8", "\xFE\xFF\x00A", UTF16BE::class],
'UTF-16LE BOM' => ["UTF-8", "\xFF\xFEA\x00", UTF16LE::class],
'GB18030 BOM' => ["UTF-8", "\x84\x31\x95\x33A", UTF8::class],
];
}
}

56
tests/lib/DecoderTest.php

@ -8,6 +8,9 @@ namespace MensBeam\Intl\Test;
use MensBeam\Intl\Encoding\DecoderException;
use MensBeam\Intl\Encoding\ISO2022JP;
use MensBeam\Intl\Encoding\UTF16BE;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF8;
abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjGrZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k=";
@ -72,16 +75,22 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$input = $this->prepString($this->seekString);
$off = $this->seekOffsets;
$s = new $class($input);
$bom = [
UTF8::class => 3,
UTF16BE::class => 2,
UTF16LE::class => 2,
][$this->testedClass] ?? 0;
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame($bom, $s->posByte());
$this->assertSame(0, $s->seek(0));
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame($bom, $s->posByte());
$this->assertSame(1, $s->seek(-1));
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame($bom, $s->posByte());
$this->assertSame(0, $s->seek(1));
$this->assertSame(1, $s->posChar());
@ -109,15 +118,15 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$this->assertSame(6, $s->seek(-10));
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame($bom, $s->posByte());
$this->assertSame(0, $s->seek(5));
$this->assertSame(5, $s->posChar());
$this->assertSame($off[5], $s->posByte());
$s->rewind(0);
$s->rewind();
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame($bom, $s->posByte());
}
public function testTraversePastTheEndOfAString() {
@ -354,7 +363,42 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$this->assertSame(sizeof($exp), $a);
}
public function testExtractAsciiSpans() {
$allBytes = $this->allBytes();
$class = $this->testedClass;
$d = new $class($this->prepString($this->spanString));
$this->assertSame("", $d->asciiSpan("az"));
$this->assertSame("A", $d->asciiSpan("AZ", 1));
$this->assertSame("Z", $d->asciiSpan("AZ"));
$this->assertSame("", $d->asciiSpan($allBytes));
$d->nextChar();
$this->assertSame("", $d->asciiSpan($allBytes));
$d->nextChar();
$this->assertSame("09", $d->asciiSpan($allBytes));
}
public function testExtractNegativeAsciiSpans() {
$class = $this->testedClass;
$d = new $class($this->prepString($this->spanString));
$this->assertSame("", $d->asciiSpanNot("AZ"));
$this->assertSame("A", $d->asciiSpanNot("az", 1));
$this->assertSame("Z", $d->asciiSpanNot("az"));
$this->assertSame("", $d->asciiSpanNot(""));
$d->nextChar();
$this->assertSame("", $d->asciiSpanNot(""));
$d->nextChar();
$this->assertSame("09", $d->asciiSpanNot(""));
}
protected function prepString(string $str): string {
return hex2bin(str_replace(" ", "", $str));
}
protected function allBytes(): string {
$out = "";
for ($a = 0x00; $a <= 0xFF; $a++) {
$out .= chr($a);
}
return $out;
}
}

713
vendor-bin/csfixer/composer.lock

File diff suppressed because it is too large

2
vendor-bin/phpunit/composer.json

@ -1,5 +1,5 @@
{
"require": {
"phpunit/phpunit": "^8.5"
"phpunit/phpunit": "^8.5 | ^9.0"
}
}

1156
vendor-bin/phpunit/composer.lock

File diff suppressed because it is too large

2
vendor-bin/robo/composer.json

@ -1,5 +1,5 @@
{
"require": {
"consolidation/robo": "^1.1"
"consolidation/robo": "^4.0"
}
}

1302
vendor-bin/robo/composer.lock

File diff suppressed because it is too large
Loading…
Cancel
Save