Browse Source

Optionally allow surrogates

Also removed unnecessary docblocks
span 0.6.0
J. King 4 years ago
parent
commit
200a310f72
  1. 7
      CHANGELOG
  2. 15
      lib/Encoding/Big5.php
  3. 16
      lib/Encoding/EUCKR.php
  4. 22
      lib/Encoding/Encoding.php
  5. 19
      lib/Encoding/GBCommon.php
  6. 39
      lib/Encoding/GenericEncoding.php
  7. 28
      lib/Encoding/SingleByteEncoding.php
  8. 19
      lib/Encoding/UTF16.php
  9. 16
      lib/Encoding/UTF8.php
  10. 8
      tests/cases/Encoding/TestBig5.php
  11. 8
      tests/cases/Encoding/TestEUCKR.php
  12. 8
      tests/cases/Encoding/TestGB18030.php
  13. 9
      tests/cases/Encoding/TestSingleByte.php
  14. 8
      tests/cases/Encoding/TestUTF16BE.php
  15. 17
      tests/cases/Encoding/TestUTF16LE.php
  16. 15
      tests/cases/Encoding/TestUTF8.php
  17. 8
      tests/cases/Encoding/TestXUserDefined.php
  18. 12
      tests/lib/DecoderTest.php

7
CHANGELOG

@ -1,3 +1,10 @@
Version 0.6.0 (2019-12-18)
==========================
New features:
- Added $allowSurrogates parameter to Encoding constructor
- Added posErr public instance property to Encoding
Version 0.5.0 (2019-12-13)
==========================

15
lib/Encoding/Big5.php

@ -24,12 +24,6 @@ class Big5 implements StatelessEncoding {
protected $bufferedCode = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$this->posChar++;
if ($this->bufferedCode > 0) {
@ -70,8 +64,10 @@ class Big5 implements StatelessEncoding {
return $code;
} else {
if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
}
}
@ -85,16 +81,11 @@ class Big5 implements StatelessEncoding {
} else {
// dirty EOF
$this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

16
lib/Encoding/EUCKR.php

@ -27,12 +27,6 @@ class EUCKR implements StatelessEncoding {
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$this->posChar++;
$lead = 0x00;
@ -42,6 +36,7 @@ class EUCKR implements StatelessEncoding {
if ($b < 0x80) {
return $b;
} elseif ($b == 0x80 || $b == 0xFF) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
} else {
$lead = $b;
@ -57,8 +52,10 @@ class EUCKR implements StatelessEncoding {
return $code;
} else {
if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
}
}
@ -72,16 +69,11 @@ class EUCKR implements StatelessEncoding {
} else {
// dirty EOF
$this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

22
lib/Encoding/Encoding.php

@ -19,10 +19,10 @@ interface Encoding {
const E_UNAVAILABLE_CODE_POINT = 4;
/** Constructs a new decoder
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/
public function __construct(string $string, bool $fatal = false);
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
/** Returns the current byte position of the decoder */
public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
* @return int|false
*/
public function nextCode();
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
*/
public function seek(int $distance): int;
@ -58,10 +58,16 @@ interface Encoding {
*/
public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The number of characters to retrieve
*/
public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer */
/** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The number of code points to retrieve
*/
public function peekCode(int $num = 1): array;
/** Calculates the length of the string in bytes */

19
lib/Encoding/GBCommon.php

@ -15,12 +15,6 @@ abstract class GBCommon implements StatelessEncoding {
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$first = 0;
$second = 0;
@ -37,6 +31,7 @@ abstract class GBCommon implements StatelessEncoding {
$first = $b;
continue;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} elseif ($second === 0) {
@ -49,8 +44,10 @@ abstract class GBCommon implements StatelessEncoding {
$pointer = ($first - 0x81) * 190 + ($b - $offset);
return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
}
@ -60,6 +57,7 @@ abstract class GBCommon implements StatelessEncoding {
continue;
} else {
$this->posByte -= 2;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} else {
@ -79,10 +77,12 @@ abstract class GBCommon implements StatelessEncoding {
if (isset($codePointOffset)) {
return $codePointOffset + $pointer - $offset;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} else {
$this->posByte -= 3;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
}
@ -95,16 +95,11 @@ abstract class GBCommon implements StatelessEncoding {
} else {
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

39
lib/Encoding/GenericEncoding.php

@ -13,40 +13,30 @@ trait GenericEncoding {
protected $lenByte = null;
protected $lenChar = null;
protected $errMode = self::MODE_REPLACE;
protected $allowSurrogates = false;
/** Constructs a new decoder
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
*/
public function __construct(string $string, bool $fatal = false) {
public $posErr = 0;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
$this->string = $string;
$this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates;
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posByte;
}
/** Returns the current character position of the decoder */
public function posChar(): int {
return $this->posChar;
}
/** Seeks to the start of the string
*
* This is usually faster than using the seek method for the same purpose
*/
public function rewind() {
$this->posByte = 0;
$this->posChar = 0;
}
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];
@ -64,12 +54,6 @@ trait GenericEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
if ($distance > 0) {
if ($this->posByte == strlen($this->string)) {
@ -94,7 +78,6 @@ trait GenericEncoding {
}
}
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
public function peekChar(int $num = 1): string {
$out = "";
$state = $this->stateSave();
@ -108,7 +91,6 @@ trait GenericEncoding {
return $out;
}
/** Retrieves the next $num code points from the string, without advancing the character pointer */
public function peekCode(int $num = 1): array {
$out = [];
$state = $this->stateSave();
@ -122,15 +104,10 @@ trait GenericEncoding {
return $out;
}
/** Calculates the length of the string in bytes */
public function lenByte(): int {
return $this->lenByte;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function lenChar(): int {
return $this->lenChar ?? (function() {
$state = $this->stateSave();
@ -141,19 +118,16 @@ trait GenericEncoding {
})();
}
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool {
return $this->posByte >= $this->lenByte;
}
/** Generates an iterator which steps through each character in the string */
public function chars(): \Generator {
while (($c = $this->nextChar()) !== "") {
yield ($this->posChar - 1) => $c;
}
}
/** Generates an iterator which steps through each code point in the string */
public function codes(): \Generator {
while (($c = $this->nextCode()) !== false) {
yield ($this->posChar - 1) => $c;
@ -165,6 +139,7 @@ trait GenericEncoding {
return [
'posChar' => $this->posChar,
'posByte' => $this->posByte,
'posErr' => $this->posErr,
];
}
@ -191,7 +166,7 @@ trait GenericEncoding {
// fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC:
// fatal replacement mode for decoders; not applicable to Unicode transformation formats
// fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
default:
// indicative of internal bug; should never be triggered

28
lib/Encoding/SingleByteEncoding.php

@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
abstract class SingleByteEncoding implements StatelessEncoding {
use GenericEncoding;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
if ($distance > 0) {
while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posChar;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function lenChar(): int {
return $this->lenByte;
}
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool {
return $this->posChar >= $this->lenByte;
}

19
lib/Encoding/UTF16.php

@ -11,12 +11,6 @@ abstract class UTF16 implements Encoding {
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$lead_b = null;
$lead_s = null;
@ -36,6 +30,9 @@ abstract class UTF16 implements Encoding {
if (!is_null($lead_s)) {
if ($code >= 0xDC00 && $code <= 0xDFFF) {
return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
} elseif ($this->allowSurrogates) {
$this->posByte -= 2;
return $lead_s;
} else {
$this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
@ -45,7 +42,11 @@ abstract class UTF16 implements Encoding {
$lead_s = $code;
continue;
} elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
if ($this->allowSurrogates) {
return $code;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
}
} else {
return $code;
}
@ -65,10 +66,6 @@ abstract class UTF16 implements Encoding {
}
}
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posByte];

16
lib/Encoding/UTF8.php

@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
const NAME = "UTF-8";
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// optimization for ASCII characters
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
if ($b==0xE0) {
$lower = 0xA0;
} elseif ($b==0xED) {
$upper = 0x9F;
$upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
}
$point = $b & 0xF;
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
}
$point = $b & 0x7;
} else { // invalid byte
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]);
}
} elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]);
} else {
$lower = 0x80;
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
return $point;
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
*/
public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
if ($codePoint < 0 || $codePoint > 0x10FFFF) {

8
tests/cases/Encoding/TestBig5.php

@ -128,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+0064 (HTML)' => [false, 0x64, "64"],

8
tests/cases/Encoding/TestEUCKR.php

@ -128,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+0064 (HTML)' => [false, 0x64, "64"],

8
tests/cases/Encoding/TestGB18030.php

@ -136,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
// bytes confirmed using Firefox
$series_gb18030 = [

9
tests/cases/Encoding/TestSingleByte.php

@ -205,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Runs the inherited surrogate-allowing iteration test against the class passed as $class
*
* The same expectation array is supplied as both the strict and the relaxed expectation, so the expected output is identical either way
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
// select the class under test for this data set before delegating
$this->testedClass = $class;
return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
}
public function provideClasses() {
foreach (self::$classes as $name => $class) {
yield $name => [$class];

8
tests/cases/Encoding/TestUTF16BE.php

@ -6,7 +6,6 @@
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16BE extends TestUTF16LE {
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {
public function provideStrings() {
foreach (parent::provideStrings() as $name => $test) {
list($string, $codes) = $test;
if (sizeof($test) == 2) {
$test[] = null;
}
list($string, $codes, $altCodes) = $test;
$words = explode(" ", $string);
foreach ($words as $a => $word) {
if (strlen($word) == 4) {
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
}
}
$string = implode(" ", $words);
yield $name => [$string, $codes];
yield $name => [$string, $codes, $altCodes];
}
}
}

17
tests/cases/Encoding/TestUTF16LE.php

@ -7,7 +7,6 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
protected $testedClass = UTF16LE::class;
@ -119,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() {
return [
// control samples
@ -130,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
// invalid UTF-16 surrogates
'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
];
}
}

15
tests/cases/Encoding/TestUTF8.php

@ -128,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() {
return [
'U+007A (HTML)' => [false, 0x7A, "7A"],
@ -190,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]],
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]],
// surrogates have alternate outputs for when surrogates are being allowed
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
// self-sync edge cases
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],

8
tests/cases/Encoding/TestXUserDefined.php

@ -109,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Delegates unchanged to the inherited surrogate-allowing iteration test so that it runs against this class's provideStrings data sets
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() {
$a_bytes = [];
$a_codes = [];

12
tests/lib/DecoderTest.php

@ -281,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
}
/** Iterates through $input with surrogates disallowed, passing $exp as the expected result
*
* @param string $input The test input, handed on to iterateThroughAString
* @param array $exp The expected result
*/
public function testIterateThroughAString(string $input, array $exp) {
$this->iterateThroughAString($input, $exp, false);
}
/** Iterates through $input with surrogates allowed
*
* @param string $input The test input, handed on to iterateThroughAString
* @param array $strictExp The expected result when surrogates are not allowed; used as a fallback here
* @param array $relaxedExp The expected result when surrogates are allowed; if null, $strictExp is used instead
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
// data sets without a relaxed expectation are expected to produce the strict output
$exp = $relaxedExp ?? $strictExp;
$this->iterateThroughAString($input, $exp, true);
}
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
$class = $this->testedClass;
$input = $this->prepString($input);
$s = new $class($input);
$out = [];
$s = new $class($input, false, $allowSurrogates);
$a = 0;
$this->assertTrue(true); // prevent risky test of empty string
foreach ($s->codes() as $index => $p) {

Loading…
Cancel
Save