diff --git a/CHANGELOG b/CHANGELOG index cac36e0..93e96f8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,10 @@ +Version 0.6.0 (2019-12-18) +========================== + +New features: +- Added $allowSurrogates parameter to Encoding constructor +- Added posErr public instance property to Encoding + Version 0.5.0 (2019-12-13) ========================== diff --git a/lib/Encoding/Big5.php b/lib/Encoding/Big5.php index 6e84c96..3305a8d 100644 --- a/lib/Encoding/Big5.php +++ b/lib/Encoding/Big5.php @@ -24,12 +24,6 @@ class Big5 implements StatelessEncoding { protected $bufferedCode = 0; - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { $this->posChar++; if ($this->bufferedCode > 0) { @@ -70,8 +64,10 @@ class Big5 implements StatelessEncoding { return $code; } else { if ($b < 0x80) { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); } else { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); } } @@ -85,16 +81,11 @@ class Big5 implements StatelessEncoding { } else { // dirty EOF $this->dirtyEOF = 1; + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); } } - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted - */ public static function encode(int $codePoint, bool $fatal = true): string { if ($codePoint < 0 || $codePoint > 0x10FFFF) { throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); diff --git 
a/lib/Encoding/EUCKR.php b/lib/Encoding/EUCKR.php index 54ce73f..a3ce004 100644 --- a/lib/Encoding/EUCKR.php +++ b/lib/Encoding/EUCKR.php @@ -27,12 +27,6 @@ class EUCKR implements StatelessEncoding { protected $dirtyEOF = 0; - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { $this->posChar++; $lead = 0x00; @@ -42,6 +36,7 @@ class EUCKR implements StatelessEncoding { if ($b < 0x80) { return $b; } elseif ($b == 0x80 || $b == 0xFF) { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); } else { $lead = $b; @@ -57,8 +52,10 @@ class EUCKR implements StatelessEncoding { return $code; } else { if ($b < 0x80) { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); } else { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); } } @@ -72,16 +69,11 @@ class EUCKR implements StatelessEncoding { } else { // dirty EOF $this->dirtyEOF = 1; + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); } } - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted - */ public static function encode(int $codePoint, bool $fatal = true): string { if ($codePoint < 0 || $codePoint > 0x10FFFF) { throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); diff --git a/lib/Encoding/Encoding.php b/lib/Encoding/Encoding.php index 5b7a731..04d4f58 100644 --- a/lib/Encoding/Encoding.php +++ 
b/lib/Encoding/Encoding.php @@ -19,10 +19,10 @@ interface Encoding { const E_UNAVAILABLE_CODE_POINT = 4; /** Constructs a new decoder - * - * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted + * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead + * @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings */ - public function __construct(string $string, bool $fatal = false); + public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false); /** Returns the current byte position of the decoder */ public function posByte(): int; @@ -40,15 +40,15 @@ interface Encoding { * * If the end of the string has been reached, false is returned * - * @return int|bool + * @return int|false */ public function nextCode(); /** Advance $distance characters through the string - * - * If $distance is negative, the operation will be performed in reverse * * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned + * + * @param int $distance The number of characters to advance. 
If negative, the operation will seek back toward the beginning of the string */ public function seek(int $distance): int; @@ -58,10 +58,16 @@ interface Encoding { */ public function rewind(); - /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */ + /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer + * + * @param int $num The number of characters to retrieve + */ public function peekChar(int $num = 1): string; - /** Retrieves the next $num code points from the string, without advancing the character pointer */ + /** Retrieves the next $num code points from the string, without advancing the character pointer + * + * @param int $num The number of code points to retrieve + */ public function peekCode(int $num = 1): array; /** Calculates the length of the string in bytes */ diff --git a/lib/Encoding/GBCommon.php b/lib/Encoding/GBCommon.php index 6689a22..7c47b46 100644 --- a/lib/Encoding/GBCommon.php +++ b/lib/Encoding/GBCommon.php @@ -15,12 +15,6 @@ abstract class GBCommon implements StatelessEncoding { protected $dirtyEOF = 0; - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { $first = 0; $second = 0; @@ -37,6 +31,7 @@ abstract class GBCommon implements StatelessEncoding { $first = $b; continue; } else { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } elseif ($second === 0) { @@ -49,8 +44,10 @@ abstract class GBCommon implements StatelessEncoding { $pointer = ($first - 0x81) * 190 + ($b - $offset); return self::TABLE_GBK[$pointer]; } elseif ($b < 0x80) { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); } else { + $this->posErr = $this->posChar; return 
self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } @@ -60,6 +57,7 @@ abstract class GBCommon implements StatelessEncoding { continue; } else { $this->posByte -= 2; + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } else { @@ -79,10 +77,12 @@ abstract class GBCommon implements StatelessEncoding { if (isset($codePointOffset)) { return $codePointOffset + $pointer - $offset; } else { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } else { $this->posByte -= 3; + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } @@ -95,16 +95,11 @@ abstract class GBCommon implements StatelessEncoding { } else { // dirty EOF; note how many bytes the last character had $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); } } - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted - */ public static function encode(int $codePoint, bool $fatal = true): string { if ($codePoint < 0 || $codePoint > 0x10FFFF) { throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); diff --git a/lib/Encoding/GenericEncoding.php b/lib/Encoding/GenericEncoding.php index 8b26626..9d0b2a0 100644 --- a/lib/Encoding/GenericEncoding.php +++ b/lib/Encoding/GenericEncoding.php @@ -13,40 +13,30 @@ trait GenericEncoding { protected $lenByte = null; protected $lenChar = null; protected $errMode = self::MODE_REPLACE; + protected $allowSurrogates = false; - /** Constructs a new decoder - * 
- * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted - */ - public function __construct(string $string, bool $fatal = false) { + public $posErr = 0; + + public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { $this->string = $string; $this->lenByte = strlen($string); $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; + $this->allowSurrogates = $allowSurrogates; } - /** Returns the current byte position of the decoder */ public function posByte(): int { return $this->posByte; } - /** Returns the current character position of the decoder */ public function posChar(): int { return $this->posChar; } - /** Seeks to the start of the string - * - * This is usually faster than using the seek method for the same purpose - */ public function rewind() { $this->posByte = 0; $this->posChar = 0; } - /** Retrieve the next character in the string, in UTF-8 encoding - * - * The returned character may be a replacement character, or the empty string if the end of the string has been reached - */ public function nextChar(): string { // get the byte at the current position $b = @$this->string[$this->posByte]; @@ -64,12 +54,6 @@ trait GenericEncoding { } } - /** Advance $distance characters through the string - * - * If $distance is negative, the operation will be performed in reverse - * - * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned - */ public function seek(int $distance): int { if ($distance > 0) { if ($this->posByte == strlen($this->string)) { @@ -94,7 +78,6 @@ trait GenericEncoding { } } - /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */ public function peekChar(int $num = 1): string { $out = ""; $state = $this->stateSave(); @@ -108,7 +91,6 @@ trait 
GenericEncoding { return $out; } - /** Retrieves the next $num code points from the string, without advancing the character pointer */ public function peekCode(int $num = 1): array { $out = []; $state = $this->stateSave(); @@ -122,15 +104,10 @@ trait GenericEncoding { return $out; } - /** Calculates the length of the string in bytes */ public function lenByte(): int { return $this->lenByte; } - /** Calculates the length of the string in code points - * - * Note that this may involve processing to the end of the string - */ public function lenChar(): int { return $this->lenChar ?? (function() { $state = $this->stateSave(); @@ -141,19 +118,16 @@ trait GenericEncoding { })(); } - /** Returns whether the character pointer is at the end of the string */ public function eof(): bool { return $this->posByte >= $this->lenByte; } - /** Generates an iterator which steps through each character in the string */ public function chars(): \Generator { while (($c = $this->nextChar()) !== "") { yield ($this->posChar - 1) => $c; } } - /** Generates an iterator which steps through each code point in the string */ public function codes(): \Generator { while (($c = $this->nextCode()) !== false) { yield ($this->posChar - 1) => $c; @@ -165,6 +139,7 @@ trait GenericEncoding { return [ 'posChar' => $this->posChar, 'posByte' => $this->posByte, + 'posErr' => $this->posErr, ]; } @@ -191,7 +166,7 @@ trait GenericEncoding { // fatal replacement mode for decoders throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); case self::MODE_FATAL_ENC: - // fatal replacement mode for decoders; not applicable to Unicode transformation formats + // fatal replacement mode for encoders; not applicable to Unicode transformation formats throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); default: // indicative of internal bug; should never be triggered diff --git 
a/lib/Encoding/SingleByteEncoding.php b/lib/Encoding/SingleByteEncoding.php index 9bdf996..0b48684 100644 --- a/lib/Encoding/SingleByteEncoding.php +++ b/lib/Encoding/SingleByteEncoding.php @@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding; abstract class SingleByteEncoding implements StatelessEncoding { use GenericEncoding; - /** Retrieve the next character in the string, in UTF-8 encoding - * - * The returned character may be a replacement character, or the empty string if the end of the string has been reached - */ public function nextChar(): string { // get the byte at the current position $b = @$this->string[$this->posChar]; @@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding { } } - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { // get the byte at the current position $b = @$this->string[$this->posChar]; @@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding { } } - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted - */ public static function encode(int $codePoint, bool $fatal = true): string { if ($codePoint < 0 || $codePoint > 0x10FFFF) { throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); @@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding { } } - /** Advance $distance characters through the string - * - * If $distance is negative, the operation will be performed in reverse - * - * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of 
requested characters is returned - */ public function seek(int $distance): int { if ($distance > 0) { while ($this->posChar < $this->lenByte && $distance > 0) { @@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding { } } - /** Returns the current byte position of the decoder */ public function posByte(): int { return $this->posChar; } - /** Calculates the length of the string in code points - * - * Note that this may involve processing to the end of the string - */ public function lenChar(): int { return $this->lenByte; } - /** Returns whether the character pointer is at the end of the string */ public function eof(): bool { return $this->posChar >= $this->lenByte; } diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php index 0bce6d9..8a8f752 100644 --- a/lib/Encoding/UTF16.php +++ b/lib/Encoding/UTF16.php @@ -11,12 +11,6 @@ abstract class UTF16 implements Encoding { protected $dirtyEOF = 0; - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { $lead_b = null; $lead_s = null; @@ -36,6 +30,9 @@ abstract class UTF16 implements Encoding { if (!is_null($lead_s)) { if ($code >= 0xDC00 && $code <= 0xDFFF) { return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00); + } elseif ($this->allowSurrogates) { + $this->posByte -= 2; + return $lead_s; } else { $this->posByte -= 2; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); @@ -45,7 +42,11 @@ abstract class UTF16 implements Encoding { $lead_s = $code; continue; } elseif ($code >= 0xDC00 && $code <= 0xDFFF) { - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); + if ($this->allowSurrogates) { + return $code; + } else { + return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); + } } else { return $code; } @@ -65,10 +66,6 @@ abstract class UTF16 implements 
Encoding { } } - /** Retrieve the next character in the string, in UTF-8 encoding - * - * The returned character may be a replacement character, or the empty string if the end of the string has been reached - */ public function nextChar(): string { // get the byte at the current position $b = @$this->string[$this->posByte]; diff --git a/lib/Encoding/UTF8.php b/lib/Encoding/UTF8.php index 69d500c..ea39156 100644 --- a/lib/Encoding/UTF8.php +++ b/lib/Encoding/UTF8.php @@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding { const NAME = "UTF-8"; const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"]; - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder // optimization for ASCII characters @@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding { if ($b==0xE0) { $lower = 0xA0; } elseif ($b==0xED) { - $upper = 0x9F; + $upper = ($this->allowSurrogates) ? 0xBF : 0x9F; } $point = $b & 0xF; } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character @@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding { } $point = $b & 0x7; } else { // invalid byte + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar, $this->posByte]); } } elseif ($b < $lower || $b > $upper) { + $this->posErr = $this->posChar; return self::err($this->errMode, [$this->posChar, $this->posByte--]); } else { $lower = 0x80; @@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding { return $point; } - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. 
When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored - */ public static function encode(int $codePoint, bool $fatal = true): string { // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder if ($codePoint < 0 || $codePoint > 0x10FFFF) { diff --git a/tests/cases/Encoding/TestBig5.php b/tests/cases/Encoding/TestBig5.php index 85711de..f86568e 100644 --- a/tests/cases/Encoding/TestBig5.php +++ b/tests/cases/Encoding/TestBig5.php @@ -128,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideCodePoints() { return [ 'U+0064 (HTML)' => [false, 0x64, "64"], diff --git a/tests/cases/Encoding/TestEUCKR.php b/tests/cases/Encoding/TestEUCKR.php index 6fc35a7..eec110f 100644 --- a/tests/cases/Encoding/TestEUCKR.php +++ b/tests/cases/Encoding/TestEUCKR.php @@ -128,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideCodePoints() { return [ 'U+0064 (HTML)' => [false, 0x64, "64"], diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php index 293b1d7..419d21f 100644 --- a/tests/cases/Encoding/TestGB18030.php +++ b/tests/cases/Encoding/TestGB18030.php @@ -136,6 +136,14 @@ class TestGB18030 extends 
\MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideCodePoints() { // bytes confirmed using Firefox $series_gb18030 = [ diff --git a/tests/cases/Encoding/TestSingleByte.php b/tests/cases/Encoding/TestSingleByte.php index 7fa81b3..f1fd634 100644 --- a/tests/cases/Encoding/TestSingleByte.php +++ b/tests/cases/Encoding/TestSingleByte.php @@ -205,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) { + $this->testedClass = $class; + return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp); + } + public function provideClasses() { foreach (self::$classes as $name => $class) { yield $name => [$class]; diff --git a/tests/cases/Encoding/TestUTF16BE.php b/tests/cases/Encoding/TestUTF16BE.php index 2e88e53..c77ab04 100644 --- a/tests/cases/Encoding/TestUTF16BE.php +++ b/tests/cases/Encoding/TestUTF16BE.php @@ -6,7 +6,6 @@ declare(strict_types=1); namespace MensBeam\Intl\TestCase\Encoding; -use MensBeam\Intl\Encoding\UTF16LE; use MensBeam\Intl\Encoding\UTF16BE; class TestUTF16BE extends TestUTF16LE { @@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE { public function provideStrings() { foreach (parent::provideStrings() as $name => $test) { - list($string, $codes) = $test; + if (sizeof($test) == 2) { + $test[] = null; + } + list($string, $codes, $altCodes) = $test; $words = explode(" ", $string); foreach ($words as $a => $word) { if (strlen($word) == 
4) { @@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE { } } $string = implode(" ", $words); - yield $name => [$string, $codes]; + yield $name => [$string, $codes, $altCodes]; } } } diff --git a/tests/cases/Encoding/TestUTF16LE.php b/tests/cases/Encoding/TestUTF16LE.php index 1424b44..2129df3 100644 --- a/tests/cases/Encoding/TestUTF16LE.php +++ b/tests/cases/Encoding/TestUTF16LE.php @@ -7,7 +7,6 @@ declare(strict_types=1); namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\UTF16LE; -use MensBeam\Intl\Encoding\UTF16BE; class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { protected $testedClass = UTF16LE::class; @@ -119,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\UTF16::nextCode + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideStrings() { return [ // control samples @@ -130,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { 'EOF after lead surrogate' => ["0000 34D8", [0, 65533]], 'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]], // invalid UTF-16 surrogates - 'lead surrogate without trail' => ["34D8 0000", [65533, 0]], - 'trail surrogate without lead' => ["1EDD 0000", [65533, 0]], - 'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]], - 'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]], + 'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]], + 'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]], + 'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]], + 'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]], ]; } } diff --git 
a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index 6be0395..8c027ed 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -128,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\UTF8::nextCode + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideCodePoints() { return [ 'U+007A (HTML)' => [false, 0x7A, "7A"], @@ -190,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { 'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]], 'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]], // UTF-16 surrogates - 'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]], - 'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]], - 'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]], + // surrogates have alternate outputs for when surrogates are being allowed + 'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]], + 'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]], + 'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]], // self-sync edge cases 'trailing continuation' => ["0A 80 80", [10, 65533, 65533]], 'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]], diff --git a/tests/cases/Encoding/TestXUserDefined.php b/tests/cases/Encoding/TestXUserDefined.php index 5d47dda..decd5bc 100644 --- a/tests/cases/Encoding/TestXUserDefined.php +++ b/tests/cases/Encoding/TestXUserDefined.php @@ -109,6 +109,14 @@ class TestXUserDefined extends 
\MensBeam\Intl\Test\DecoderTest { return parent::testIterateThroughAString($input, $exp); } + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + public function provideStrings() { $a_bytes = []; $a_codes = []; diff --git a/tests/lib/DecoderTest.php b/tests/lib/DecoderTest.php index 56dcfa1..4730420 100644 --- a/tests/lib/DecoderTest.php +++ b/tests/lib/DecoderTest.php @@ -281,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase { } public function testIterateThroughAString(string $input, array $exp) { + $this->iterateThroughAString($input, $exp, false); + } + + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + $exp = $relaxedExp ?? $strictExp; + $this->iterateThroughAString($input, $exp, true); + } + + protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) { $class = $this->testedClass; $input = $this->prepString($input); - $s = new $class($input); - $out = []; + $s = new $class($input, false, $allowSurrogates); $a = 0; $this->assertTrue(true); // prevent risky test of empty string foreach ($s->codes() as $index => $p) {