diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php
new file mode 100644
index 0000000..0bce6d9
--- /dev/null
+++ b/lib/Encoding/UTF16.php
@@ -0,0 +1,114 @@
+posChar++;
+        while (($b = @$this->string[$this->posByte++]) !== "") {
+            $b = ord($b);
+            if (is_null($lead_b)) {
+                $lead_b = $b;
+                continue;
+            } else {
+                if (static::BE) {
+                    $code = ($lead_b << 8) + $b;
+                } else {
+                    $code = ($b << 8) + $lead_b;
+                }
+                $lead_b = null;
+                if (!is_null($lead_s)) {
+                    if ($code >= 0xDC00 && $code <= 0xDFFF) {
+                        return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
+                    } else {
+                        $this->posByte -= 2;
+                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
+                    }
+                } else {
+                    if ($code >= 0xD800 && $code <= 0xDBFF) {
+                        $lead_s = $code;
+                        continue;
+                    } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
+                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
+                    } else {
+                        return $code;
+                    }
+                }
+            }
+        }
+        $this->posByte--;
+        if (is_null($lead_b) && is_null($lead_s)) {
+            // clean EOF: neither a lone byte nor a lead surrogate is pending
+            $this->posChar--;
+            return false;
+        } else {
+            // dirty EOF; note how many bytes the last character had: two for a pending lead surrogate plus one for a pending lone byte (compared against null because a pending 0x00 byte is falsy)
+            // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
+            $this->dirtyEOF = (is_null($lead_s) ? 0 : 2) + (is_null($lead_b) ? 0 : 1);
+            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
+        }
+    }
+
+    /** Retrieve the next character in the string, in UTF-8 encoding
+     *
+     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
+     */
+    public function nextChar(): string {
+        // get the byte at the current position
+        $b = @$this->string[$this->posByte];
+        if ($b === "") {
+            // if the byte is end of input, simply return it
+            return "";
+        } else {
+            // otherwise return the serialization of the code point at the current position
+            return UTF8::encode($this->nextCode());
+        }
+    }
+
+    /** Implements backward seeking $distance characters; returns the number of characters left unskipped because the start of the string was reached */
+    protected function seekBack(int $distance): int {
+        if ($this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
+            // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
+            $this->posByte -= $this->dirtyEOF;
+            $distance--;
+            $this->posChar--;
+        }
+        while ($distance > 0 && $this->posByte > 0) {
+            $distance--;
+            $this->posChar--;
+            if ($this->posByte < 4) {
+                // if we're less than four bytes into the string, the previous character is necessarily double-byte
+                $this->posByte -= 2;
+            } else {
+                // otherwise go back four bytes and consume a character
+                $start = $this->posByte;
+                $this->posByte -= 4;
+                $this->posChar--;
+                $this->nextCode();
+                if ($this->posByte == $start) {
+                    // if we're back at our starting position the character was four bytes
+                    $this->posByte -= 4;
+                } else {
+                    // otherwise we're already where we need to be
+                }
+            }
+        }
+        return $distance;
+    }
+}
diff --git a/lib/Encoding/UTF16BE.php b/lib/Encoding/UTF16BE.php
new file mode 100644
index 0000000..5e159d1
--- /dev/null
+++ b/lib/Encoding/UTF16BE.php
@@ -0,0 +1,13 @@
+testedClass = GB18030::class;
diff --git a/tests/cases/Encoding/TestSingleByte.php b/tests/cases/Encoding/TestSingleByte.php
index c2ea422..a9c310d 100644
--- a/tests/cases/Encoding/TestSingleByte.php
+++ b/tests/cases/Encoding/TestSingleByte.php
@@ -8,7 +8,6 @@ namespace MensBeam\Intl\TestCase\Encoding;
 
 use MensBeam\Intl\Encoding\SingleByteEncoding;
 use MensBeam\Intl\Encoding\EncoderException;
-use MensBeam\Intl\Encoding\DecoderException;
 
 class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
     // maps taken from https://github.com/web-platform-tests/wpt/blob/d6c29bef8d4bcdfe4f689defca73360b07647d71/encoding/single-byte-decoder.html
@@ -79,8 +78,9 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
     protected $seekString = "30 31 32 33 34 35 36";
     protected $seekCodes = [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36];
     protected $seekOffsets = [0, 1, 2, 3, 4, 5, 6, 7];
-    /* This string is supposed to contain a single invalid character sequence; this is different for each single-byte encoding (and many do not have invalid characters) */
+    /* This string is supposed to contain an invalid character sequence sandwiched between two null characters; this is different for each single-byte encoding (and many do not have invalid characters) */
     protected $brokenChar = "";
+    protected $lowerA = "a";
 
     /**
      * @dataProvider provideCodePoints
@@ -182,17 +182,17 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
     }
 
     /**
-     * @dataProvider provideStrings
+     * @dataProvider provideBrokenStrings
      * @covers MensBeam\Intl\Encoding\SingleByteEncoding::err
      */
-    public function testReplacementModes(string $input = "", array $points = [], string $class = SingleByteEncoding::class) {
-        if (($bump = array_search(0xFFFD, $points, true)) === false) {
+    public function testReplacementModes(string $input = "", string $class = SingleByteEncoding::class) {
+        if (!$input) {
             // if the encoding uses all 128 high byte values, this test is non-operative
             $this->assertTrue(true);
             return;
         }
         $this->testedClass = $class;
-        $this->brokenChar = bin2hex(chr($bump));
+        $this->brokenChar = $input;
         return parent::testReplacementModes();
     }
 
@@ -262,6 +262,20 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
         }
     }
 
+    public function provideBrokenStrings() {
+        foreach ($this->provideStrings() as $name => $test) {
+            $codes = $test[1];
+            $class = $test[2];
+            if (($bump = array_search(0xFFFD, $codes, true)) === false) {
+                // if the encoding uses all 128 high byte values, this test is non-operative
+                yield $name => ["", $class];
+            } else {
+                $byte = strtoupper(bin2hex(chr($bump)));
+                yield $name => ["00 $byte 00", $class];
+            }
+        }
+    }
+
     /**
      * @dataProvider provideInvalids
      * @covers MensBeam\Intl\Encoding\SingleByteEncoding::encode
diff --git a/tests/cases/Encoding/TestUTF16BE.php b/tests/cases/Encoding/TestUTF16BE.php
new file mode 100644
index 0000000..39acb88
--- /dev/null
+++ b/tests/cases/Encoding/TestUTF16BE.php
@@ -0,0 +1,44 @@
+ $test) {
+            list($string, $codes) = $test;
+            $words = explode(" ", $string);
+            foreach($words as $a => $word) {
+                if (strlen($word) == 4) {
+                    $words[$a] = $word[2].$word[3].$word[0].$word[1];
+                }
+            }
+            $string = implode(" ",$words);
+            yield $name => [$string, $codes];
+        }
+    }
+}
diff --git a/tests/cases/Encoding/TestUTF16LE.php b/tests/cases/Encoding/TestUTF16LE.php
new file mode 100644
index 0000000..dabacd4
--- /dev/null
+++ b/tests/cases/Encoding/TestUTF16LE.php
@@ -0,0 +1,153 @@
+assertTrue(true);
+    }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::__construct
+     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
+     */
+    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
+        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
+    }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::__construct
+     * @covers MensBeam\Intl\Encoding\UTF16::nextChar
+     */
+    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
+        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
+    }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::seekBack
+     */
+    public function testSTepBackThroughAString(string $input, array $exp) {
+        return parent::testSTepBackThroughAString($input, $exp);
+    }
+
+    /**
+     * @covers MensBeam\Intl\Encoding\UTF16::seek
+     * @covers MensBeam\Intl\Encoding\UTF16::posChar
+     * @covers MensBeam\Intl\Encoding\UTF16::posByte
+     * @covers MensBeam\Intl\Encoding\UTF16::rewind
+     */
+    public function testSeekThroughAString() {
+        return parent::testSeekThroughAString();
+    }
+
+    /**
+     * @covers MensBeam\Intl\Encoding\UTF16::posChar
+     * @covers MensBeam\Intl\Encoding\UTF16::posByte
+     */
+    public function testTraversePastTheEndOfAString() {
+        return parent::testTraversePastTheEndOfAString();
+    }
+
+    /**
+     * @covers MensBeam\Intl\Encoding\UTF16::peekChar
+     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
+     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
+     */
+    public function testPeekAtCharacters() {
+        return parent::testPeekAtCharacters();
+    }
+
+    /**
+     * @covers MensBeam\Intl\Encoding\UTF16::peekCode
+     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
+     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
+     */
+    public function testPeekAtCodePoints() {
+        return parent::testPeekAtCodePoints();
+    }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::len
+     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
+     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
+     */
+    public function testGetStringLength(string $input, array $points) {
+        return parent::testGetStringLength($input, $points);
+    }
+
+    /**
+     * @covers MensBeam\Intl\Encoding\UTF16::err
+     */
+    public function testReplacementModes() {
+        return parent::testReplacementModes();
+    }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::rewind
+     * @covers MensBeam\Intl\Encoding\UTF16::chars
+     * @covers MensBeam\Intl\Encoding\UTF16::codes
+     */
+    public function testIterateThroughAString(string $input, array $exp) {
+        return parent::testIterateThroughAString($input, $exp);
+    }
+
+    public function provideCodePoints() {
+        // UTF-16 has no encoder
+        return [[true, 0, ""]];
+    }
+
+    public function provideStrings() {
+        return [
+            // control samples
+            'empty string' => ["", []],
+            'sanity check' => ["6100 6200 6300 3100 3200 3300", [97, 98, 99, 49, 50, 51]],
+            'mixed sample' => ["7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
+            // unexpected EOF
+            'EOF in BMP character' => ["FF", [65533]],
+            'EOF after lead surrogate' => ["34D8", [65533]],
+            'EOF in trail surrogate' => ["34D8 1E", [65533]],
+            'EOF in BMP character (null byte)' => ["00", [65533]],
+            'EOF in trail surrogate (null byte)' => ["34D8 00", [65533]],
+            // invalid UTF-16 surrogates
+            'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
+            'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
+            'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
+            'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
+        ];
+    }
+}
diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php
index 19c472f..921fb22 100644
--- a/tests/cases/Encoding/TestUTF8.php
+++ b/tests/cases/Encoding/TestUTF8.php
@@ -8,7 +8,6 @@ namespace MensBeam\Intl\TestCase\Encoding;
 
 use MensBeam\Intl\Encoding\UTF8;
 use MensBeam\Intl\Encoding\EncoderException;
-use MensBeam\Intl\Encoding\DecoderException;
 
 class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest {
     protected $testedClass = UTF8::class;
@@ -22,11 +21,12 @@ class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest {
         Char 6 U+FFFE (3 bytes) Offset 17
         End of string at char 7, offset 20
     */
-    protected $seekString = "7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE";
+    protected $seekString = "7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE";
     protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
     protected $seekOffsets = [0, 1, 3, 6, 10, 13, 17, 20];
-    /* This string contains a single invalid charactersequence */
-    protected $brokenChar = "FF";
+    /* This string contains an invalid character sequence sandwiched between two null characters */
+    protected $brokenChar = "00 FF 00";
+    protected $lowerA = "a";
 
     /**
      * @dataProvider provideCodePoints
diff --git a/tests/lib/EncodingTest.php b/tests/lib/EncodingTest.php
index b6d6df8..c5b9808 100644
--- a/tests/lib/EncodingTest.php
+++ b/tests/lib/EncodingTest.php
@@ -34,7 +34,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
             $out[] = $p;
         }
         $this->assertSame($exp, $out);
-        $this->assertSame($s->posByte(), strlen($input));
+        $this->assertSame(strlen($input), $s->posByte());
     }
 
     public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
@@ -49,7 +49,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
             $out[] = $p;
         }
         $this->assertSame($exp, $out);
-        $this->assertSame($s->posByte(), strlen($input));
+        $this->assertSame(strlen($input), $s->posByte());
     }
 
     public function testSTepBackThroughAString(string $input, array $exp) {
@@ -118,29 +118,30 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
 
     public function testTraversePastTheEndOfAString() {
         $class = $this->testedClass;
-        $s = new $class("a");
+        $s = new $class($this->lowerA);
+        $l = strlen($this->lowerA);
         $this->assertSame(0, $s->posChar());
         $this->assertSame(0, $s->posByte());
 
         $this->assertSame("a", $s->nextChar());
         $this->assertSame(1, $s->posChar());
-        $this->assertSame(1, $s->posByte());
+        $this->assertSame($l, $s->posByte());
 
         $this->assertSame("", $s->nextChar());
         $this->assertSame(1, $s->posChar());
-        $this->assertSame(1, $s->posByte());
+        $this->assertSame($l, $s->posByte());
 
-        $s = new $class("a");
+        $s = new $class($this->lowerA);
         $this->assertSame(0, $s->posChar());
         $this->assertSame(0, $s->posByte());
 
         $this->assertSame(ord("a"), $s->nextCode());
         $this->assertSame(1, $s->posChar());
-        $this->assertSame(1, $s->posByte());
+        $this->assertSame($l, $s->posByte());
 
         $this->assertSame(false, $s->nextCode());
         $this->assertSame(1, $s->posChar());
-        $this->assertSame(1, $s->posByte());
+        $this->assertSame($l, $s->posByte());
     }
 
     public function testPeekAtCharacters() {
@@ -220,7 +221,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
 
     public function testReplacementModes() {
         $class = $this->testedClass;
-        $input = $this->prepString("00".$this->brokenChar."00");
+        $input = $this->prepString($this->brokenChar);
         // officially test replacement characters (already effectively tested by other tests)
         $s = new $class($input, false);
         $s->seek(1);
diff --git a/tests/phpunit.xml b/tests/phpunit.xml
index 4a80afc..7bf4948 100644
--- a/tests/phpunit.xml
+++ b/tests/phpunit.xml
@@ -19,6 +19,8 @@
     cases/Encoding/TestUTF8.php
+    cases/Encoding/TestUTF16LE.php
+    cases/Encoding/TestUTF16BE.php
     cases/Encoding/TestSingleByte.php
     cases/Encoding/TestGB18030.php