diff --git a/README.md b/README.md index 52767c2..6cc7199 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Included here is a partial suite of WHATWG-compatible seekable string decoders w * gb18030 * GBK * Big5 +* EUC-KR * all single-byte encodings * x-user-defined diff --git a/lib/Encoding/EUCKR.php b/lib/Encoding/EUCKR.php index 7c0d68f..54ce73f 100644 --- a/lib/Encoding/EUCKR.php +++ b/lib/Encoding/EUCKR.php @@ -107,7 +107,7 @@ class EUCKR implements StatelessEncoding { $distance--; $this->posChar--; } - while ($distance > 0 && ($this->posByte > 0 || $this->bufferedCode > 0)) { + while ($distance > 0 && $this->posByte > 0) { $distance--; $this->posChar--; // go back one byte diff --git a/tests/cases/Encoding/TestBig5.php b/tests/cases/Encoding/TestBig5.php index defb69e..55f2933 100644 --- a/tests/cases/Encoding/TestBig5.php +++ b/tests/cases/Encoding/TestBig5.php @@ -148,9 +148,9 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest { 'two-byte character' => ["D7 D7", [36290]], 'EOF after first byte' => ["D7", [65533]], 'low byte after first byte' => ["D7 39", [65533, 57]], - '0x80 as first byte' => ["80 D7", [65533, 65533]], - '0xFF as first byte' => ["FF D7", [65533, 65533]], - 'invalid high byte as first byte' => ["81 D7", [65533]], + '0x80 as first byte' => ["80 D7 00", [65533, 65533, 0]], + '0xFF as first byte' => ["FF D7 00", [65533, 65533, 0]], + 'invalid high byte as first byte' => ["81 D7 00", [65533, 0]], '0x7F after first byte' => ["D7 7F", [65533, 127]], '0xFF after first byte' => ["D7 FF", [65533]], 'invalid high byte after first byte' => ["D7 81", [65533]], diff --git a/tests/cases/Encoding/TestEUCKR.php b/tests/cases/Encoding/TestEUCKR.php index 4363b47..3d67109 100644 --- a/tests/cases/Encoding/TestEUCKR.php +++ b/tests/cases/Encoding/TestEUCKR.php @@ -12,11 +12,21 @@ use MensBeam\Intl\Encoding\EncoderException; class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest { protected $testedClass = EUCKR::class; - protected 
$seekString = ""; - protected $seekCodes = []; - protected $seekOffsets = []; + /* + Char 0 U+007A (1 byte) Offset 0 + Char 1 U+ACF2 (2 bytes) Offset 1 + Char 2 U+0020 (1 byte) Offset 3 + Char 3 U+6C34 (2 bytes) Offset 4 + Char 4 U+0391 (2 bytes) Offset 6 + Char 5 U+03C9 (2 bytes) Offset 8 + Char 6 U+002A (1 byte) Offset 10 + End of string at char 7, offset 11 + */ + protected $seekString = "7A 81E9 20 E2A9 A5C1 A5F8 2A"; + protected $seekCodes = [0x7A, 0xACF2, 0x20, 0x6C34, 0x391, 0x3C9, 0x2A]; + protected $seekOffsets = [0, 1, 3, 4, 6, 8, 10, 11]; /* This string contains an invalid character sequence sandwiched between two null characters */ - protected $brokenChar = ""; + protected $brokenChar = "00 FF 00"; /** * @dataProvider provideCodePoints @@ -118,11 +128,33 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest { public function provideCodePoints() { return [ + 'U+0064 (HTML)' => [false, 0x64, "64"], + 'U+0064 (fatal)' => [true, 0x64, "64"], + 'U+00CA (HTML)' => [false, 0xCA, bin2hex("&#202;")], + 'U+00CA (fatal)' => [true, 0xCA, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + 'U+ACF2 (HTML)' => [false, 0xACF2, "81 E9"], + 'U+ACF2 (fatal)' => [true, 0xACF2, "81 E9"], + '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], ]; } public function provideStrings() { return [ + 'empty string' => ["", []], + 'sanity check' => ["40", [64]], + 'two-byte character' => ["D7 D7", [21033]], + 'EOF after first byte' => ["D7", [65533]], + 'low byte after first byte' => ["D7 39", [65533, 57]], + '0x80 as first byte' => ["80 D7 00", [65533, 65533, 0]], + '0xFF as first byte' => ["FF D7 00", [65533, 65533, 0]], + '0x7F after first byte' => 
["D7 7F", [65533, 127]], + '0xFF after first byte' => ["D7 FF", [65533]], + 'non-character' => ["A5 DC", [65533]], + 'mixed string' => ["7A D7 AA A4 F4 88 62 88 A5", [122, 30267, 12676, 45714, 45802]], + 'mixed string 2' => ["62 D7 D7 D7 D7 62", [98, 21033, 21033, 98]], ]; } diff --git a/tools/test-big5.html b/tools/test-big5.html index 3aa7ecc..f6ce6c5 100644 --- a/tools/test-big5.html +++ b/tools/test-big5.html @@ -9,9 +9,9 @@ var sampleStrings = { // invalid sequences 'EOF after first byte': "D7", 'low byte after first byte': "D7 39", - '0x80 as first byte': "80 D7", - '0xFF as first byte': "FF D7", - 'invalid high byte as first byte': "81 D7", + '0x80 as first byte': "80 D7 00", + '0xFF as first byte': "FF D7 00", + 'invalid high byte as first byte': "81 D7 00", '0x7F after first byte': "D7 7F", '0xFF after first byte': "D7 FF", 'invalid high byte after first byte': "D7 81", diff --git a/tools/test-euckr.html b/tools/test-euckr.html new file mode 100644 index 0000000..30ba9be --- /dev/null +++ b/tools/test-euckr.html @@ -0,0 +1,38 @@ + + + +