From 915aa7ca93c2067b8ee30430834d712435f43494 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 7 Oct 2020 22:48:50 -0400 Subject: [PATCH] Finally fix Shift_JIS seeker --- lib/Encoding/ShiftJIS.php | 10 +++++----- tests/cases/Encoding/TestGB18030.php | 16 ++++++++-------- tests/cases/Encoding/TestShiftJIS.php | 16 ++++++---------- tests/lib/DecoderTest.php | 2 +- tools/test-shiftjis.html | 20 ++++++++++++++++++++ 5 files changed, 40 insertions(+), 24 deletions(-) create mode 100644 tools/test-shiftjis.html diff --git a/lib/Encoding/ShiftJIS.php b/lib/Encoding/ShiftJIS.php index c4eb274..998add0 100644 --- a/lib/Encoding/ShiftJIS.php +++ b/lib/Encoding/ShiftJIS.php @@ -120,19 +120,19 @@ class ShiftJIS extends AbstractEncoding implements StatelessEncoding { } // go back one byte $b1 = ord(@$this->string[--$this->posByte]); - if ($b1 < 0x40 || $b1 > 0xFC || $b1 == 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence + if ($b1 < 0x40 || $b1 > 0xFC || $b1 === 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence // the byte is a character continue; } // go back a second byte $b2 = ord(@$this->string[--$this->posByte]); - if ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character - // the two bytes form a character - continue; - } elseif ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence + if ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence // the first byte was a character $this->posByte += 1; continue; + } elseif ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character + // the two bytes form a character + continue; } else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte $start = $this->posByte + 2; $pos = $this->posByte; diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php index ddf30a6..6cddf7c 100644 --- a/tests/cases/Encoding/TestGB18030.php +++ b/tests/cases/Encoding/TestGB18030.php @@ -144,6 +144,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); } + + /** + * @covers MensBeam\Intl\Encoding\GB18030::seekBack + */ + public function testSeekBackOverRandomData() { + return parent::testSeekBackOverRandomData(); + } + public function provideCodePoints() { // bytes confirmed using Firefox $series_gb18030 = [ @@ -200,14 +208,6 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { } } - - /** - * @covers MensBeam\Intl\Encoding\GB18030::seekBack - */ - public function testSeekBackOverRandomData() { - return parent::testSeekBackOverRandomData(); - } - public function provideStrings() { return [ 'empty string' => ["", []], diff --git a/tests/cases/Encoding/TestShiftJIS.php b/tests/cases/Encoding/TestShiftJIS.php index abb154c..1007e8d 100644 --- a/tests/cases/Encoding/TestShiftJIS.php +++ b/tests/cases/Encoding/TestShiftJIS.php @@ -12,16 +12,6 @@ use MensBeam\Intl\Encoding\EncoderException; class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest { protected $testedClass = ShiftJIS::class; - /* - Char 0 U+007A (1 byte) Offset 0 - Char 1 U+86CC (2 bytes) Offset 1 - Char 2 U+6C34 (2 bytes) Offset 3 - Char 3 U+00CA (0 bytes) Offset 5 - Char 4 U+0304 (2 bytes) Offset 5 - Char 5 U+00EA (0 bytes) Offset 7 - Char 6 U+030C (2 bytes) Offset 7 - End of string at char 7, offset 9 - */ protected $seekString = ""; protected $seekCodes = []; protected $seekOffsets = []; @@ -151,6 +141,12 @@ class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest { public function provideStrings() { return [ + 'empty string' => ["", []], + 'sanity check' => ["40", [64]], + 'former ASCII deviations' => ["5C 7E", [92, 126]], + 'JIS X 0201 range' => ["A1 DF", [65377, 65439]], + 'EUDC range' => ["F040 F9FC", [57344, 59223]], + 'JIS X 0208 assigned range' => ["8140 9F7E 8180 9FFC", [12288, 27631, 247, 28364]], ]; } diff --git a/tests/lib/DecoderTest.php b/tests/lib/DecoderTest.php index 5f74450..fcf069e 100644 --- a/tests/lib/DecoderTest.php +++ b/tests/lib/DecoderTest.php @@ -54,7 +54,7 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase { } $this->assertSame(sizeof($exp), $pos); while ($s->posChar()) { - $this->assertSame(0, $s->seek(-1)); + $this->assertSame(0, $s->seek(-1), "Error stepping back to position ".($pos - 1)); $this->assertSame(--$pos, $s->posChar()); $act[] = $s->nextCode(); $s->seek(-1); diff --git a/tools/test-shiftjis.html b/tools/test-shiftjis.html new file mode 100644 index 0000000..69b31d5 --- /dev/null +++ b/tools/test-shiftjis.html @@ -0,0 +1,20 @@ + + + + +