diff --git a/lib/Encoding/GBCommon.php b/lib/Encoding/GBCommon.php index ba57e18..a0d8d6b 100644 --- a/lib/Encoding/GBCommon.php +++ b/lib/Encoding/GBCommon.php @@ -35,12 +35,14 @@ abstract class GBCommon implements StatelessEncoding { return 0x20AC; } elseif ($b > 0x80 && $b < 0xFF) { $first = $b; + continue; } else { return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } elseif ($second === 0) { - if ($b > 0x29 && $b < 0x40) { + if ($b > 0x2F && $b < 0x3A) { $second = $b; + continue; } else { if (($b > 0x39 && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) { $offset = ($b < 0x7F) ? 0x40 : 0x41; @@ -55,12 +57,13 @@ abstract class GBCommon implements StatelessEncoding { } elseif ($third === 0) { if ($b > 0x80 && $b < 0xFF) { $third = $b; + continue; } else { $this->posByte -= 2; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } else { - if ($b > 0x29 && $b < 0x40) { + if ($b > 0x2F && $b < 0x3A) { // look up code point $pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30; if ($pointer === 7457) { @@ -84,6 +87,7 @@ abstract class GBCommon implements StatelessEncoding { } } } + $this->posByte--; if (($first + $second + $third) == 0) { // clean EOF $this->posChar--; @@ -91,7 +95,7 @@ abstract class GBCommon implements StatelessEncoding { } else { // dirty EOF; note how many bytes the last character had $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); - return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); + return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); } } @@ -154,43 +158,49 @@ abstract class GBCommon implements StatelessEncoding { } // go back one byte $b1 = ord(@$this->string[--$this->posByte]); - if ($b1 < 0x30 || $b1 == 0x80 || $b1 == 0xFF || $this->posByte == 0) { // these bytes are never part of a sequence, and the first byte is necessarily the start of a sequence + if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence // the byte is a character continue; - } else { - // go back a second byte - $b2 = ord(@$this->string[--$this->posByte]); - if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence - // the first byte was a character - $this->posByte += 1; - continue; - } elseif ($b1 > 0x39) { - // two-byte character - continue; - } elseif ($this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string - // the first byte was a character - $this->posByte += 1; - continue; - } else { - // go back a third byte - $b3 = ord(@$this->string[--$this->posByte]); - if ($b3 < 0x30 || $b3 > 0x39) { // these bytes never appear third-to-last in a sequence - // the first byte was a character - $this->posByte += 2; - continue; - } else { - // go back a fourth byte - $b4 = ord(@$this->string[--$this->posByte]); - if ($b4 < 0x81 || $b4 == 0xFF) { // these bytes never appear first in a sequence - // the first byte was a character - $this->posByte += 3; - continue; - } else { - // four-byte character - continue; - } + } + // go back a second byte + $b2 = ord(@$this->string[--$this->posByte]); + if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence + // the first byte was a character + $this->posByte += 1; + continue; + } elseif ($b1 < 0x40 && $this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string + // the first byte was a character + $this->posByte += 1; + continue; + } elseif ($b1 > 0x39) { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte + $start = $this->posByte + 2; + // go back bytes until a definite trail byte or end of string + while ($this->posByte > 0) { + if ($b2 < 0x81 || $b2 == 0xFF) { + $this->posByte++; + break; } + $b2 = ord(@$this->string[--$this->posByte]); } + // if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte + $this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2); + continue; + } + // go back a third byte + $b3 = ord(@$this->string[--$this->posByte]); + if ($b3 > 0x39 || $b3 < 0x30) { // these bytes never appear in the second position of a four-byte sequence + // the first byte was a character + $this->posByte += 2; + continue; + } + // go back a fourth byte + $b4 = ord(@$this->string[--$this->posByte]); + if (($b4 < 0x81 || $b4 == 0xFF)) { // these bytes never appear first in a four-byte sequence + // the first byte was a character + $this->posByte += 3; + continue; + } else { + // this is a four-byte character } } return $distance; diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php index 5590569..0d0d2c5 100644 --- a/tests/cases/Encoding/TestGB18030.php +++ b/tests/cases/Encoding/TestGB18030.php @@ -36,7 +36,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase { * @covers MensBeam\Intl\Encoding\GB18030::posChar */ public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { - $s = new GB18030(hex2bin($input)); + $input = $this->prepString($input); + $s = new GB18030($input); $out = []; $a = 0; $this->assertSame($a, $s->posChar()); @@ -45,6 +46,7 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase { $out[] = $p; } $this->assertSame($exp, $out); + $this->assertSame($s->posByte(), strlen($input)); } /** @@ -56,33 +58,32 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase { $exp = array_map(function($v) { return \IntlChar::chr($v); }, $exp); - $s = new GB18030(hex2bin($input)); + $input = $this->prepString($input); + $s = new GB18030($input); $out = []; while (($p = $s->nextChar()) !== "") { $out[] = $p; } $this->assertSame($exp, $out); + $this->assertSame($s->posByte(), strlen($input)); } /** * @dataProvider provideStrings * @covers MensBeam\Intl\Encoding\GB18030::seekBack */ - public function testSTepBackThroughAString(string $input, array $points) { - $s = new GB18030(hex2bin($input)); - $a = 0; - $test1 = []; - $test2 = []; - while (($p1 = $s->nextCode()) !== false) { - $test1[] = $p1; - $this->assertSame(0, $s->seek(-1)); - $p2 = $s->nextCode(); - $test2[] = $p2; - $this->assertSame($p1, $p2, "Mismatch at character position $a"); - $this->assertSame(++$a, $s->posChar(), "Character position should be $a"); + public function testSTepBackThroughAString(string $input, array $exp) { + $input = $this->prepString($input); + $s = new GB18030($input); + $exp = array_reverse($exp); + $act = []; + while ($s->nextCode() !== false); + while($s->posByte()) { + $s->seek(-1); + $act[] = $s->nextCode(); + $s->seek(-1); } - $this->assertSame($points, $test1); - $this->assertSame($points, $test2); + $this->assertEquals($exp, $act); } public function provideCodePoints() { @@ -133,34 +134,52 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase { // valid single characters 'sanity check' => ["40", [64]], 'special case for 0x80' => ["80", [8364]], - 'four-byte special case' => ["8135F437", [59335]], - 'two-byte character' => ["A84E", [8735]], - 'four-byte character' => ["8231A237", [15081]], + 'four-byte special case' => ["81 35 F4 37", [59335]], + 'two-byte character' => ["A8 4E", [8735]], + 'four-byte character' => ["82 31 A2 37", [15081]], // cut sequences 'EOF after first byte' => ["82", [65533]], - 'EOF after second byte' => ["8230", [65533]], - 'EOF after third byte' => ["823081", [65533]], + 'EOF after second byte' => ["82 30", [65533]], + 'EOF after third byte' => ["82 30 81", [65533]], // invalid sequences - 'bad first byte' => ["FF35F437", [65533, 53, 65533]], - 'bad second byte' => ["81FFF437", [65533, 65533]], - 'bad third byte' => ["8135FF37", [65533, 53, 65533, 55]], - 'bad fourth byte' => ["8135F4FF", [65533, 53, 65533]], - 'control first byte' => ["0035F437", [0, 53, 65533]], - 'control second byte' => ["8100F437", [65533, 0, 65533]], - 'control third byte' => ["81350037", [65533, 53, 0, 55]], - 'control fourth byte' => ["8135F400", [65533, 53, 65533, 0]], + 'bad first byte' => ["FF 35 F4 37", [65533, 53, 65533]], + 'bad second byte' => ["81 FF F4 37", [65533, 65533]], + 'bad third byte' => ["81 35 FF 37", [65533, 53, 65533, 55]], + 'bad fourth byte' => ["81 35 F4 FF", [65533, 53, 65533]], + 'control first byte' => ["00 35 F4 37", [0, 53, 65533]], + 'control second byte' => ["81 00 F4 37", [65533, 0, 65533]], + 'control third byte' => ["81 35 00 37", [65533, 53, 0, 55]], + 'control fourth byte' => ["81 35 F4 00", [65533, 53, 65533, 0]], // invalid sequences with clean EOF - 'bad first byte (padded)' => ["FF35F43700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]], - 'bad second byte (padded)' => ["81FFF43700000000", [65533, 65533, 55, 0, 0, 0, 0]], - 'bad third byte (padded)' => ["8135FF3700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]], - 'bad fourth byte (padded)' => ["8135F4FF00000000", [65533, 53, 65533, 0, 0, 0, 0]], - 'control first byte (padded)' => ["0035F43700000000", [0, 53, 65533, 55, 0, 0, 0, 0]], - 'control second byte (padded)' => ["8100F43700000000", [65533, 0, 65533, 55, 0, 0, 0, 0]], - 'control third byte (padded)' => ["8135003700000000", [65533, 53, 0, 55, 0, 0, 0, 0]], - 'control fourth byte (padded)' => ["8135F40000000000", [65533, 53, 65533, 0, 0, 0, 0, 0]], + 'bad first byte (padded)' => ["FF 35 F4 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]], + 'bad second byte (padded)' => ["81 FF F4 37 00 00 00 00", [65533, 65533, 55, 0, 0, 0, 0]], + 'bad third byte (padded)' => ["81 35 FF 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]], + 'bad fourth byte (padded)' => ["81 35 F4 FF 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0]], + 'control first byte (padded)' => ["00 35 F4 37 00 00 00 00", [0, 53, 65533, 55, 0, 0, 0, 0]], + 'control second byte (padded)' => ["81 00 F4 37 00 00 00 00", [65533, 0, 65533, 55, 0, 0, 0, 0]], + 'control third byte (padded)' => ["81 35 00 37 00 00 00 00", [65533, 53, 0, 55, 0, 0, 0, 0]], + 'control fourth byte (padded)' => ["81 35 F4 00 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0, 0]], // out-of-range sequences - 'void sequence' => ["8432A439", [65533]], - 'void sequence 2' => ["FE39FE39", [65533]], + 'void sequence' => ["84 32 A4 39", [65533]], + 'void sequence 2' => ["FE 39 FE 39", [65533]], + // backward seeking tests + 'seek test 1' => ["81 81 81 30", [20118, 65533]], + 'seek test 2' => ["81 81 80", [20118, 8364]], + 'seek test 3' => ["81 81 00", [20118, 0]], + 'seek test 4' => ["81 81 81 00", [20118, 65533, 0]], + 'seek test 5' => ["81 30 30 30", [65533, 48, 48, 48]], + 'seek test 6' => ["81 30 81 81", [65533, 48, 20118]], + 'seek test 7' => ["30 30 81 81", [48, 48, 20118]], + 'seek test 8' => ["F8 83 FE 80", [40229, 18211]], + 'seek test 1 (padded)' => ["00 00 00 00 81 81 81 30 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 48, 0, 0, 0, 0]], + 'seek test 2 (padded)' => ["00 00 00 00 81 81 80 00 00 00 00", [0, 0, 0, 0, 20118, 8364, 0, 0, 0, 0]], + 'seek test 3 (padded)' => ["00 00 00 00 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 0, 0, 0, 0, 0]], + 'seek test 4 (padded)' => ["00 00 00 00 81 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 0, 0, 0, 0, 0]], + 'seek test 5 (padded)' => ["00 00 00 00 81 30 30 30 00 00 00 00", [0, 0, 0, 0, 65533, 48, 48, 48, 0, 0, 0, 0]], + 'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]], + 'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]], + 'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]], + ]; } @@ -185,4 +204,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase { } } } + + protected function prepString(string $str): string { + return hex2bin(str_replace(" ", "", $str)); + } } diff --git a/tools/mktestgbk.html b/tools/mktestgbk.html index c1e7473..edda407 100644 --- a/tools/mktestgbk.html +++ b/tools/mktestgbk.html @@ -1,10 +1,10 @@ -

+