Browse Source

Fix numerous bugs with gb18030

span
J. King 6 years ago
parent
commit
1b9889914a
  1. 82
      lib/Encoding/GBCommon.php
  2. 101
      tests/cases/Encoding/TestGB18030.php
  3. 29
      tools/mktestgbk.html

82
lib/Encoding/GBCommon.php

@ -35,12 +35,14 @@ abstract class GBCommon implements StatelessEncoding {
return 0x20AC;
} elseif ($b > 0x80 && $b < 0xFF) {
$first = $b;
continue;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} elseif ($second === 0) {
if ($b > 0x29 && $b < 0x40) {
if ($b > 0x2F && $b < 0x3A) {
$second = $b;
continue;
} else {
if (($b > 0x39 && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) {
$offset = ($b < 0x7F) ? 0x40 : 0x41;
@ -55,12 +57,13 @@ abstract class GBCommon implements StatelessEncoding {
} elseif ($third === 0) {
if ($b > 0x80 && $b < 0xFF) {
$third = $b;
continue;
} else {
$this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
}
} else {
if ($b > 0x29 && $b < 0x40) {
if ($b > 0x2F && $b < 0x3A) {
// look up code point
$pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30;
if ($pointer === 7457) {
@ -84,6 +87,7 @@ abstract class GBCommon implements StatelessEncoding {
}
}
}
$this->posByte--;
if (($first + $second + $third) == 0) {
// clean EOF
$this->posChar--;
@ -91,7 +95,7 @@ abstract class GBCommon implements StatelessEncoding {
} else {
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
}
}
@ -154,43 +158,49 @@ abstract class GBCommon implements StatelessEncoding {
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x30 || $b1 == 0x80 || $b1 == 0xFF || $this->posByte == 0) { // these bytes are never part of a sequence, and the first byte is necessarily the start of a sequence
if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
} else {
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence
// the first byte was a character
$this->posByte += 1;
continue;
} elseif ($b1 > 0x39) {
// two-byte character
continue;
} elseif ($this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string
// the first byte was a character
$this->posByte += 1;
continue;
} else {
// go back a third byte
$b3 = ord(@$this->string[--$this->posByte]);
if ($b3 < 0x30 || $b3 > 0x39) { // these bytes never appear third-to-last in a sequence
// the first byte was a character
$this->posByte += 2;
continue;
} else {
// go back a fourth byte
$b4 = ord(@$this->string[--$this->posByte]);
if ($b4 < 0x81 || $b4 == 0xFF) { // these bytes never appear first in a sequence
// the first byte was a character
$this->posByte += 3;
continue;
} else {
// four-byte character
continue;
}
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence
// the first byte was a character
$this->posByte += 1;
continue;
} elseif ($b1 < 0x40 && $this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string
// the first byte was a character
$this->posByte += 1;
continue;
} elseif ($b1 > 0x39) { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
$start = $this->posByte + 2;
// go back bytes until a definite trail byte or end of string
while ($this->posByte > 0) {
if ($b2 < 0x81 || $b2 == 0xFF) {
$this->posByte++;
break;
}
$b2 = ord(@$this->string[--$this->posByte]);
}
// if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte
$this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2);
continue;
}
// go back a third byte
$b3 = ord(@$this->string[--$this->posByte]);
if ($b3 > 0x39 || $b3 < 0x30) { // these bytes never appear in the second position of a four-byte sequence
// the first byte was a character
$this->posByte += 2;
continue;
}
// go back a fourth byte
$b4 = ord(@$this->string[--$this->posByte]);
if (($b4 < 0x81 || $b4 == 0xFF)) { // these bytes never appear first in a four-byte sequence
// the first byte was a character
$this->posByte += 3;
continue;
} else {
// this is a four-byte character
}
}
return $distance;

101
tests/cases/Encoding/TestGB18030.php

@ -36,7 +36,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
* @covers MensBeam\Intl\Encoding\GB18030::posChar
*/
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
$s = new GB18030(hex2bin($input));
$input = $this->prepString($input);
$s = new GB18030($input);
$out = [];
$a = 0;
$this->assertSame($a, $s->posChar());
@ -45,6 +46,7 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$out[] = $p;
}
$this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
}
/**
@ -56,33 +58,32 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$exp = array_map(function($v) {
return \IntlChar::chr($v);
}, $exp);
$s = new GB18030(hex2bin($input));
$input = $this->prepString($input);
$s = new GB18030($input);
$out = [];
while (($p = $s->nextChar()) !== "") {
$out[] = $p;
}
$this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::seekBack
*/
public function testSTepBackThroughAString(string $input, array $points) {
$s = new GB18030(hex2bin($input));
$a = 0;
$test1 = [];
$test2 = [];
while (($p1 = $s->nextCode()) !== false) {
$test1[] = $p1;
$this->assertSame(0, $s->seek(-1));
$p2 = $s->nextCode();
$test2[] = $p2;
$this->assertSame($p1, $p2, "Mismatch at character position $a");
$this->assertSame(++$a, $s->posChar(), "Character position should be $a");
public function testSTepBackThroughAString(string $input, array $exp) {
$input = $this->prepString($input);
$s = new GB18030($input);
$exp = array_reverse($exp);
$act = [];
while ($s->nextCode() !== false);
while($s->posByte()) {
$s->seek(-1);
$act[] = $s->nextCode();
$s->seek(-1);
}
$this->assertSame($points, $test1);
$this->assertSame($points, $test2);
$this->assertEquals($exp, $act);
}
public function provideCodePoints() {
@ -133,34 +134,52 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
// valid single characters
'sanity check' => ["40", [64]],
'special case for 0x80' => ["80", [8364]],
'four-byte special case' => ["8135F437", [59335]],
'two-byte character' => ["A84E", [8735]],
'four-byte character' => ["8231A237", [15081]],
'four-byte special case' => ["81 35 F4 37", [59335]],
'two-byte character' => ["A8 4E", [8735]],
'four-byte character' => ["82 31 A2 37", [15081]],
// cut sequences
'EOF after first byte' => ["82", [65533]],
'EOF after second byte' => ["8230", [65533]],
'EOF after third byte' => ["823081", [65533]],
'EOF after second byte' => ["82 30", [65533]],
'EOF after third byte' => ["82 30 81", [65533]],
// invalid sequences
'bad first byte' => ["FF35F437", [65533, 53, 65533]],
'bad second byte' => ["81FFF437", [65533, 65533]],
'bad third byte' => ["8135FF37", [65533, 53, 65533, 55]],
'bad fourth byte' => ["8135F4FF", [65533, 53, 65533]],
'control first byte' => ["0035F437", [0, 53, 65533]],
'control second byte' => ["8100F437", [65533, 0, 65533]],
'control third byte' => ["81350037", [65533, 53, 0, 55]],
'control fourth byte' => ["8135F400", [65533, 53, 65533, 0]],
'bad first byte' => ["FF 35 F4 37", [65533, 53, 65533]],
'bad second byte' => ["81 FF F4 37", [65533, 65533]],
'bad third byte' => ["81 35 FF 37", [65533, 53, 65533, 55]],
'bad fourth byte' => ["81 35 F4 FF", [65533, 53, 65533]],
'control first byte' => ["00 35 F4 37", [0, 53, 65533]],
'control second byte' => ["81 00 F4 37", [65533, 0, 65533]],
'control third byte' => ["81 35 00 37", [65533, 53, 0, 55]],
'control fourth byte' => ["81 35 F4 00", [65533, 53, 65533, 0]],
// invalid sequences with clean EOF
'bad first byte (padded)' => ["FF35F43700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad second byte (padded)' => ["81FFF43700000000", [65533, 65533, 55, 0, 0, 0, 0]],
'bad third byte (padded)' => ["8135FF3700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad fourth byte (padded)' => ["8135F4FF00000000", [65533, 53, 65533, 0, 0, 0, 0]],
'control first byte (padded)' => ["0035F43700000000", [0, 53, 65533, 55, 0, 0, 0, 0]],
'control second byte (padded)' => ["8100F43700000000", [65533, 0, 65533, 55, 0, 0, 0, 0]],
'control third byte (padded)' => ["8135003700000000", [65533, 53, 0, 55, 0, 0, 0, 0]],
'control fourth byte (padded)' => ["8135F40000000000", [65533, 53, 65533, 0, 0, 0, 0, 0]],
'bad first byte (padded)' => ["FF 35 F4 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad second byte (padded)' => ["81 FF F4 37 00 00 00 00", [65533, 65533, 55, 0, 0, 0, 0]],
'bad third byte (padded)' => ["81 35 FF 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad fourth byte (padded)' => ["81 35 F4 FF 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0]],
'control first byte (padded)' => ["00 35 F4 37 00 00 00 00", [0, 53, 65533, 55, 0, 0, 0, 0]],
'control second byte (padded)' => ["81 00 F4 37 00 00 00 00", [65533, 0, 65533, 55, 0, 0, 0, 0]],
'control third byte (padded)' => ["81 35 00 37 00 00 00 00", [65533, 53, 0, 55, 0, 0, 0, 0]],
'control fourth byte (padded)' => ["81 35 F4 00 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0, 0]],
// out-of-range sequences
'void sequence' => ["8432A439", [65533]],
'void sequence 2' => ["FE39FE39", [65533]],
'void sequence' => ["84 32 A4 39", [65533]],
'void sequence 2' => ["FE 39 FE 39", [65533]],
// backward seeking tests
'seek test 1' => ["81 81 81 30", [20118, 65533]],
'seek test 2' => ["81 81 80", [20118, 8364]],
'seek test 3' => ["81 81 00", [20118, 0]],
'seek test 4' => ["81 81 81 00", [20118, 65533, 0]],
'seek test 5' => ["81 30 30 30", [65533, 48, 48, 48]],
'seek test 6' => ["81 30 81 81", [65533, 48, 20118]],
'seek test 7' => ["30 30 81 81", [48, 48, 20118]],
'seek test 8' => ["F8 83 FE 80", [40229, 18211]],
'seek test 1 (padded)' => ["00 00 00 00 81 81 81 30 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 48, 0, 0, 0, 0]],
'seek test 2 (padded)' => ["00 00 00 00 81 81 80 00 00 00 00", [0, 0, 0, 0, 20118, 8364, 0, 0, 0, 0]],
'seek test 3 (padded)' => ["00 00 00 00 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 0, 0, 0, 0, 0]],
'seek test 4 (padded)' => ["00 00 00 00 81 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 0, 0, 0, 0, 0]],
'seek test 5 (padded)' => ["00 00 00 00 81 30 30 30 00 00 00 00", [0, 0, 0, 0, 65533, 48, 48, 48, 0, 0, 0, 0]],
'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]],
'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]],
'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]],
];
}
@ -185,4 +204,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
}
}
}
/**
 * Converts a human-readable hex dump into the raw byte string it describes.
 *
 * Space characters in the input are removed before decoding, so byte pairs
 * may be written either run together ("A84E") or separated ("A8 4E").
 *
 * @param string $str Hexadecimal digits, optionally separated by spaces
 * @return string The decoded binary string, as produced by hex2bin()
 */
protected function prepString(string $str): string {
    $hexDigits = str_replace(" ", "", $str);
    return hex2bin($hexDigits);
}
}

29
tools/mktestgbk.html

@ -1,10 +1,10 @@
<!DOCTYPE html>
<meta charset=gb18030>
<!-- Correct results are provided by Firefox -->
<pre></pre>
<pre style="font-family: 'Consolas', monospace;"></pre>
<script>
var data = [
// basics
{ encoding: 'gb18030', input: [], name: 'empty string' },
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' },
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' },
@ -31,11 +31,27 @@ var data = [
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x30], name: 'seek test 1' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x80], name: 'seek test 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x00], name: 'seek test 3' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x00], name: 'seek test 4' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x30, 0x30], name: 'seek test 5' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x81, 0x81], name: 'seek test 6' },
{ encoding: 'gb18030', input: [0x30, 0x30, 0x81, 0x81], name: 'seek test 7' },
{ encoding: 'gb18030', input: [0xF8, 0x83, 0xFE, 0x80], name: 'seek test 8' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 1 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 2 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 3 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 4 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 5 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 6 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 7 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0xF8, 0x83, 0xFE, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 8 (padded)' },
];
data.forEach(function(data) {
var bytes = "";
var bytes = [];
data.input.forEach((p) => {
bytes = bytes + p.toString(16).padStart(2, "0").toUpperCase()
bytes.push(p.toString(16).padStart(2, "0").toUpperCase());
});
var codes = [];
var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input));
@ -48,6 +64,7 @@ data.forEach(function(data) {
}
codes[b++] = point;
}
bytes = bytes.join(" ");
codes = codes.join(", ");
var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n";
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
@ -62,8 +79,8 @@ document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\
var bytes = "";
for (let a = 0; a < url.length; a++) {
if (url.charAt(a) == "%") {
bytes = bytes.concat(url.charAt(a + 1), url.charAt(a + (padded)));
a = a + (padded);
bytes = bytes.concat(url.charAt(a + 1), url.charAt(a + 2));
a = a + 2;
} else {
bytes = bytes.concat(url.charCodeAt(a).toString(16).padStart(2, "0"));
}

Loading…
Cancel
Save