Browse Source

Fix numerous bugs with gb18030

span
J. King 6 years ago
parent
commit
1b9889914a
  1. 82
      lib/Encoding/GBCommon.php
  2. 101
      tests/cases/Encoding/TestGB18030.php
  3. 29
      tools/mktestgbk.html

82
lib/Encoding/GBCommon.php

@ -35,12 +35,14 @@ abstract class GBCommon implements StatelessEncoding {
return 0x20AC; return 0x20AC;
} elseif ($b > 0x80 && $b < 0xFF) { } elseif ($b > 0x80 && $b < 0xFF) {
$first = $b; $first = $b;
continue;
} else { } else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} elseif ($second === 0) { } elseif ($second === 0) {
if ($b > 0x29 && $b < 0x40) { if ($b > 0x2F && $b < 0x3A) {
$second = $b; $second = $b;
continue;
} else { } else {
if (($b > 0x39 && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) { if (($b > 0x39 && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) {
$offset = ($b < 0x7F) ? 0x40 : 0x41; $offset = ($b < 0x7F) ? 0x40 : 0x41;
@ -55,12 +57,13 @@ abstract class GBCommon implements StatelessEncoding {
} elseif ($third === 0) { } elseif ($third === 0) {
if ($b > 0x80 && $b < 0xFF) { if ($b > 0x80 && $b < 0xFF) {
$third = $b; $third = $b;
continue;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} else { } else {
if ($b > 0x29 && $b < 0x40) { if ($b > 0x2F && $b < 0x3A) {
// look up code point // look up code point
$pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30; $pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30;
if ($pointer === 7457) { if ($pointer === 7457) {
@ -84,6 +87,7 @@ abstract class GBCommon implements StatelessEncoding {
} }
} }
} }
$this->posByte--;
if (($first + $second + $third) == 0) { if (($first + $second + $third) == 0) {
// clean EOF // clean EOF
$this->posChar--; $this->posChar--;
@ -91,7 +95,7 @@ abstract class GBCommon implements StatelessEncoding {
} else { } else {
// dirty EOF; note how many bytes the last character had // dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
} }
} }
@ -154,43 +158,49 @@ abstract class GBCommon implements StatelessEncoding {
} }
// go back one byte // go back one byte
$b1 = ord(@$this->string[--$this->posByte]); $b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x30 || $b1 == 0x80 || $b1 == 0xFF || $this->posByte == 0) { // these bytes are never part of a sequence, and the first byte is necessarily the start of a sequence if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character // the byte is a character
continue; continue;
} else { }
// go back a second byte // go back a second byte
$b2 = ord(@$this->string[--$this->posByte]); $b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence
// the first byte was a character // the first byte was a character
$this->posByte += 1; $this->posByte += 1;
continue; continue;
} elseif ($b1 > 0x39) { } elseif ($b1 < 0x40 && $this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string
// two-byte character // the first byte was a character
continue; $this->posByte += 1;
} elseif ($this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string continue;
// the first byte was a character } elseif ($b1 > 0x39) { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
$this->posByte += 1; $start = $this->posByte + 2;
continue; // go back bytes until a definite trail byte or end of string
} else { while ($this->posByte > 0) {
// go back a third byte if ($b2 < 0x81 || $b2 == 0xFF) {
$b3 = ord(@$this->string[--$this->posByte]); $this->posByte++;
if ($b3 < 0x30 || $b3 > 0x39) { // these bytes never appear third-to-last in a sequence break;
// the first byte was a character
$this->posByte += 2;
continue;
} else {
// go back a fourth byte
$b4 = ord(@$this->string[--$this->posByte]);
if ($b4 < 0x81 || $b4 == 0xFF) { // these bytes never appear first in a sequence
// the first byte was a character
$this->posByte += 3;
continue;
} else {
// four-byte character
continue;
}
} }
$b2 = ord(@$this->string[--$this->posByte]);
} }
// if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte
$this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2);
continue;
}
// go back a third byte
$b3 = ord(@$this->string[--$this->posByte]);
if ($b3 > 0x39 || $b3 < 0x30) { // these bytes never appear in the second position of a four-byte sequence
// the first byte was a character
$this->posByte += 2;
continue;
}
// go back a fourth byte
$b4 = ord(@$this->string[--$this->posByte]);
if (($b4 < 0x81 || $b4 == 0xFF)) { // these bytes never appear first in a four-byte sequence
// the first byte was a character
$this->posByte += 3;
continue;
} else {
// this is a four-byte character
} }
} }
return $distance; return $distance;

101
tests/cases/Encoding/TestGB18030.php

@ -36,7 +36,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
* @covers MensBeam\Intl\Encoding\GB18030::posChar * @covers MensBeam\Intl\Encoding\GB18030::posChar
*/ */
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
$s = new GB18030(hex2bin($input)); $input = $this->prepString($input);
$s = new GB18030($input);
$out = []; $out = [];
$a = 0; $a = 0;
$this->assertSame($a, $s->posChar()); $this->assertSame($a, $s->posChar());
@ -45,6 +46,7 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$out[] = $p; $out[] = $p;
} }
$this->assertSame($exp, $out); $this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
} }
/** /**
@ -56,33 +58,32 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$exp = array_map(function($v) { $exp = array_map(function($v) {
return \IntlChar::chr($v); return \IntlChar::chr($v);
}, $exp); }, $exp);
$s = new GB18030(hex2bin($input)); $input = $this->prepString($input);
$s = new GB18030($input);
$out = []; $out = [];
while (($p = $s->nextChar()) !== "") { while (($p = $s->nextChar()) !== "") {
$out[] = $p; $out[] = $p;
} }
$this->assertSame($exp, $out); $this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
} }
/** /**
* @dataProvider provideStrings * @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::seekBack * @covers MensBeam\Intl\Encoding\GB18030::seekBack
*/ */
public function testSTepBackThroughAString(string $input, array $points) { public function testSTepBackThroughAString(string $input, array $exp) {
$s = new GB18030(hex2bin($input)); $input = $this->prepString($input);
$a = 0; $s = new GB18030($input);
$test1 = []; $exp = array_reverse($exp);
$test2 = []; $act = [];
while (($p1 = $s->nextCode()) !== false) { while ($s->nextCode() !== false);
$test1[] = $p1; while($s->posByte()) {
$this->assertSame(0, $s->seek(-1)); $s->seek(-1);
$p2 = $s->nextCode(); $act[] = $s->nextCode();
$test2[] = $p2; $s->seek(-1);
$this->assertSame($p1, $p2, "Mismatch at character position $a");
$this->assertSame(++$a, $s->posChar(), "Character position should be $a");
} }
$this->assertSame($points, $test1); $this->assertEquals($exp, $act);
$this->assertSame($points, $test2);
} }
public function provideCodePoints() { public function provideCodePoints() {
@ -133,34 +134,52 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
// valid single characters // valid single characters
'sanity check' => ["40", [64]], 'sanity check' => ["40", [64]],
'special case for 0x80' => ["80", [8364]], 'special case for 0x80' => ["80", [8364]],
'four-byte special case' => ["8135F437", [59335]], 'four-byte special case' => ["81 35 F4 37", [59335]],
'two-byte character' => ["A84E", [8735]], 'two-byte character' => ["A8 4E", [8735]],
'four-byte character' => ["8231A237", [15081]], 'four-byte character' => ["82 31 A2 37", [15081]],
// cut sequences // cut sequences
'EOF after first byte' => ["82", [65533]], 'EOF after first byte' => ["82", [65533]],
'EOF after second byte' => ["8230", [65533]], 'EOF after second byte' => ["82 30", [65533]],
'EOF after third byte' => ["823081", [65533]], 'EOF after third byte' => ["82 30 81", [65533]],
// invalid sequences // invalid sequences
'bad first byte' => ["FF35F437", [65533, 53, 65533]], 'bad first byte' => ["FF 35 F4 37", [65533, 53, 65533]],
'bad second byte' => ["81FFF437", [65533, 65533]], 'bad second byte' => ["81 FF F4 37", [65533, 65533]],
'bad third byte' => ["8135FF37", [65533, 53, 65533, 55]], 'bad third byte' => ["81 35 FF 37", [65533, 53, 65533, 55]],
'bad fourth byte' => ["8135F4FF", [65533, 53, 65533]], 'bad fourth byte' => ["81 35 F4 FF", [65533, 53, 65533]],
'control first byte' => ["0035F437", [0, 53, 65533]], 'control first byte' => ["00 35 F4 37", [0, 53, 65533]],
'control second byte' => ["8100F437", [65533, 0, 65533]], 'control second byte' => ["81 00 F4 37", [65533, 0, 65533]],
'control third byte' => ["81350037", [65533, 53, 0, 55]], 'control third byte' => ["81 35 00 37", [65533, 53, 0, 55]],
'control fourth byte' => ["8135F400", [65533, 53, 65533, 0]], 'control fourth byte' => ["81 35 F4 00", [65533, 53, 65533, 0]],
// invalid sequences with clean EOF // invalid sequences with clean EOF
'bad first byte (padded)' => ["FF35F43700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]], 'bad first byte (padded)' => ["FF 35 F4 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad second byte (padded)' => ["81FFF43700000000", [65533, 65533, 55, 0, 0, 0, 0]], 'bad second byte (padded)' => ["81 FF F4 37 00 00 00 00", [65533, 65533, 55, 0, 0, 0, 0]],
'bad third byte (padded)' => ["8135FF3700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]], 'bad third byte (padded)' => ["81 35 FF 37 00 00 00 00", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad fourth byte (padded)' => ["8135F4FF00000000", [65533, 53, 65533, 0, 0, 0, 0]], 'bad fourth byte (padded)' => ["81 35 F4 FF 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0]],
'control first byte (padded)' => ["0035F43700000000", [0, 53, 65533, 55, 0, 0, 0, 0]], 'control first byte (padded)' => ["00 35 F4 37 00 00 00 00", [0, 53, 65533, 55, 0, 0, 0, 0]],
'control second byte (padded)' => ["8100F43700000000", [65533, 0, 65533, 55, 0, 0, 0, 0]], 'control second byte (padded)' => ["81 00 F4 37 00 00 00 00", [65533, 0, 65533, 55, 0, 0, 0, 0]],
'control third byte (padded)' => ["8135003700000000", [65533, 53, 0, 55, 0, 0, 0, 0]], 'control third byte (padded)' => ["81 35 00 37 00 00 00 00", [65533, 53, 0, 55, 0, 0, 0, 0]],
'control fourth byte (padded)' => ["8135F40000000000", [65533, 53, 65533, 0, 0, 0, 0, 0]], 'control fourth byte (padded)' => ["81 35 F4 00 00 00 00 00", [65533, 53, 65533, 0, 0, 0, 0, 0]],
// out-of-range sequences // out-of-range sequences
'void sequence' => ["8432A439", [65533]], 'void sequence' => ["84 32 A4 39", [65533]],
'void sequence 2' => ["FE39FE39", [65533]], 'void sequence 2' => ["FE 39 FE 39", [65533]],
// backward seeking tests
'seek test 1' => ["81 81 81 30", [20118, 65533]],
'seek test 2' => ["81 81 80", [20118, 8364]],
'seek test 3' => ["81 81 00", [20118, 0]],
'seek test 4' => ["81 81 81 00", [20118, 65533, 0]],
'seek test 5' => ["81 30 30 30", [65533, 48, 48, 48]],
'seek test 6' => ["81 30 81 81", [65533, 48, 20118]],
'seek test 7' => ["30 30 81 81", [48, 48, 20118]],
'seek test 8' => ["F8 83 FE 80", [40229, 18211]],
'seek test 1 (padded)' => ["00 00 00 00 81 81 81 30 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 48, 0, 0, 0, 0]],
'seek test 2 (padded)' => ["00 00 00 00 81 81 80 00 00 00 00", [0, 0, 0, 0, 20118, 8364, 0, 0, 0, 0]],
'seek test 3 (padded)' => ["00 00 00 00 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 0, 0, 0, 0, 0]],
'seek test 4 (padded)' => ["00 00 00 00 81 81 81 00 00 00 00 00", [0, 0, 0, 0, 20118, 65533, 0, 0, 0, 0, 0]],
'seek test 5 (padded)' => ["00 00 00 00 81 30 30 30 00 00 00 00", [0, 0, 0, 0, 65533, 48, 48, 48, 0, 0, 0, 0]],
'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]],
'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]],
'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]],
]; ];
} }
@ -185,4 +204,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
} }
} }
} }
protected function prepString(string $str): string {
return hex2bin(str_replace(" ", "", $str));
}
} }

29
tools/mktestgbk.html

@ -1,10 +1,10 @@
<!DOCTYPE html> <!DOCTYPE html>
<meta charset=gb18030> <meta charset=gb18030>
<!-- Correct results are provided by Firefox --> <!-- Correct results are provided by Firefox -->
<pre></pre> <pre style="font-family: 'Consolas', monospace;"></pre>
<script> <script>
var data = [ var data = [
// basics { encoding: 'gb18030', input: [], name: 'empty string' },
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' }, { encoding: 'gb18030', input: [0x40], name: 'sanity check' },
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' }, { encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' }, { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' },
@ -31,11 +31,27 @@ var data = [
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' }, { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' }, { encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' }, { encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x30], name: 'seek test 1' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x80], name: 'seek test 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x00], name: 'seek test 3' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x00], name: 'seek test 4' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x30, 0x30], name: 'seek test 5' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x81, 0x81], name: 'seek test 6' },
{ encoding: 'gb18030', input: [0x30, 0x30, 0x81, 0x81], name: 'seek test 7' },
{ encoding: 'gb18030', input: [0xF8, 0x83, 0xFE, 0x80], name: 'seek test 8' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 1 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 2 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 3 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 4 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 5 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 6 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 7 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0xF8, 0x83, 0xFE, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 8 (padded)' },
]; ];
data.forEach(function(data) { data.forEach(function(data) {
var bytes = ""; var bytes = [];
data.input.forEach((p) => { data.input.forEach((p) => {
bytes = bytes + p.toString(16).padStart(2, "0").toUpperCase() bytes.push(p.toString(16).padStart(2, "0").toUpperCase());
}); });
var codes = []; var codes = [];
var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input)); var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input));
@ -48,6 +64,7 @@ data.forEach(function(data) {
} }
codes[b++] = point; codes[b++] = point;
} }
bytes = bytes.join(" ");
codes = codes.join(", "); codes = codes.join(", ");
var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n"; var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n";
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
@ -62,8 +79,8 @@ document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\
var bytes = ""; var bytes = "";
for (let a = 0; a < url.length; a++) { for (let a = 0; a < url.length; a++) {
if (url.charAt(a) == "%") { if (url.charAt(a) == "%") {
bytes = bytes.concat(url.charAt(a + 1), url.charAt(a + (padded))); bytes = bytes.concat(url.charAt(a + 1), url.charAt(a + 2));
a = a + (padded); a = a + 2;
} else { } else {
bytes = bytes.concat(url.charCodeAt(a).toString(16).padStart(2, "0")); bytes = bytes.concat(url.charCodeAt(a).toString(16).padStart(2, "0"));
} }

Loading…
Cancel
Save