Browse Source

Implement gb18030 seeking

Also fix some bugs in EOF handling
span
J. King 6 years ago
parent
commit
467c565e8c
  1. 1
      lib/Encoding/GB18030.php
  2. 68
      lib/Encoding/GBCommon.php
  3. 1
      lib/Encoding/GBK.php
  4. 30
      lib/Encoding/GenericEncoding.php
  5. 37
      tests/cases/Encoding/TestGB18030.php
  6. 20
      tools/mktestgbk.html

1
lib/Encoding/GB18030.php

@ -10,5 +10,4 @@ class GB18030 extends GBCommon {
const GBK = false;
const NAME = "gb18030";
const LABELS = ["gb18030"];
}

68
lib/Encoding/GBCommon.php

@ -13,6 +13,8 @@ abstract class GBCommon implements StatelessEncoding {
const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
/** The number of bytes (1 to 3) which made up the truncated final character, recorded by the decoder when the string ended in the middle of a sequence (a "dirty" EOF); zero when no dirty EOF has been encountered. seekBack() rewinds by this many bytes when stepping back from the end of the string */
protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
@ -84,10 +86,12 @@ abstract class GBCommon implements StatelessEncoding {
}
if (($first + $second + $third) == 0) {
// clean EOF
$this->posChar--;
return false;
} else {
// dirty EOF
return self::err($this->errMode, [--$this->posChar, --$this->posByte]);
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
}
}
@ -138,13 +142,57 @@ abstract class GBCommon implements StatelessEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
// stub
/** Implements backward seeking $distance characters
 *
 * The byte pointer is walked backward one character at a time, inspecting up to four
 * trailing bytes to decide whether the character before the pointer is one, two, or
 * four bytes long. The character pointer is decremented once per character stepped over.
 *
 * @param int $distance The (positive) number of characters to step back
 * @return int The number of requested characters which could not be stepped over because the start of the string was reached; zero on complete success
 */
protected function seekBack(int $distance): int {
    while ($distance > 0 && $this->posByte > 0) {
        $distance--;
        $this->posChar--;
        if ($this->posByte == $this->lenByte && $this->dirtyEOF > 0) {
            // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
            $this->posByte -= $this->dirtyEOF;
            continue;
        }
        // go back one byte
        $b1 = ord(@$this->string[--$this->posByte]);
        // Bytes below 0x30, 0x3A through 0x3F, 0x7F, and 0xFF can never end a multi-byte
        //   sequence: valid final bytes are 0x30-0x39 (four-byte) and 0x40-0x7E or 0x80-0xFE (two-byte).
        //   The first byte of the string is necessarily the start of a sequence as well.
        // NOTE(review): 0x80 is treated here as always standing alone, although the gb18030 decoder
        //   also accepts it as the trail byte of a two-byte sequence; confirm this is intended
        if ($b1 < 0x30 || ($b1 > 0x39 && $b1 < 0x40) || $b1 == 0x7F || $b1 == 0x80 || $b1 == 0xFF || $this->posByte == 0) {
            // the byte is a character
            continue;
        }
        // go back a second byte
        $b2 = ord(@$this->string[--$this->posByte]);
        if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence
            // the first byte was a character
            $this->posByte += 1;
            continue;
        }
        if ($b1 > 0x39) {
            // two-byte character: a lead byte followed by a valid trail byte
            continue;
        }
        if ($this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string
            // the first byte was a character
            $this->posByte += 1;
            continue;
        }
        // go back a third byte
        $b3 = ord(@$this->string[--$this->posByte]);
        if ($b3 < 0x30 || $b3 > 0x39) { // these bytes never appear third-to-last in a sequence
            // the first byte was a character
            $this->posByte += 2;
            continue;
        }
        // go back a fourth byte
        $b4 = ord(@$this->string[--$this->posByte]);
        if ($b4 < 0x81 || $b4 == 0xFF) { // these bytes never appear first in a sequence
            // the first byte was a character
            $this->posByte += 3;
        }
        // otherwise all four bytes form one character and the pointer is already at its start
    }
    return $distance;
}
}

1
lib/Encoding/GBK.php

@ -20,5 +20,4 @@ class GBK extends GBCommon {
"iso-ir-58",
"x-gbk",
];
}

30
lib/Encoding/GenericEncoding.php

@ -64,6 +64,36 @@ trait GenericEncoding {
}
}
/** Advance $distance characters through the string
 *
 * If $distance is negative, the operation will be performed in reverse
 *
 * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
 *
 * @param int $distance The number of characters to move; negative values move backward
 * @return int The number of requested characters which could not be traversed; zero on complete success
 */
public function seek(int $distance): int {
    if ($distance > 0) {
        if ($this->posByte == strlen($this->string)) {
            // already at the end of the string: nothing can be consumed
            return $distance;
        }
        // Only count a character as traversed once it has actually been decoded. Testing the
        //   result before decrementing keeps the return value accurate when the end of the
        //   string is reached early (decrementing first would under-report by one)
        while ($distance > 0 && $this->nextCode() !== false) {
            $distance--;
        }
        return $distance;
    } elseif ($distance < 0) {
        $distance = abs($distance);
        if (!$this->posByte) {
            // already at the start of the string: nothing to step back over
            return $distance;
        }
        // switch to MODE_NULL for the backward pass (presumably so that invalid sequences
        //   do not raise while being stepped over), and always restore the caller's mode
        $mode = $this->errMode;
        $this->errMode = self::MODE_NULL;
        try {
            return $this->seekBack($distance);
        } finally {
            $this->errMode = $mode;
        }
    } else {
        return 0;
    }
}
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
public function peekChar(int $num = 1): string {
$out = "";

37
tests/cases/Encoding/TestGB18030.php

@ -16,8 +16,8 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
/**
* @dataProvider provideCodePoints
* @covers MensBeam\Intl\Encoding\GB18030::encode
* @covers MensBeam\Intl\Encoding\GBK::encode
* @covers MensBeam\Intl\Encoding\GB18030::err
* @covers MensBeam\Intl\Encoding\GBK::encode
* @covers MensBeam\Intl\Encoding\GBK::err
*/
public function testEncodeCodePoints(string $class, bool $fatal, int $input, $exp) {
@ -33,11 +33,15 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::__construct
* @covers MensBeam\Intl\Encoding\GB18030::nextCode
* @covers MensBeam\Intl\Encoding\GB18030::posChar
*/
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
$s = new GB18030(hex2bin($input));
$out = [];
$a = 0;
$this->assertSame($a, $s->posChar());
while (($p = $s->nextCode()) !== false) {
$this->assertSame(++$a, $s->posChar());
$out[] = $p;
}
$this->assertSame($exp, $out);
@ -60,6 +64,27 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$this->assertSame($exp, $out);
}
/**
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\GB18030::seekBack
 */
public function testSTepBackThroughAString(string $input, array $points) {
    $decoder = new GB18030(hex2bin($input));
    $forward = [];
    $replayed = [];
    $a = 0;
    // decode each character, step back over it, then decode it a second time:
    //   both passes must yield the same code point and leave the character pointer advanced by one
    for ($p1 = $decoder->nextCode(); $p1 !== false; $p1 = $decoder->nextCode()) {
        $forward[] = $p1;
        $this->assertSame(0, $decoder->seek(-1));
        $p2 = $decoder->nextCode();
        $replayed[] = $p2;
        $this->assertSame($p1, $p2, "Mismatch at character position $a");
        $a++;
        $this->assertSame($a, $decoder->posChar(), "Character position should be $a");
    }
    // both the first pass and the replayed pass must match the expected code points
    $this->assertSame($points, $forward);
    $this->assertSame($points, $replayed);
}
public function provideCodePoints() {
// bytes confirmed using Firefox
return [
@ -104,6 +129,7 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
public function provideStrings() {
return [
'empty string' => ["", []],
// valid single characters
'sanity check' => ["40", [64]],
'special case for 0x80' => ["80", [8364]],
@ -123,6 +149,15 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
'control second byte' => ["8100F437", [65533, 0, 65533]],
'control third byte' => ["81350037", [65533, 53, 0, 55]],
'control fourth byte' => ["8135F400", [65533, 53, 65533, 0]],
// invalid sequences with clean EOF
'bad first byte (padded)' => ["FF35F43700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad second byte (padded)' => ["81FFF43700000000", [65533, 65533, 55, 0, 0, 0, 0]],
'bad third byte (padded)' => ["8135FF3700000000", [65533, 53, 65533, 55, 0, 0, 0, 0]],
'bad fourth byte (padded)' => ["8135F4FF00000000", [65533, 53, 65533, 0, 0, 0, 0]],
'control first byte (padded)' => ["0035F43700000000", [0, 53, 65533, 55, 0, 0, 0, 0]],
'control second byte (padded)' => ["8100F43700000000", [65533, 0, 65533, 55, 0, 0, 0, 0]],
'control third byte (padded)' => ["8135003700000000", [65533, 53, 0, 55, 0, 0, 0, 0]],
'control fourth byte (padded)' => ["8135F40000000000", [65533, 53, 65533, 0, 0, 0, 0, 0]],
// out-of-range sequences
'void sequence' => ["8432A439", [65533]],
'void sequence 2' => ["FE39FE39", [65533]],

20
tools/mktestgbk.html

@ -21,14 +21,14 @@ var data = [
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte 2' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte 2' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte 2' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
];
@ -62,8 +62,8 @@ document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\
var bytes = "";
for (let a = 0; a < url.length; a++) {
if (url.charAt(a) == "%") {
    // a percent-escape has the form "%XX": copy its two hex digits verbatim,
    // then skip past them (the enclosing loop's own increment steps over the "%")
    bytes = bytes.concat(url.charAt(a + 1), url.charAt(a + 2));
    a = a + 2;
} else {
    // any other character stands for itself: emit its character code as two hex digits
    bytes = bytes.concat(url.charCodeAt(a).toString(16).padStart(2, "0"));
}

Loading…
Cancel
Save