Browse Source

Complete Big5 tests, with numerous fixes

span
J. King 6 years ago
parent
commit
bfc6c677c5
  1. 17
      lib/Encoding/Big5.php
  2. 2
      lib/Encoding/GenericEncoding.php
  3. 52
      tests/cases/Encoding/TestBig5.php
  4. 14
      tests/lib/DecoderTest.php
  5. 35
      tools/mkindex.php
  6. 40
      tools/test-big5.html

17
lib/Encoding/Big5.php

File diff suppressed because one or more lines are too long

2
lib/Encoding/GenericEncoding.php

@ -81,7 +81,7 @@ trait GenericEncoding {
return $distance;
} elseif ($distance < 0) {
$distance = abs($distance);
if (!$this->posByte) {
if (!$this->posChar) {
return $distance;
}
$mode = $this->errMode;

52
tests/cases/Encoding/TestBig5.php

@ -12,12 +12,21 @@ use MensBeam\Intl\Encoding\EncoderException;
class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $testedClass = Big5::class;
protected $seekString = "";
protected $seekCodes = [];
protected $seekOffsets = [];
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+86CC (2 bytes) Offset 1
Char 2 U+6C34 (2 bytes) Offset 3
Char 3 U+00CA (0 bytes) Offset 5
Char 4 U+0304 (2 bytes) Offset 5
Char 5 U+00EA (0 bytes) Offset 7
Char 6 U+030C (2 bytes) Offset 7
End of string at char 7, offset 9
*/
protected $seekString = "7A D7AA A4F4 8862 88A5";
protected $seekCodes = [0x7A, 0x86CC, 0x6C34, 0xCA, 0x304, 0xEA, 0x30C];
protected $seekOffsets = [0, 1, 3, 5, 5, 7, 7, 9];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
protected $lowerA = "a";
/**
* @dataProvider provideCodePoints
@ -118,21 +127,38 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
}
public function provideCodePoints() {
return [];
$series = [
return [
'U+0064 (HTML)' => [false, 0x64, "64"],
'U+0064 (fatal)' => [true, 0x64, "64"],
'U+00CA (HTML)' => [false, 0xCA, bin2hex("&#202;")],
'U+00CA (fatal)' => [true, 0xCA, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+3007 (HTML)' => [false, 0x3007, "C6 E2"],
'U+3007 (fatal)' => [true, 0x3007, "C6 E2"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
];
foreach ($series as $name => $test) {
yield "$name (fatal)" => array_merge([true], $test);
yield "$name (HTML)" => array_merge([false], $test);
}
}
public function provideStrings() {
return [];
return [
// control samples
'empty string' => ["", []],
'sanity check' => ["61 62 63 31 32 33", [97, 98, 99, 49, 50, 51]],
'sanity check' => ["40", [64]],
'two-byte character' => ["D7 D7", [36290]],
'EOF after first byte' => ["D7", [65533]],
'low byte after first byte' => ["D7 39", [65533, 57]],
'0x80 as first byte' => ["80 D7", [65533, 65533]],
'0xFF as first byte' => ["FF D7", [65533, 65533]],
'invalid high byte as first byte' => ["81 D7", [65533]],
'0x7F after first byte' => ["D7 7F", [65533, 127]],
'0xFF after first byte' => ["D7 FF", [65533]],
'invalid high byte after first byte' => ["D7 81", [65533]],
'double-characters low' => ["88 62 88 64", [202, 772, 202, 780]],
'double-characters high' => ["88 A3 88 A5", [234, 772, 234, 780]],
'mixed string' => ["7A D7 AA A4 F4 88 62 88 A5", [122, 34508, 27700, 202, 772, 234, 780]],
'mixed string 2' => ["62 D7 D7 D7 D7 62", [98, 36290, 36290, 98]],
'broken string' => ["00 FF 00", [0, 65533, 0]],
];
}

14
tests/lib/DecoderTest.php

@ -47,9 +47,14 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$s = new $class($input);
$exp = array_reverse($exp);
$act = [];
while ($s->nextCode() !== false);
while ($s->posByte()) {
$s->seek(-1);
$pos = 0;
while ($s->nextCode() !== false) {
$this->assertSame(++$pos, $s->posChar());
}
$this->assertSame(sizeof($exp), $pos);
while ($s->posChar()) {
$this->assertSame(0, $s->seek(-1));
$this->assertSame(--$pos, $s->posChar());
$act[] = $s->nextCode();
$s->seek(-1);
}
@ -245,7 +250,8 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
}
$this->assertSame(2, $s->posChar());
$this->assertSame(0x00, $s->nextCode());
$s->seek(-2);
$this->assertSame(3, $s->posChar());
$this->assertSame(0, $s->seek(-2));
$this->assertSame(1, $s->posChar());
try {
$p = $s->peekCode();

35
tools/mkindex.php

@ -88,8 +88,41 @@ function big5(string $label) {
1166 => [0x00EA, 0x030C],
]
ARRAY_LITERAL;
// compile an encoder table
// see https://encoding.spec.whatwg.org/#index-big5-pointer for particulars
// first get the decoder table as an array
$table = eval("return $codes;");
// filter out the low end of the table containing Hong Kong Supplement characters, which are not used during encoding
$table = array_filter($table, function($key) {
return (!($key < ((0xA1 - 0x81) * 157)));
}, \ARRAY_FILTER_USE_KEY);
// for each unique code point, find its pointer in the table: the last matching pointer for a specific set of code points, the first matching pointer for all others
$enc = [];
$a = 0;
$points = array_unique($table);
sort($points);
foreach ($points as $point) {
// find the correct pointer
if (in_array($point, [0x2550, 0x255E, 0x256A, 0x5341, 0x5345])) {
$pointer = array_search($point, array_reverse($table, true));
} else {
$pointer = array_search($point, $table);
}
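// emit an explicit key only when the code point does not immediately follow the previous one; consecutive code points rely on the array literal's implicit key numbering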
if ($a == $point) {
$key = "";
} else {
$a = $point;
$key = "$point=>";
}
$a++;
$enc[] = "$key$pointer";
}
// compose the encoder table literal
$enc = "[".implode(",", $enc)."]";
echo "const TABLE_CODES = $codes;\n";
echo "const TABLE_DOUBLES = $specials;\n";
echo "const TABLE_ENC = $enc;\n";
}
function euckr(string $label) {
@ -143,7 +176,7 @@ function make_decoder_char_array(array $entries): string {
return "[".implode(",", $out)."]";
}
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays or use custom tables
function make_encoder_array(array $entries): string {
$out = [];
foreach ($entries as $match) {

40
tools/test-big5.html

@ -5,10 +5,50 @@ var sampleStrings = {
'empty string': "",
// valid single characters
'sanity check': "40",
'two-byte character': "D7 D7",
// invalid sequences
'EOF after first byte': "D7",
'low byte after first byte': "D7 39",
'0x80 as first byte': "80 D7",
'0xFF as first byte': "FF D7",
'invalid high byte as first byte': "81 D7",
'0x7F after first byte': "D7 7F",
'0xFF after first byte': "D7 FF",
'invalid high byte after first byte': "D7 81",
'non-character': "88 66",
// double sequences
'double-characters low': "88 62 88 64",
'double-characters high': "88 A3 88 A5",
// mixed string
'mixed string': "7A D7 AA A4 F4 88 62 88 A5",
'mixed string 2': "62 D7 D7 D7 D7 62",
};
var sampleCharacters = {
'U+0064': 0x64,
'U+00CA': 0xCA,
'U+3007': 0x3007,
'-1': -1,
'0x110000': 0x110000,
};
var seekCodePoints = [
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+86CC (2 bytes) Offset 1
Char 2 U+6C34 (2 bytes) Offset 3
Char 3 U+00CA (0 bytes) Offset 5
Char 4 U+0304 (2 bytes) Offset 5
Char 5 U+00EA (0 bytes) Offset 7
Char 6 U+030C (2 bytes) Offset 7
End of string at char 7, offset 9
*/
0x007A,
0x86CC,
0x6C34,
// these four should be replaced with bytes 8862 88A5, which together produce four characters
0x00CA,
0x0304,
0x00EA,
0x030C,
];
</script>
<script src="test.js"></script>

Loading…
Cancel
Save