Browse Source

Fixes for multi-byte index-based encoders

- array_flip() retains the last duplicate, whereas we need the first
- Indexes are now prepared with a list of duplicated code points and
their first pointers, which is searched before flipping
- This affected only U+3000 in GBK
- Big5 did not use array_flip(), but its list of override code points
did not include U+2561; Big5 now flips like the others
- EUC-JP had a long list of errors, but this encoding was not
previously released
- Shift_JIS's indexes are probably still not correct
multi-byte
J. King 4 years ago
parent
commit
d9b8cd8dd1
  1. 9
      lib/Encoding/Big5.php
  2. 12
      lib/Encoding/EUCJP.php
  3. 7
      lib/Encoding/GBCommon.php
  4. 4
      tests/cases/Encoding/TestBig5.php
  5. 2
      tests/cases/Encoding/TestEUCJP.php
  6. 4
      tests/cases/Encoding/TestGB18030.php
  7. 200
      tools/mkindex.php
  8. 2
      tools/test-big5.html
  9. 1
      tools/test-eucjp.html
  10. 1
      tools/test-gb18030.html
  11. 1
      tools/test-gbk.html

9
lib/Encoding/Big5.php

File diff suppressed because one or more lines are too long

12
lib/Encoding/EUCJP.php

File diff suppressed because one or more lines are too long

7
lib/Encoding/GBCommon.php

File diff suppressed because one or more lines are too long

4
tests/cases/Encoding/TestBig5.php

@ -154,6 +154,10 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+3007 (fatal)' => [true, 0x3007, "C6 E2"], 'U+3007 (fatal)' => [true, 0x3007, "C6 E2"],
'U+5341 (HTML)' => [false, 0x5341, "A4 51"], 'U+5341 (HTML)' => [false, 0x5341, "A4 51"],
'U+5341 (fatal)' => [true, 0x5341, "A4 51"], 'U+5341 (fatal)' => [true, 0x5341, "A4 51"],
'U+2561 (HTML)' => [false, 0x2561, "F9 EB"],
'U+2561 (fatal)' => [true, 0x2561, "F9 EB"],
'U+256D (HTML)' => [false, 0x256D, "A2 7E"],
'U+256D (fatal)' => [true, 0x256D, "A2 7E"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],

2
tests/cases/Encoding/TestEUCJP.php

@ -160,6 +160,8 @@ class TestEUCJP extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+00E6 (fatal)' => [true, 0xE6, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], 'U+00E6 (fatal)' => [true, 0xE6, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+FFE2 (HTML)' => [false, 0xFFE2, "A2 CC"], 'U+FFE2 (HTML)' => [false, 0xFFE2, "A2 CC"],
'U+FFE2 (fatal)' => [true, 0xFFE2, "A2 CC"], 'U+FFE2 (fatal)' => [true, 0xFFE2, "A2 CC"],
'U=2116 (HTML)' => [false, 0x2116, "AD E2"],
'U=2116 (fatal)' => [true, 0x2116, "AD E2"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],

4
tests/cases/Encoding/TestGB18030.php

@ -161,6 +161,8 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+1D11E (fatal)' => [true, 0x1D11E, "94 32 BE 34"], 'U+1D11E (fatal)' => [true, 0x1D11E, "94 32 BE 34"],
'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")], 'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")],
'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], 'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+3000 (HTML)' => [false, 0x3000, "A1 A1"],
'U+3000 (fatal)' => [true, 0x3000, "A1 A1"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
@ -181,6 +183,8 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+1D11E (fatal)' => [true, 0x1D11E, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], 'U+1D11E (fatal)' => [true, 0x1D11E, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")], 'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")],
'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], 'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+3000 (HTML)' => [false, 0x3000, "A1 A1"],
'U+3000 (fatal)' => [true, 0x3000, "A1 A1"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],

200
tools/mkindex.php

@ -48,25 +48,27 @@ if (!isset($labels[$label])) {
// encoding-specific output generators // encoding-specific output generators
function single_byte(string $label) { function single_byte(string $label) {
$entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"); $table = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
$dec_char = make_decoder_char_array($entries); $dec_char = serialize_char_array($table);
$dec_code = make_decoder_point_array($entries); $dec_code = serialize_point_array($table);
$enc = make_encoder_array($entries); $enc = serialize_single_byte_array($table);
echo "const TABLE_DEC_CHAR = $dec_char;\n"; echo "const TABLE_DEC_CHAR = $dec_char;\n";
echo "const TABLE_DEC_CODE = $dec_code;\n"; echo "const TABLE_DEC_CODE = $dec_code;\n";
echo "const TABLE_ENC = $enc;\n"; echo "const TABLE_ENC = $enc;\n";
} }
function gb18030(string $label) { function gb18030(string $label) {
$dec_gbk = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt")); $gbk = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
$dec_gbk = serialize_point_array($gbk);
$enc_gbk = serialize_point_array(make_override_array($gbk));
$ranges = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt"); $ranges = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt");
$dec_max = []; $dec_max = [];
$dec_off = []; $dec_off = [];
foreach ($ranges as $match) { foreach ($ranges as $pointer => $code) {
// gather the range starts in one array; they will actually be used as range ends // gather the range starts in one array; they will actually be used as range ends
$dec_max[] = (int) $match[1]; $dec_max[] = $pointer;
// gather the starting code points in another array // gather the starting code points in another array
$dec_off[] = hexdec($match[2]); $dec_off[] = $code;
} }
// fudge the top of the ranges // fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1 // see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
@ -77,13 +79,16 @@ function gb18030(string $label) {
$dec_off[] = 0x110000; $dec_off[] = 0x110000;
$dec_max = "[".implode(",", $dec_max)."]"; $dec_max = "[".implode(",", $dec_max)."]";
$dec_off = "[".implode(",", $dec_off)."]"; $dec_off = "[".implode(",", $dec_off)."]";
echo "const TABLE_GBK = $dec_gbk;\n"; echo "const TABLE_CODES = $dec_gbk;\n";
echo "const TABLE_POINTERS = $enc_gbk;\n";
echo "const TABLE_RANGES = $dec_max;\n"; echo "const TABLE_RANGES = $dec_max;\n";
echo "const TABLE_OFFSETS = $dec_off;\n"; echo "const TABLE_OFFSETS = $dec_off;\n";
} }
function big5(string $label) { function big5(string $label) {
$codes = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt")); // Big5 has unusually complex encoding requirements
// see https://encoding.spec.whatwg.org/#index-big5-pointer for particulars
$table = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
$specials = <<<ARRAY_LITERAL $specials = <<<ARRAY_LITERAL
[ [
1133 => [0x00CA, 0x0304], 1133 => [0x00CA, 0x0304],
@ -92,107 +97,64 @@ function big5(string $label) {
1166 => [0x00EA, 0x030C], 1166 => [0x00EA, 0x030C],
] ]
ARRAY_LITERAL; ARRAY_LITERAL;
// compile an encoder table // split Hong Kong Supplement code points from the rest of Big5
// see https://encoding.spec.whatwg.org/#index-big5-pointer for particulars $stop = (0xA1 - 0x81) * 157;
// first get the decoder table as an array $hk = [];
$table = eval("return $codes;"); $nhk = [];
// filter out the low end of the table containing Hong Kong Supplement characters, which are not used during encoding foreach ($table as $pointer => $code) {
$table = array_filter($table, function($key) { if ($pointer < $stop) {
return (!($key < ((0xA1 - 0x81) * 157))); $hk[$pointer] = $code;
}, \ARRAY_FILTER_USE_KEY);
// search for each unique code point's pointer in the table, the first for some, the last for a specific set
$enc = [];
$a = 0;
$points = array_unique($table);
sort($points);
foreach ($points as $point) {
// find the correct pointer
if (in_array($point, [0x2550, 0x255E, 0x256A, 0x5341, 0x5345])) {
$pointer = array_search($point, array_reverse($table, true));
} else {
$pointer = array_search($point, $table);
}
// step the output array's key
if ($a == $point) {
$key = "";
} else { } else {
$a = $point; $nhk[$pointer] = $code;
$key = "$point=>";
} }
$a++;
$enc[] = "$key$pointer";
} }
// compose the encoder table literal // search the Big5 rump for duplicates
$enc = "[".implode(",", $enc)."]"; $dupes = make_override_array($nhk);
echo "const TABLE_CODES = $codes;\n"; // remove those duplicates which should use the last code point
foreach([0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345] as $code) {
unset($dupes[$code]);
}
// serialize and print; Hong Kong characters are kept separate as they are not used in encoding
$codes_tw = serialize_point_array($nhk);
$codes_hk = serialize_point_array($hk);
$enc = serialize_point_array($dupes);
echo "const TABLE_DOUBLES = $specials;\n"; echo "const TABLE_DOUBLES = $specials;\n";
echo "const TABLE_ENC = $enc;\n"; echo "const TABLE_CODES_TW = $codes_tw;\n";
echo "const TABLE_CODES_HK = $codes_hk;\n";
echo "const TABLE_POINTERS = $enc;\n";
} }
function euckr(string $label) { function euckr(string $label) {
$codes = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt")); $codes = serialize_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
echo "const TABLE_CODES = $codes;\n"; echo "const TABLE_CODES = $codes;\n";
} }
function eucjp(string $label) { function eucjp(string $label) {
$jis0212 = make_decoder_point_array(read_index("jis0212", "https://encoding.spec.whatwg.org/index-jis0212.txt")); $jis0212 = serialize_point_array(read_index("jis0212", "https://encoding.spec.whatwg.org/index-jis0212.txt"));
$jis0208 = make_decoder_point_array(read_index("jis0208", "https://encoding.spec.whatwg.org/index-jis0208.txt")); $table = read_index("jis0208", "https://encoding.spec.whatwg.org/index-jis0208.txt");
$table = eval("return $jis0208;"); $dupes = serialize_point_array(make_override_array($table));
// search for each unique code point's first pointer in the table $jis0208 = serialize_point_array($table);
$enc = []; echo "const TABLE_JIS0208 = $jis0208;\n";
$a = 0;
$points = array_unique($table);
sort($points);
foreach ($points as $point) {
// find the correct pointer
$pointer = array_search($point, $table);
// step the output array's key
if ($a == $point) {
$key = "";
} else {
$a = $point;
$key = "$point=>";
}
$a++;
$enc[] = "$key$pointer";
}
// compose the encoder table literal
$enc = "[".implode(",", $enc)."]";
echo "const TABLE_JIS0208_DEC = $jis0208;\n";
echo "const TABLE_JIS0208_ENC = $enc;\n";
echo "const TABLE_JIS0212 = $jis0212;\n"; echo "const TABLE_JIS0212 = $jis0212;\n";
echo "const TABLE_POINTERS = $dupes;\n";
} }
function shiftjis(string $label) { function shiftjis(string $label) {
$codes = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-jis0208.txt")); $table = read_index($label, "https://encoding.spec.whatwg.org/index-jis0208.txt");
$table = eval("return $codes;"); // exclude a range of pointers from override consideration
// remove the block of pointers between 8272 and 8835 $good = [];
// see https://encoding.spec.whatwg.org#index-shift_jis-pointer foreach ($table as $pointer => $code) {
foreach (range(8272, 8835) as $pointer) { if ($pointer < 8272 || $pointer > 8835) {
unset($table[$pointer]); $good[$pointer] = $code;
}
// now search for each unique code point's first pointer in the table as normal
$enc = [];
$a = 0;
$points = array_unique($table);
sort($points);
foreach ($points as $point) {
// find the correct pointer
$pointer = array_search($point, $table);
// step the output array's key
if ($a == $point) {
$key = "";
} else {
$a = $point;
$key = "$point=>";
} }
$a++;
$enc[] = "$key$pointer";
} }
// compose the encoder table literal // search the rump for duplicates
$enc = "[".implode(",", $enc)."]"; $dupes = make_override_array($good);
echo "const TABLE_CODES_DEC = $codes;\n"; // serialize and print
echo "const TABLE_CODES_ENC = $enc;\n"; $codes = serialize_point_array($table);
$enc = serialize_point_array($dupes);
echo "const TABLE_CODES = $codes;\n";
echo "const TABLE_POINTERS = $enc;\n";
} }
// generic helper functions // generic helper functions
@ -201,36 +163,36 @@ function read_index(string $label, string $url): array {
$data = file_get_contents($url) or die("index file for '$label' could not be retrieved from network."); $data = file_get_contents($url) or die("index file for '$label' could not be retrieved from network.");
// find lines that contain data // find lines that contain data
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER); preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
return $matches; $out = [];
foreach ($matches as list($match, $index, $code)) {
$out[(int) $index] = (int) hexdec($code);
}
return $out;
} }
function make_decoder_point_array(array $entries): string { function serialize_point_array(array $table): string {
$out = []; $out = [];
$i = 0; $i = 0;
foreach ($entries as $match) { foreach ($table as $index => $code) {
$index = (int) $match[1]; // non-sequential indices must be printed, but others can be omitted
$code = hexdec($match[2]); if ($index === $i) {
// missing indexes necessitate specifying keys explicitly
if ($index == $i) {
$key = ""; $key = "";
} else { } else {
$key = "$index=>"; $key = "$index=>";
$i = $index; $i = $index;
} }
$out[] = $key."$code"; $out[] = $key.$code;
$i++; $i++;
} }
return "[".implode(",", $out)."]"; return "[".implode(",", $out)."]";
} }
function make_decoder_char_array(array $entries): string { function serialize_char_array(array $table): string {
$out = []; $out = [];
$i = 0; $i = 0;
foreach ($entries as $match) { foreach ($table as $index => $code) {
$index = (int) $match[1]; // non-sequential indices must be printed, but others can be omitted
$code = $match[2]; if ($index === $i) {
// missing indexes necessitate specifying keys explicitly
if ($index == $i) {
$key = ""; $key = "";
} else { } else {
$key = "$index=>"; $key = "$index=>";
@ -242,12 +204,10 @@ function make_decoder_char_array(array $entries): string {
return "[".implode(",", $out)."]"; return "[".implode(",", $out)."]";
} }
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays or use custom tables // this is only used for single-byte encoders; other encoders instead flip their decoder arrays with overrides for duplicates or special cases
function make_encoder_array(array $entries): string { function serialize_single_byte_array(array $table): string {
$out = []; $out = [];
foreach ($entries as $match) { foreach ($table as $index => $code) {
$index = (int) $match[1];
$code = $match[2];
$byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT)); $byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT));
$out[$code] = "\"\\x$byte\""; $out[$code] = "\"\\x$byte\"";
} }
@ -265,3 +225,17 @@ function make_encoder_array(array $entries): string {
} }
return "[".implode(",", $out)."]"; return "[".implode(",", $out)."]";
} }
// when an index maps several pointers to one code point, encoders must by default use the first of those pointers
// array_flip() keeps the last key seen for a repeated value, so the first pointers are gathered here as overrides
// the result maps code point => first pointer, holds only code points occurring more than once, and is sorted by code point
function make_override_array(array $table): array {
    $overrides = [];
    foreach (array_count_values($table) as $code_point => $occurrences) {
        // code points which appear only once need no override
        if ($occurrences > 1) {
            // array_search() yields the key of the first match in the table's own order
            $overrides[$code_point] = array_search($code_point, $table);
        }
    }
    ksort($overrides);
    return $overrides;
}

2
tools/test-big5.html

@ -28,6 +28,8 @@ var sampleCharacters = {
'U+00CA': 0xCA, 'U+00CA': 0xCA,
'U+3007': 0x3007, 'U+3007': 0x3007,
'U+5341': 0x5341, 'U+5341': 0x5341,
'U+2561': 0x2561,
'U+256D': 0x256D,
'-1': -1, '-1': -1,
'0x110000': 0x110000, '0x110000': 0x110000,
}; };

1
tools/test-eucjp.html

@ -39,6 +39,7 @@ var sampleCharacters = {
'U+2212': 0x2212, 'U+2212': 0x2212,
'U+00E6': 0xE6, 'U+00E6': 0xE6,
'U+FFE2': 0xFFE2, 'U+FFE2': 0xFFE2,
'U=2116': 0x2116,
'-1': -1, '-1': -1,
'0x110000': 0x110000, '0x110000': 0x110000,
}; };

1
tools/test-gb18030.html

@ -60,6 +60,7 @@ var sampleCharacters = {
'U+E7C7': 0xE7C7, 'U+E7C7': 0xE7C7,
'U+1D11E': 0x1D11E, 'U+1D11E': 0x1D11E,
'U+E5E5': 0xE5E5, 'U+E5E5': 0xE5E5,
'U+3000': 0x3000,
'-1': -1, '-1': -1,
'0x110000': 0x110000, '0x110000': 0x110000,
}; };

1
tools/test-gbk.html

@ -9,6 +9,7 @@ var sampleCharacters = {
'U+E7C7': 0xE7C7, 'U+E7C7': 0xE7C7,
'U+1D11E': 0x1D11E, 'U+1D11E': 0x1D11E,
'U+E5E5': 0xE5E5, 'U+E5E5': 0xE5E5,
'U+3000': 0x3000,
'-1': -1, '-1': -1,
'0x110000': 0x110000, '0x110000': 0x110000,
}; };

Loading…
Cancel
Save