Browse Source

Encode correct duplicate pointers in EUC-JP

multi-byte
J. King 4 years ago
parent
commit
b284056644
  1. 8
      lib/Encoding/EUCJP.php
  2. 2
      lib/Encoding/EUCKR.php
  3. 2
      tests/cases/Encoding/TestBig5.php
  4. 2
      tests/cases/Encoding/TestEUCJP.php
  5. 26
      tools/mkindex.php
  6. 1
      tools/test-big5.html
  7. 1
      tools/test-eucjp.html

8
lib/Encoding/EUCJP.php

File diff suppressed because one or more lines are too long

2
lib/Encoding/EUCKR.php

@ -71,7 +71,7 @@ class EUCKR extends AbstractEncoding implements StatelessEncoding {
} elseif ($codePoint < 128) {
return chr($codePoint);
} else {
$pointer = array_flip(self::TABLE_CODES)[$codePoint] ?? null;
$pointer = array_flip(self::TABLE_CODES)[$codePoint] ?? null; // this is safe: the EUC-KR index has no duplicates
if (isset($pointer)) {
$lead = (int) ($pointer / 190) + 0x81;
$trail = ($pointer % 190) + 0x41;

2
tests/cases/Encoding/TestBig5.php

@ -152,6 +152,8 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+00CA (fatal)' => [true, 0xCA, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+3007 (HTML)' => [false, 0x3007, "C6 E2"],
'U+3007 (fatal)' => [true, 0x3007, "C6 E2"],
'U+5341 (HTML)' => [false, 0x5341, "A4 51"],
'U+5341 (fatal)' => [true, 0x5341, "A4 51"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],

2
tests/cases/Encoding/TestEUCJP.php

@ -158,6 +158,8 @@ class TestEUCJP extends \MensBeam\Intl\Test\CoderDecoderTest {
'U+2212 (fatal)' => [true, 0x2212, "A1 DD"],
'U+00E6 (HTML)' => [false, 0xE6, bin2hex("&#230;")],
'U+00E6 (fatal)' => [true, 0xE6, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+FFE2 (HTML)' => [false, 0xFFE2, "A2 CC"],
'U+FFE2 (fatal)' => [true, 0xFFE2, "A2 CC"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],

26
tools/mkindex.php

@ -135,9 +135,31 @@ function euckr(string $label) {
}
/**
 * Prints the PHP constant definitions for the EUC-JP tables.
 *
 * Three constants are echoed, in this order:
 *  - TABLE_JIS0208_DEC: the jis0208 decoder table (pointer => code point)
 *  - TABLE_JIS0208_ENC: an encoder table mapping each distinct code point of
 *    the jis0208 index to the FIRST pointer at which that code point occurs,
 *    so that code points listed more than once encode to their first pointer
 *  - TABLE_JIS0212: the jis0212 table
 *
 * @param string $label Not used in the body of this function
 */
function eucjp(string $label) {
    // each index is read and converted exactly once
    $jis0212 = make_decoder_point_array(read_index("jis0212", "https://encoding.spec.whatwg.org/index-jis0212.txt"));
    $jis0208 = make_decoder_point_array(read_index("jis0208", "https://encoding.spec.whatwg.org/index-jis0208.txt"));
    // the decoder table is a PHP array literal in string form; evaluate it to get a real array
    // NOTE(review): the literal is derived from a downloaded file; this is only as safe as
    //   what make_decoder_point_array() emits -- confirm it can only produce integers
    $table = eval("return $jis0208;");
    // record the first pointer seen for each code point in a single pass over the table
    $first = [];
    foreach ($table as $pointer => $point) {
        if (!isset($first[$point])) {
            $first[$point] = $pointer;
        }
    }
    // the encoder table is keyed by code point, in ascending order
    ksort($first);
    $enc = [];
    $next = 0;
    foreach ($first as $point => $pointer) {
        // omit the key where PHP would assign it implicitly (i.e. it directly follows the previous key)
        $key = ($next === $point) ? "" : "$point=>";
        $next = $point + 1;
        $enc[] = "$key$pointer";
    }
    // compose the encoder table literal
    $enc = "[".implode(",", $enc)."]";
    echo "const TABLE_JIS0208_DEC = $jis0208;\n";
    echo "const TABLE_JIS0208_ENC = $enc;\n";
    echo "const TABLE_JIS0212 = $jis0212;\n";
}

1
tools/test-big5.html

@ -27,6 +27,7 @@ var sampleCharacters = {
'U+0064': 0x64,
'U+00CA': 0xCA,
'U+3007': 0x3007,
'U+5341': 0x5341,
'-1': -1,
'0x110000': 0x110000,
};

1
tools/test-eucjp.html

@ -38,6 +38,7 @@ var sampleCharacters = {
'U+FF96': 0xFF96,
'U+2212': 0x2212,
'U+00E6': 0xE6,
'U+FFE2': 0xFFE2,
'-1': -1,
'0x110000': 0x110000,
};

Loading…
Cancel
Save