Browse Source

Implement gb18030 decoder

span
J. King 6 years ago
parent
commit
d6747532cd
  1. 116
      lib/Encoding/GB18030.php
  2. 94
      tests/cases/Encoding/TestGB18030.php
  3. 1
      tests/phpunit.xml
  4. 43
      tools/mkgbk.php
  5. 2
      tools/mkindex.php
  6. 58
      tools/mktestgbk.html
  7. 41
      tools/mktestgbk.php

116
lib/Encoding/GB18030.php

File diff suppressed because one or more lines are too long

94
tests/cases/Encoding/TestGB18030.php

File diff suppressed because one or more lines are too long

1
tests/phpunit.xml

@ -20,6 +20,7 @@
<testsuite name="Encoding">
<file>cases/Encoding/TestUTF8.php</file>
<file>cases/Encoding/TestSingleByte.php</file>
<file>cases/Encoding/TestGB18030.php</file>
</testsuite>
</testsuites>
</phpunit>

43
tools/mkgbk.php

@ -0,0 +1,43 @@
<?php
// Build the lookup tables for the gb18030 decoder from the WHATWG index files
// and print them as PHP constant declarations
$label = "gb18030";

// two-byte sequences: download the main index
$index = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");
// every data line has the form "<pointer> 0x<code point>"
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $index, $rows, \PREG_SET_ORDER);
// entries are kept in file order and only the code point column is stored
$gbk = array_map("hexdec", array_column($rows, 2));

// four-byte sequences: download the range index
$ranges = file_get_contents("https://encoding.spec.whatwg.org/index-$label-ranges.txt") or die("range index file for '$label' could not be retrieved from network.");
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $ranges, $rows, \PREG_SET_ORDER);
// first column: the pointer at which each range starts (per the original note, these are consumed as range ends)
$starts = array_map("intval", array_column($rows, 1));
// second column: the code point at which each range starts
$offsets = array_map("hexdec", array_column($rows, 2));

// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
// the last range start is taken off, then re-appended between the two boundary pointers
$final_start = array_pop($starts);
array_push($starts, 39420, $final_start, 1237576);
// a literal "null" is slotted in just before the last offset so it lines up with the 39420 boundary
array_splice($offsets, -1, 0, ["null"]);

// output
echo " const TABLE_GBK = [".implode(",", $gbk)."];\n";
echo " const TABLE_RANGES = [".implode(",", $starts)."];\n";
echo " const TABLE_OFFSETS = [".implode(",", $offsets)."];\n";

2
tools/mkindex.php

@ -2,7 +2,7 @@
// retrieve the relevant index file
$label = $argv[1] ?? "";
$label = trim(strtolower($label));
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for $label could not be retrieved from network.");
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");
// find lines that contain data
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
// set up

58
tools/mktestgbk.html

@ -0,0 +1,58 @@
<!DOCTYPE html>
<meta charset=gb18030>
<!-- Correct results are provided by Firefox -->
<pre></pre>
<script>
// Test cases fed to the browser's TextDecoder by the loop that follows this table.
// Each entry has:
//   encoding - the label passed to the TextDecoder constructor
//   input    - the raw bytes to decode, as an array of byte values
//   name     - the label used as the key of the generated output line
var data = [
// basics
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' },
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' },
{ encoding: 'gb18030', input: [0xA8, 0x4E], name: 'two-byte character' },
{ encoding: 'gb18030', input: [0x82, 0x31, 0xA2, 0x37], name: 'four-byte character' },
{ encoding: 'gb18030', input: [0x82], name: 'EOF after first byte' },
{ encoding: 'gb18030', input: [0x82, 0x30], name: 'EOF after second byte' },
{ encoding: 'gb18030', input: [0x82, 0x30, 0x81], name: 'EOF after third byte' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37], name: 'bad first byte' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37], name: 'bad second byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37], name: 'bad third byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF], name: 'bad fourth byte' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37], name: 'control first byte' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte 2' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte 2' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte 2' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte 2' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
]
// Append one line per test case to the <pre> element, in the form
//   'name' => ["HEXBYTES", [code point, code point, ...]],
data.forEach(function(entry) {
    // upper-case hex dump of the input, two digits per byte
    var hex = entry.input.map((byte) => byte.toString(16).padStart(2, "0").toUpperCase()).join("")
    // the browser's own decoder supplies the reference output
    var decoded = new TextDecoder(entry.encoding).decode(new Uint8Array(entry.input))
    // iterating a string yields whole code points, so a non-BMP character appears once;
    // any value still inside the surrogate range (55296-57343) is discarded
    var points = Array.from(decoded, (ch) => ch.codePointAt(0)).filter((cp) => cp < 55296 || cp > 57343)
    var row = "'" + entry.name + "' => [" + '"' + hex + '", [' + points.join(", ") + "]],\n"
    var target = document.getElementsByTagName("pre")[0]
    target.appendChild(document.createTextNode(row));
})
// Builds a detached anchor whose query string holds the single character U+FFFF.
// NOTE(review): nothing visible here reads `l`; its only consumer is the commented-out
// document.write below, so this looks like a leftover experiment — confirm before removing.
var l = document.createElement("a")
l.href = "http://example.com/?" + String.fromCodePoint(0xFFFF)
//document.write(l.search.substr(1))
</script>

41
tools/mktestgbk.php

@ -0,0 +1,41 @@
<?php
// the Web Platform test suite does not have tests for gb18030, but a pull request was made in 2016 with a partial set of tests
// this script generates a test series from those tests which exercises the index and range tables with single characters
// it is a pedantic set of tests, and so the test suite itself only uses this series in an optional test
// series name => WPT page supplying its data; every URL is pinned to the same commit
$standard_tests = [
    'two-byte GBK' => "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/gb18030_chars.html",
    'four-byte Han' => "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/gb18030_extra_han_chars.html",
    'four-byte Hangul' => "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/gb18030_extra_hangul_chars.html",
    'four-byte miscellaneous' => "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/gb18030_extra_misc_chars.html",
    'four-byte private use' => "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/gb18030_extra_pua_chars.html",
];
// print each series as one PHP array entry: 'name' => [[input byte strings], [expected code points]],
foreach ($standard_tests as $name => $url) {
    list($in, $out) = make_standard_test($url);
    echo "'$name' => [[$in], [$out]],\n";
}
/**
 * Reads one test page and flattens its byte/code-point pairs into two strings.
 *
 * Every `<span data-cp="..." data-bytes="...">` element in the page contributes
 * one pair. The script terminates via die() if the page cannot be read.
 *
 * @param string $url Location of the test page
 * @return array Two strings: the comma-separated, double-quoted hex byte
 *               sequences, and the comma-separated decimal code points
 */
function make_standard_test(string $url): array {
    $page = file_get_contents($url) or die("Could not retrieve test $url");
    preg_match_all('/<span data-cp="([^"]+)" data-bytes="([^"]+)">/s', $page, $spans, \PREG_SET_ORDER);
    $inputs = [];
    $points = [];
    foreach ($spans as $span) {
        // the page separates the hex bytes with spaces; the output wants them run together
        $hex = str_replace(" ", "", $span[2]);
        // the code point is written in hex in the page and emitted in decimal
        $point = hexdec($span[1]);
        // this test is incorrect or out of date; both Vivaldi and Firefox yield code point 7743
        $points[] = ($hex === "A8BC") ? 7743 : $point;
        // wrap the hex bytes in double quotes so they read as a PHP string literal
        $inputs[] = '"'.$hex.'"';
    }
    return [implode(",", $inputs), implode(",", $points)];
}
Loading…
Cancel
Save