J. King
6 years ago
7 changed files with 354 additions and 1 deletions
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,43 @@ |
|||
<?php

declare(strict_types=1);

// Builds the lookup tables for a GB18030 decoder from the WHATWG Encoding
// Standard index files and prints them as constant declarations.
//
// Output (three lines on standard output):
//   TABLE_GBK     - code points read from the two-byte index file, in file order
//   TABLE_RANGES  - pointers read from the four-byte range index file, adjusted below
//   TABLE_OFFSETS - code points read from the four-byte range index file, adjusted below

$label = "gb18030";

// a data line in either index file is "<decimal pointer> 0x<hex code point>"
$line_pattern = "/^\s*(\d+)\s+0x([0-9A-Z]+)/m";

// retrieve the GB18030 index file for two-byte sequences
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");

// find lines that contain data
preg_match_all($line_pattern, $data, $matches, \PREG_SET_ORDER);

// only the code point is relevant
$dec_gbk = [];
foreach ($matches as $match) {
    $dec_gbk[] = hexdec($match[2]);
}

// retrieve the GB18030 range index file for four-byte sequences
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label-ranges.txt") or die("range index file for '$label' could not be retrieved from network.");

// find lines that contain data
preg_match_all($line_pattern, $data, $matches, \PREG_SET_ORDER);

$dec_max = [];
$dec_off = [];
foreach ($matches as $match) {
    // gather the range starts in one array; they will actually be used as range ends
    $dec_max[] = (int) $match[1];
    // gather the starting code points in another array
    $dec_off[] = hexdec($match[2]);
}

// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
// the last pointer read from the file is taken off and re-added after 39420,
// and 1237576 becomes the final entry
$penult = array_pop($dec_max);
$dec_max = array_merge($dec_max, [39420, $penult, 1237576]);
// the literal text "null" is inserted ahead of the last offset so that it is
// printed as a bare null in the generated array
array_splice($dec_off, -1, 0, "null");

// output
$dec_gbk = implode(",", $dec_gbk);
$dec_max = implode(",", $dec_max);
$dec_off = implode(",", $dec_off);

echo " const TABLE_GBK = [$dec_gbk];\n";
echo " const TABLE_RANGES = [$dec_max];\n";
echo " const TABLE_OFFSETS = [$dec_off];\n";
@ -0,0 +1,58 @@ |
|||
<!DOCTYPE html>
<meta charset=gb18030>
<!-- Correct results are provided by Firefox -->
<!--
    Decodes each byte sequence below with the browser's own TextDecoder and
    prints one line per case into the <pre> element, in the form
        'name' => ["HEXBYTES", [code points]],
-->
<pre></pre>
<script>
var data = [
    // basics
    { encoding: 'gb18030', input: [0x40], name: 'sanity check' },
    { encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' },
    { encoding: 'gb18030', input: [0xA8, 0x4E], name: 'two-byte character' },
    { encoding: 'gb18030', input: [0x82, 0x31, 0xA2, 0x37], name: 'four-byte character' },
    { encoding: 'gb18030', input: [0x82], name: 'EOF after first byte' },
    { encoding: 'gb18030', input: [0x82, 0x30], name: 'EOF after second byte' },
    { encoding: 'gb18030', input: [0x82, 0x30, 0x81], name: 'EOF after third byte' },
    { encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37], name: 'bad first byte' },
    { encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37], name: 'bad second byte' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37], name: 'bad third byte' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF], name: 'bad fourth byte' },
    { encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37], name: 'control first byte' },
    { encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' },
    { encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte 2' },
    { encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte 2' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte 2' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte 2' },
    { encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte 2' },
    { encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte 2' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte 2' },
    { encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte 2' },
    { encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
    { encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
]

data.forEach(function(test) {
    // render the input bytes as one run of upper-case hex pairs
    var bytes = ""
    test.input.forEach((p) => {
        bytes = bytes + p.toString(16).padStart(2, "0").toUpperCase()
    })
    // decode with the browser's implementation and collect the code points
    var codes = []
    var text = new TextDecoder(test.encoding).decode(new Uint8Array(test.input))
    var b = 0
    for (let a = 0; a < text.length; a++) {
        let point = text.codePointAt(a)
        if (point >= 0xD800 && point <= 0xDFFF) {
            // non-BMP characters have trailing low surrogates in JavaScript strings
            continue
        }
        codes[b++] = point
    }
    codes = codes.join(", ")
    var line = "'" + test.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n"
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
})

// NOTE(review): the anchor below is built but its result is never written out
// (the only consumer is commented out) - confirm whether it is still needed
var l = document.createElement("a")
l.href = "http://example.com/?" + String.fromCodePoint(0xFFFF)
//document.write(l.search.substr(1))
</script>
@ -0,0 +1,41 @@ |
|||
<?php

declare(strict_types=1);

// the Web Platform test suite does not have tests for gb18030, but a pull request was made in 2016 with a partial set of tests
// this script generates a test series from those tests which exercises the index and range tables with single characters
// it is a pedantic set of tests, and so the test suite itself only uses this series in an optional test

// every test page lives in the same directory of the same pinned commit
$base = "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/";

// series name => page to build the series from
$standard_tests = [
    'two-byte GBK'            => $base."gb18030_chars.html",
    'four-byte Han'           => $base."gb18030_extra_han_chars.html",
    'four-byte Hangul'        => $base."gb18030_extra_hangul_chars.html",
    'four-byte miscellaneous' => $base."gb18030_extra_misc_chars.html",
    'four-byte private use'   => $base."gb18030_extra_pua_chars.html",
];

// print one "'name' => [[inputs], [outputs]]," line per series
foreach ($standard_tests as $name => $url) {
    $data = make_standard_test($url);
    $in = $data[0];
    $out = $data[1];
    echo "'$name' => [[$in], [$out]],\n";
}

/**
 * Builds one test series from a page of `<span data-cp="..." data-bytes="...">` elements.
 *
 * Each matching element contributes one input (its data-bytes value with the
 * spaces removed, wrapped in double quotes) and one expected output (its
 * data-cp value converted from hexadecimal to a decimal number).
 *
 * The script terminates via die() if the page cannot be read.
 *
 * @param string $url Location of the page; anything file_get_contents() can read
 * @return array Two strings: the comma-separated inputs, then the comma-separated code points, both in page order
 */
function make_standard_test(string $url): array {
    // retrieve the test data
    $data = file_get_contents($url) or die("Could not retrieve test $url");
    // find the data
    preg_match_all('/<span data-cp="([^"]+)" data-bytes="([^"]+)">/s', $data, $matches, \PREG_SET_ORDER);
    // set up
    $in = $out = [];
    // loop through each match
    foreach ($matches as $match) {
        // the byte sequence is listed as space-separated hex pairs
        $bytes = str_replace(" ", "", $match[2]);
        // convert the code point to decimal
        $code = hexdec($match[1]);
        if ($bytes === "A8BC") { // this test is incorrect or out of date; both Vivaldi and Firefox yield code point 7743
            $code = 7743;
        }
        $out[] = $code;
        // convert the hex bytes to PHP notation
        $in[] = '"'.$bytes.'"';
    }
    $in = implode(",", $in);
    $out = implode(",", $out);
    return [$in, $out];
}
Loading…
Reference in new issue