You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
45 lines
1.8 KiB
45 lines
1.8 KiB
<?php

// Generator for the GB18030 lookup tables.
//
// Downloads the "index gb18030" and "index gb18030 ranges" files of the WHATWG
// Encoding Standard and prints three constant declarations (TABLE_GBK,
// TABLE_RANGES and TABLE_OFFSETS) on standard output.
//
// Every diagnostic is written to standard error and ends the script with exit
// status 1, so that a failed run can never be mistaken for generated code when
// standard output is redirected or pasted into a source file.

/**
 * Reports a fatal problem on standard error and terminates with exit status 1.
 *
 * @param string $message Human-readable description of the failure
 */
function abortGeneration(string $message)
{
    // php://stderr is used rather than the STDERR constant because the
    // constant is only defined by the CLI SAPI
    file_put_contents("php://stderr", $message . \PHP_EOL);
    exit(1);
}

/**
 * Extracts the data lines of an index file.
 *
 * A data line is optional leading whitespace, a decimal pointer, whitespace,
 * and a code point written as "0x" followed by hexadecimal digits. Text that
 * does not have this shape (comments, blank lines, trailing descriptions) is
 * ignored.
 *
 * @param string $data Full text of the index file
 * @return array List of [pointer, code point] integer pairs in file order
 */
function parseIndex(string $data): array
{
    preg_match_all('/^\s*(\d+)\s+0x([0-9A-Fa-f]+)/m', $data, $matches, \PREG_SET_ORDER);
    $entries = [];
    foreach ($matches as $match) {
        $entries[] = [(int) $match[1], hexdec($match[2])];
    }
    return $entries;
}

/**
 * Downloads an index file and returns its entries, aborting the script when
 * the download fails or yields no entries at all.
 *
 * @param string $url         Location of the index file
 * @param string $description Name of the file as used in error messages
 * @return array Non-empty list of [pointer, code point] integer pairs
 */
function fetchIndex(string $url, string $description): array
{
    $data = file_get_contents($url);
    if ($data === false || $data === "") {
        abortGeneration("$description could not be retrieved from network.");
    }
    $entries = parseIndex($data);
    if (!$entries) {
        // an unexpected format would otherwise silently produce empty tables
        abortGeneration("$description does not contain any index entries.");
    }
    return $entries;
}

$label = "gb18030";

// retrieve the GB18030 index file for two-byte sequences
$dec_gbk = [];
foreach (fetchIndex("https://encoding.spec.whatwg.org/index-$label.txt", "index file for '$label'") as $position => $entry) {
    // the table is emitted as a plain list, so only the code point is kept;
    // that is only correct while every pointer equals its position in the file
    if ($entry[0] !== $position) {
        abortGeneration("index file for '$label' has pointer {$entry[0]} where $position was expected.");
    }
    $dec_gbk[] = $entry[1];
}

// retrieve the GB18030 range index file for four-byte sequences
$dec_max = [];
$dec_off = [];
foreach (fetchIndex("https://encoding.spec.whatwg.org/index-$label-ranges.txt", "range index file for '$label'") as $entry) {
    // gather the range starts in one array; they will actually be used as range ends
    $dec_max[] = $entry[0];
    // gather the starting code points in another array
    $dec_off[] = $entry[1];
}

// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1:
// pointers above 39419 and below the start of the last range, and pointers
// above 1237575, do not map to any code point
$lastStart = array_pop($dec_max);
$lastOffset = array_pop($dec_off);
$dec_max = array_merge($dec_max, [39420, $lastStart, 1237576]);
// the gap before the last range is written as the literal text "null" in the
// generated source; a real null would be rendered as an empty string by implode()
// we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding
$dec_off = array_merge($dec_off, ["null", $lastOffset, 0x110000]);

// the hard-coded boundaries above only make sense if they slot into the
// downloaded range starts in ascending order
for ($i = 1; $i < count($dec_max); $i++) {
    if ($dec_max[$i] <= $dec_max[$i - 1]) {
        abortGeneration("range index file for '$label' does not yield ascending range boundaries ({$dec_max[$i - 1]} is followed by {$dec_max[$i]}).");
    }
}

// output
$dec_gbk = implode(",", $dec_gbk);
$dec_max = implode(",", $dec_max);
$dec_off = implode(",", $dec_off);

echo " const TABLE_GBK = [$dec_gbk];\n";
echo " const TABLE_RANGES = [$dec_max];\n";
echo " const TABLE_OFFSETS = [$dec_off];\n";