J. King
6 years ago
2 changed files with 163 additions and 94 deletions
@ -1,45 +0,0 @@ |
|||||
<?php
// Prints the three GB18030 lookup tables as constant declarations:
// the two-byte code points, the four-byte range boundaries, and the
// four-byte range offsets. Both inputs are fetched from the WHATWG
// Encoding Standard index files.

$label = "gb18030";

// two-byte sequences: each data line of the main index contributes only its code point
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
$dec_gbk = array_map("hexdec", array_column($matches, 2));

// four-byte sequences: the range index supplies a pointer and a code point per line
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label-ranges.txt") or die("range index file for '$label' could not be retrieved from network.");
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
// the pointers are range starts in the index, but they will actually be used as range ends
$dec_max = array_map("intval", array_column($matches, 1));
// the starting code points are kept in a parallel array
$dec_off = array_map("hexdec", array_column($matches, 2));

// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
// we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding
$penult = array_pop($dec_max);
array_push($dec_max, 39420, $penult, 1237576);
// the string "null" is spliced in before the last offset so that it is printed as a literal null entry
array_splice($dec_off, -1, 0, "null");
$dec_off[] = 0x110000;

// output
echo " const TABLE_GBK = [".implode(",", $dec_gbk)."];\n";
echo " const TABLE_RANGES = [".implode(",", $dec_max)."];\n";
echo " const TABLE_OFFSETS = [".implode(",", $dec_off)."];\n";
|
@ -1,54 +1,168 @@ |
|||||
<?php
// Entry point: the first command-line argument names an encoding label, and
// the generator function registered for that label prints its tables.

// encodings which each have a dedicated generator function
$dedicated = [
    'big5'    => "big5",
    //'euc-jp' => "eucjp",
    'euc-kr'  => "euckr",
    'gb18030' => "gb18030",
];
// encodings which all share the single_byte generator
$single_byte_labels = [
    'ibm866',
    //'iso-2022-jp' => "iso2022jp",
    'iso-8859-10',
    'iso-8859-13',
    'iso-8859-14',
    'iso-8859-15',
    'iso-8859-16',
    'iso-8859-2',
    'iso-8859-3',
    'iso-8859-4',
    'iso-8859-5',
    'iso-8859-6',
    'iso-8859-7',
    'iso-8859-8',
    'koi8-r',
    'koi8-u',
    'macintosh',
    //'shift-jis' => "shiftjis",
    'windows-1250',
    'windows-1251',
    'windows-1252',
    'windows-1253',
    'windows-1254',
    'windows-1255',
    'windows-1256',
    'windows-1257',
    'windows-1258',
    'windows-874',
    'x-mac-cyrillic',
];
// the union keeps the dedicated labels first, followed by the single-byte labels in listed order
$labels = $dedicated + array_fill_keys($single_byte_labels, "single_byte");

// normalize the requested label: lower-case it, then strip surrounding whitespace
$label = trim(strtolower($argv[1] ?? ""));
if (!isset($labels[$label])) {
    die("Invalid label specified. Must be one of: ".json_encode(array_keys($labels)));
}
// the map's value is the name of the function to call with the label
$generator = $labels[$label];
$generator($label);
||||
$dec_char = []; |
|
||||
$dec_code = []; |
// encoding-specific output generators |
||||
$enc = []; |
|
||||
$i = 0; |
// Prints the three tables of a single-byte encoding: the decoder table of
// characters, the decoder table of code points, and the encoder table.
// $label is the encoding label, used to locate its index file.
function single_byte(string $label) {
    $entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
    // all three tables are derived from the same parsed index entries
    $dec_char = make_decoder_char_array($entries);
    $dec_code = make_decoder_point_array($entries);
    $enc = make_encoder_array($entries);
    echo "const TABLE_DEC_CHAR = $dec_char;\n";
    echo "const TABLE_DEC_CODE = $dec_code;\n";
    echo "const TABLE_ENC = $enc;\n";
}
||||
// hex is the code point in hexadecimal |
|
||||
$hex = dechex($code); |
// Prints the GB18030 tables: the two-byte code points, the four-byte range
// boundaries, and the four-byte range offsets.
// $label is the encoding label, used to locate both index files.
function gb18030(string $label) {
    // two-byte sequences come from the main index
    $pointers = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    // four-byte sequences are described by the separate index of ranges
    $rows = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt");
    // the pointers are range starts in the index, but they will actually be used as range ends
    $limits = array_map("intval", array_column($rows, 1));
    // the starting code points are kept in a parallel array
    $offsets = array_map("hexdec", array_column($rows, 2));
    // fudge the top of the ranges
    // see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
    // we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding
    $last = array_pop($limits);
    array_push($limits, 39420, $last, 1237576);
    // the string "null" is spliced in before the last offset so that it is printed as a literal null entry
    array_splice($offsets, -1, 0, "null");
    $offsets[] = 0x110000;
    echo "const TABLE_GBK = $pointers;\n";
    echo "const TABLE_RANGES = [".implode(",", $limits)."];\n";
    echo "const TABLE_OFFSETS = [".implode(",", $offsets)."];\n";
}
||||
|
|
||||
|
// Prints the Big5 tables: the code point table built from the index file,
// and a fixed table of entries which each list two code points.
// $label is the encoding label, used to locate the index file.
function big5(string $label) {
    $table = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    // this literal is not derived from the index; it is printed verbatim
    $doubles = implode("\n", [
        "[",
        "1133 => [0x00CA, 0x0304],",
        "1135 => [0x00CA, 0x030C],",
        "1164 => [0x00EA, 0x0304],",
        "1166 => [0x00EA, 0x030C],",
        "]",
    ]);
    echo "const TABLE_CODES = $table;\n";
    echo "const TABLE_DOUBLES = $doubles;\n";
}
||||
|
|
||||
|
// Prints the EUC-KR code point table built from the index file.
// $label is the encoding label, used to locate the index file.
function euckr(string $label) {
    $url = "https://encoding.spec.whatwg.org/index-$label.txt";
    $table = make_decoder_point_array(read_index($label, $url));
    echo "const TABLE_CODES = $table;\n";
}
||||
|
|
||||
|
// generic helper functions |
||||
|
|
||||
|
// Fetches an index file and returns its data lines.
// $label is only used in the error message; $url is the location to read.
// Each returned entry is a preg_match_all set: element 1 is the decimal
// pointer and element 2 the hexadecimal code point (without the 0x prefix),
// both as strings. If nothing can be read the script terminates.
function read_index(string $label, string $url): array {
    $data = file_get_contents($url);
    if ($data === false || $data === "") {
        // report on STDERR with a non-zero status, so that a failure is never
        // mistaken for (or captured as) generated table output
        fwrite(\STDERR, "index file for '$label' could not be retrieved from network.".\PHP_EOL);
        exit(1);
    }
    // find lines that contain data
    preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
    return $matches;
}
||||
|
|
||||
|
// Renders index entries as the source text of an array of code points.
// $entries are matches as returned by read_index: element 1 is the decimal
// pointer, element 2 the hexadecimal code point.
// Returns a bracketed, comma-separated list of decimal code points.
function make_decoder_point_array(array $entries): string {
    $items = [];
    // the pointer the next entry would have if there were no gap
    $expected = 0;
    foreach ($entries as $entry) {
        $pointer = (int) $entry[1];
        // missing indexes necessitate specifying keys explicitly
        $prefix = ($pointer == $expected) ? "" : "$pointer=>";
        $items[] = $prefix.hexdec($entry[2]);
        $expected = $pointer + 1;
    }
    return "[".implode(",", $items)."]";
}
||||
|
|
||||
|
// Renders index entries as the source text of an array of characters, each
// written as a double-quoted "\u{...}" escape.
// $entries are matches as returned by read_index: element 1 is the decimal
// pointer, element 2 the hexadecimal code point.
// Returns a bracketed, comma-separated list of quoted escapes.
function make_decoder_char_array(array $entries): string {
    $out = [];
    // the pointer the next entry would have if there were no gap; it must be
    // initialized, or the first comparison reads an undefined variable
    $i = 0;
    foreach ($entries as $match) {
        $index = (int) $match[1];
        // the code point stays in hexadecimal, as the escape syntax expects
        $code = $match[2];
        // missing indexes necessitate specifying keys explicitly
        if ($index == $i) {
            $key = "";
        } else {
            $key = "$index=>";
            $i = $index;
        }
        $out[] = $key."\"\\u{".$code."}\"";
        $i++;
    }
    return "[".implode(",", $out)."]";
}
||||
// the encoder table will be reprocessed later |
|
||||
$enc[$code] = "\"\\x$byte\""; |
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays |
||||
$i++; |
// Renders index entries as the source text of an array mapping code points
// to bytes, each byte written as a double-quoted "\xHH" escape.
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays
// $entries are matches as returned by read_index: element 1 is the decimal
// pointer, element 2 the hexadecimal code point.
// Returns a bracketed, comma-separated list ordered by code point.
function make_encoder_array(array $entries): string {
    $bytes = [];
    foreach ($entries as $match) {
        // index is the byte value minus 128
        $index = (int) $match[1];
        // the table is keyed by the numeric code point, not by its hexadecimal spelling
        $code = hexdec($match[2]);
        // byte is the hexadecimal form of the byte value, padded to two digits
        $byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT));
        $bytes[$code] = "\"\\x$byte\"";
    }
    // sort the encoder table by keys to order it correctly
    ksort($bytes);
    $out = [];
    $i = 0;
    foreach ($bytes as $code => $value) {
        // gaps between code points necessitate specifying keys explicitly
        if ($code == $i) {
            $key = "";
        } else {
            $key = "$code=>";
            $i = $code;
        }
        $out[] = "$key$value";
        $i++;
    }
    return "[".implode(",", $out)."]";
}
||||
} |
|
||||
$dec_char = implode(",", $dec_char); |
|
||||
$dec_code = implode(",", $dec_code); |
|
||||
$enc = implode(",", $enc); |
|
||||
echo " const TABLE_DEC_CHAR = [$dec_char];\n"; |
|
||||
echo " const TABLE_DEC_CODE = [$dec_code];\n"; |
|
||||
echo " const TABLE_ENC = [$enc];\n"; |
|
||||
|
Loading…
Reference in new issue