<?php
// Generates PHP constant tables for a character encoding from the index
// file(s) published by the WHATWG Encoding Standard, and prints them to
// standard output.
//
// The encoding label is taken from the first command-line argument.

// map of supported encoding labels to the name of the generator function
// which produces the output for that label; entries which are commented out
// have no generator function among those defined below
$labels = [
    'big5' => "big5",
    //'euc-jp' => "eucjp",
    'euc-kr' => "euckr",
    'gb18030' => "gb18030",
    'ibm866' => "single_byte",
    //'iso-2022-jp' => "iso2022jp",
    'iso-8859-10' => "single_byte",
    'iso-8859-13' => "single_byte",
    'iso-8859-14' => "single_byte",
    'iso-8859-15' => "single_byte",
    'iso-8859-16' => "single_byte",
    'iso-8859-2' => "single_byte",
    'iso-8859-3' => "single_byte",
    'iso-8859-4' => "single_byte",
    'iso-8859-5' => "single_byte",
    'iso-8859-6' => "single_byte",
    'iso-8859-7' => "single_byte",
    'iso-8859-8' => "single_byte",
    'koi8-r' => "single_byte",
    'koi8-u' => "single_byte",
    'macintosh' => "single_byte",
    //'shift-jis' => "shiftjis",
    'windows-1250' => "single_byte",
    'windows-1251' => "single_byte",
    'windows-1252' => "single_byte",
    'windows-1253' => "single_byte",
    'windows-1254' => "single_byte",
    'windows-1255' => "single_byte",
    'windows-1256' => "single_byte",
    'windows-1257' => "single_byte",
    'windows-1258' => "single_byte",
    'windows-874' => "single_byte",
    'x-mac-cyrillic' => "single_byte",
];
// normalize the requested label so that it can be matched against the keys above
$label = $argv[1] ?? "";
$label = trim(strtolower($label));
if (!isset($labels[$label])) {
    die("Invalid label specified. Must be one of: ".json_encode(array_keys($labels)));
}
// dispatch to the generator function registered for the label
($labels[$label])($label);

// encoding-specific output generators

/**
 * Prints the decoder and encoder tables for a single-byte encoding
 *
 * Three constants are emitted: TABLE_DEC_CHAR (index to character literal),
 * TABLE_DEC_CODE (index to code point) and TABLE_ENC (code point to byte).
 *
 * @param string $label The encoding label, used to build the index file URL
 */
function single_byte(string $label) {
    $entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
    $dec_char = make_decoder_char_array($entries);
    $dec_code = make_decoder_point_array($entries);
    $enc = make_encoder_array($entries);
    echo "const TABLE_DEC_CHAR = $dec_char;\n";
    echo "const TABLE_DEC_CODE = $dec_code;\n";
    echo "const TABLE_ENC = $enc;\n";
}

/**
 * Prints the tables for gb18030: the main index (TABLE_GBK) and two parallel
 * arrays derived from the ranges index (TABLE_RANGES and TABLE_OFFSETS)
 *
 * @param string $label The encoding label, used to build the index file URLs
 */
function gb18030(string $label) {
    $dec_gbk = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    $ranges = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt");
    $dec_max = [];
    $dec_off = [];
    foreach ($ranges as $match) {
        // gather the range starts in one array; they will actually be used as range ends
        $dec_max[] = (int) $match[1];
        // gather the starting code points in another array
        $dec_off[] = hexdec($match[2]);
    }
    // fudge the top of the ranges
    // see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
    // we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding
    $penult = array_pop($dec_max);
    $dec_max = array_merge($dec_max, [39420, $penult, 1237576]);
    // the literal text "null" is inserted before the last offset so that it
    // lines up with the 39420 boundary added above
    array_splice($dec_off, -1, 0, "null");
    $dec_off[] = 0x110000;
    $dec_max = "[".implode(",", $dec_max)."]";
    $dec_off = "[".implode(",", $dec_off)."]";
    echo "const TABLE_GBK = $dec_gbk;\n";
    echo "const TABLE_RANGES = $dec_max;\n";
    echo "const TABLE_OFFSETS = $dec_off;\n";
}

/**
 * Prints the tables for Big5: the index (TABLE_CODES) and a fixed list of
 * pointers which each map to a pair of code points (TABLE_DOUBLES)
 *
 * @param string $label The encoding label, used to build the index file URL
 */
function big5(string $label) {
    $codes = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    // these pointers are not derived from the index file; they are emitted verbatim
    $specials = <<<ARRAY_LITERAL
[
    1133 => [0x00CA, 0x0304],
    1135 => [0x00CA, 0x030C],
    1164 => [0x00EA, 0x0304],
    1166 => [0x00EA, 0x030C],
]
ARRAY_LITERAL;
    echo "const TABLE_CODES = $codes;\n";
    echo "const TABLE_DOUBLES = $specials;\n";
}

/**
 * Prints the index table (TABLE_CODES) for EUC-KR
 *
 * @param string $label The encoding label, used to build the index file URL
 */
function euckr(string $label) {
    $codes = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    echo "const TABLE_CODES = $codes;\n";
}

// generic helper functions

/**
 * Retrieves an index file and extracts its data lines
 *
 * The script is terminated with a message if the file cannot be retrieved.
 *
 * @param string $label The encoding label; only used in the failure message
 * @param string $url The URL of the index file to retrieve
 * @return array One entry per data line: [0] the matched text, [1] the decimal index, [2] the hexadecimal digits which follow "0x"
 */
function read_index(string $label, string $url): array {
    $data = file_get_contents($url) or die("index file for '$label' could not be retrieved from network.");
    // find lines that contain data
    preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
    return $matches;
}

/**
 * Builds the source text of a PHP array literal mapping index to code point (as a decimal integer)
 *
 * @param array $entries Entries in the form returned by read_index()
 * @return string The array literal, e.g. "[128,129,3=>160]"
 */
function make_decoder_point_array(array $entries): string {
    $out = [];
    // the key PHP would assign implicitly to the next element
    $i = 0;
    foreach ($entries as $match) {
        $index = (int) $match[1];
        $code = hexdec($match[2]);
        // missing indexes necessitate specifying keys explicitly
        if ($index == $i) {
            $key = "";
        } else {
            $key = "$index=>";
            $i = $index;
        }
        $out[] = $key."$code";
        $i++;
    }
    return "[".implode(",", $out)."]";
}

/**
 * Builds the source text of a PHP array literal mapping index to character,
 * with each character written as a double-quoted "\u{...}" escape
 *
 * @param array $entries Entries in the form returned by read_index()
 * @return string The array literal
 */
function make_decoder_char_array(array $entries): string {
    $out = [];
    // the key PHP would assign implicitly to the next element; this must start
    // at zero, otherwise an undefined variable is compared and incremented
    $i = 0;
    foreach ($entries as $match) {
        $index = (int) $match[1];
        // the hexadecimal digits are used exactly as they appear in the index file
        $code = $match[2];
        // missing indexes necessitate specifying keys explicitly
        if ($index == $i) {
            $key = "";
        } else {
            $key = "$index=>";
            $i = $index;
        }
        $out[] = $key."\"\\u{".$code."}\"";
        $i++;
    }
    return "[".implode(",", $out)."]";
}

/**
 * Builds the source text of a PHP array literal mapping code point to byte,
 * with each byte written as a double-quoted "\xHH" escape
 *
 * This is only used for single-byte encoders; other encoders instead flip their decoder arrays
 *
 * @param array $entries Entries in the form returned by read_index()
 * @return string The array literal, ordered by code point
 */
function make_encoder_array(array $entries): string {
    $bytes = [];
    foreach ($entries as $match) {
        $index = (int) $match[1];
        // the table must be keyed by the numeric code point: keying by the raw
        // hexadecimal digits would sort as text and print keys such as 00A0
        $code = hexdec($match[2]);
        // the byte value is the index plus 128, written as two uppercase hexadecimal digits
        $byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT));
        $bytes[$code] = "\"\\x$byte\"";
    }
    // sort the table by code point to order it correctly
    ksort($bytes);
    $out = [];
    $i = 0;
    foreach ($bytes as $code => $value) {
        // missing code points necessitate specifying keys explicitly
        if ($code == $i) {
            $key = "";
        } else {
            $key = "$code=>";
            $i = $code;
        }
        $out[] = "$key$value";
        $i++;
    }
    return "[".implode(",", $out)."]";
}