Browse Source

Consolidate index generation into a single, better script

span
J. King 6 years ago
parent
commit
5967d148c0
  1. 45
      tools/mkgbk.php
  2. 212
      tools/mkindex.php

45
tools/mkgbk.php

@ -1,45 +0,0 @@
<?php
// Prints the three GB18030 lookup tables as PHP constant declarations,
// built from the index files published with the WHATWG Encoding standard

// a data line of an index file holds a decimal pointer followed by a hexadecimal code point
$line_pattern = "/^\s*(\d+)\s+0x([0-9A-Z]+)/m";
// both index files belong to the same encoding label
$label = "gb18030";

// the main index is used for two-byte sequences
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");
preg_match_all($line_pattern, $data, $rows, \PREG_SET_ORDER);
// only the code point of each line is relevant
$dec_gbk = array_map(function ($row) {
    return hexdec($row[2]);
}, $rows);

// the range index is used for four-byte sequences
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label-ranges.txt") or die("range index file for '$label' could not be retrieved from network.");
preg_match_all($line_pattern, $data, $rows, \PREG_SET_ORDER);
// the range starts go in one array; they will actually be used as range ends
$dec_max = array_map(function ($row) {
    return (int) $row[1];
}, $rows);
// the code points at which the ranges start go in another array
$dec_off = array_map(function ($row) {
    return hexdec($row[2]);
}, $rows);

// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
$penult = array_pop($dec_max);
array_push($dec_max, 39420, $penult, 1237576);
// the string "null" is emitted verbatim, so the generated table gets a null entry before its last offset
array_splice($dec_off, -1, 0, "null");
// 0x110000 (one beyond the top of the Unicode range) is also added to the offsets for encoding
$dec_off[] = 0x110000;

// output
$dec_gbk = implode(",", $dec_gbk);
$dec_max = implode(",", $dec_max);
$dec_off = implode(",", $dec_off);
echo " const TABLE_GBK = [$dec_gbk];\n";
echo " const TABLE_RANGES = [$dec_max];\n";
echo " const TABLE_OFFSETS = [$dec_off];\n";

212
tools/mkindex.php

@ -1,54 +1,168 @@
<?php
// map each supported encoding label to the name of the generator function which prints its tables;
// the label given on the command line is looked up here and the named function is called with it
// NOTE(review): the commented-out labels have no generator function visible in this file; presumably they are not implemented yet
$labels = [
'big5' => "big5",
//'euc-jp' => "eucjp",
'euc-kr' => "euckr",
'gb18030' => "gb18030",
'ibm866' => "single_byte",
//'iso-2022-jp' => "iso2022jp",
'iso-8859-10' => "single_byte",
'iso-8859-13' => "single_byte",
'iso-8859-14' => "single_byte",
'iso-8859-15' => "single_byte",
'iso-8859-16' => "single_byte",
'iso-8859-2' => "single_byte",
'iso-8859-3' => "single_byte",
'iso-8859-4' => "single_byte",
'iso-8859-5' => "single_byte",
'iso-8859-6' => "single_byte",
'iso-8859-7' => "single_byte",
'iso-8859-8' => "single_byte",
'koi8-r' => "single_byte",
'koi8-u' => "single_byte",
'macintosh' => "single_byte",
//'shift-jis' => "shiftjis",
'windows-1250' => "single_byte",
'windows-1251' => "single_byte",
'windows-1252' => "single_byte",
'windows-1253' => "single_byte",
'windows-1254' => "single_byte",
'windows-1255' => "single_byte",
'windows-1256' => "single_byte",
'windows-1257' => "single_byte",
'windows-1258' => "single_byte",
'windows-874' => "single_byte",
'x-mac-cyrillic' => "single_byte",
];
// the label is the first command-line argument, defaulting to an empty string when none is given
$label = $argv[1] ?? "";
// normalise to lower case and strip surrounding whitespace so that it can match the keys above
$label = trim(strtolower($label));
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network.");
// find lines that contain data
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER);
// set up
$dec_char = [];
$dec_code = [];
$enc = [];
$i = 0;
// loop through each line
foreach ($matches as $match) {
// index is the byte value minus 128
$index = (int) $match[1];
// byte is a reconstruction of the hexadecimal value of the byte value, padded to two nybbles
$byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT));
// code is the Unicode code point
$code = hexdec($match[2]);
// hex is the code point in hexadecimal
$hex = dechex($code);
// missing indexes necessitate specifying keys explicitly
if ($index == $i) {
$key = "";
} else {
$key = "$index=>";
$i = $index;
// only labels listed in the table of generators are acceptable
if (isset($labels[$label]) === false) {
    die("Invalid label specified. Must be one of: ".json_encode(array_keys($labels)));
}
// the table holds the name of a function, which is called with the label as its only argument
$generator = $labels[$label];
$generator($label);
// encoding-specific output generators
// Prints the tables for a single-byte encoding as three PHP constant declarations:
// decoder characters, decoder code points, and the encoder table.
// $label is the encoding label; it selects which index file is downloaded.
function single_byte(string $label) {
    $entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
    // all three tables are rendered from the same set of index entries
    $dec_char = make_decoder_char_array($entries);
    $dec_code = make_decoder_point_array($entries);
    $enc = make_encoder_array($entries);
    echo "const TABLE_DEC_CHAR = $dec_char;\n";
    echo "const TABLE_DEC_CODE = $dec_code;\n";
    echo "const TABLE_ENC = $enc;\n";
}
// Prints the GB18030 tables as three PHP constant declarations:
// the code points of the main index, the range boundaries, and the range offsets.
// $label is the encoding label; it selects which index files are downloaded.
function gb18030(string $label) {
    $dec_gbk = make_decoder_point_array(read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"));
    $rows = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt");
    // the range starts go in one array; they will actually be used as range ends
    $pointers = array_map(function ($row) {
        return (int) $row[1];
    }, $rows);
    // the code points at which the ranges start go in another array
    $offsets = array_map(function ($row) {
        return hexdec($row[2]);
    }, $rows);
    // fudge the top of the ranges
    // see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
    $penult = array_pop($pointers);
    array_push($pointers, 39420, $penult, 1237576);
    // the string "null" is emitted verbatim, so the generated table gets a null entry before its last offset
    array_splice($offsets, -1, 0, "null");
    // 0x110000 (one beyond the top of the Unicode range) is also added to the offsets for encoding
    $offsets[] = 0x110000;
    $dec_max = "[".implode(",", $pointers)."]";
    $dec_off = "[".implode(",", $offsets)."]";
    echo "const TABLE_GBK = $dec_gbk;\n";
    echo "const TABLE_RANGES = $dec_max;\n";
    echo "const TABLE_OFFSETS = $dec_off;\n";
}
// Prints the Big5 tables as two PHP constant declarations:
// the code points of the index, and a fixed table of pointers which each map to a pair of code points.
// $label is the encoding label; it selects which index file is downloaded.
function big5(string $label) {
    $entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
    $codes = make_decoder_point_array($entries);
    // the pairs are not derived from the index file; they are written out literally
    $specials = <<<'ARRAY_LITERAL'
[
1133 => [0x00CA, 0x0304],
1135 => [0x00CA, 0x030C],
1164 => [0x00EA, 0x0304],
1166 => [0x00EA, 0x030C],
]
ARRAY_LITERAL;
    echo "const TABLE_CODES = $codes;\n", "const TABLE_DOUBLES = $specials;\n";
}
// Prints the EUC-KR code point table as a single PHP constant declaration.
// $label is the encoding label; it selects which index file is downloaded.
function euckr(string $label) {
    $entries = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt");
    $codes = make_decoder_point_array($entries);
    echo "const TABLE_CODES = $codes;\n";
}
// generic helper functions
// Reads the index file at $url and returns one match set per data line.
// In each set, element 1 is the decimal pointer and element 2 is the hexadecimal code point (without its "0x" prefix).
// $label is only used in the message printed when the file cannot be read, at which point the script terminates.
function read_index(string $label, string $url): array {
    $contents = file_get_contents($url);
    if (!$contents) {
        die("index file for '$label' could not be retrieved from network.");
    }
    // lines which do not start with a number and a 0x-prefixed value (e.g. comments) are ignored
    preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $contents, $rows, \PREG_SET_ORDER);
    return $rows;
}
// Renders index entries as the source text of a PHP array literal of decimal code points.
// Each entry holds the decimal pointer at element 1 and the hexadecimal code point at element 2.
// A pointer is only written out as an explicit key when it does not directly follow the previous one.
function make_decoder_point_array(array $entries): string {
    $items = [];
    // the pointer which the next entry has if there is no gap in the index
    $expected = 0;
    foreach ($entries as $entry) {
        $pointer = (int) $entry[1];
        $point = hexdec($entry[2]);
        // missing indexes necessitate specifying keys explicitly
        $prefix = ($pointer == $expected) ? "" : "$pointer=>";
        $items[] = $prefix.$point;
        $expected = $pointer + 1;
    }
    return "[".implode(",", $items)."]";
}
function make_decoder_char_array(array $entries): string {
$out = [];
foreach ($entries as $match) {
$index = (int) $match[1];
$code = $match[2];
// missing indexes necessitate specifying keys explicitly
if ($index == $i) {
$key = "";
} else {
$key = "$index=>";
$i = $index;
}
$out[] = $key."\"\\u{".$code."}\"";
$i++;
}
$dec_code[] = $key."$code";
$dec_char[] = $key."\"\\u{".$hex."}\"";
// the encoder table will be reprocessed later
$enc[$code] = "\"\\x$byte\"";
$i++;
}
// sort the encoder table by keys to order it correctly
ksort($enc);
$i = 0;
foreach ($enc as $index => $value) {
if ($index == $i) {
$key = "";
} else {
$key = "$index=>";
$i = $index;
return "[".implode(",", $out)."]";
}
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays
function make_encoder_array(array $entries): string {
$out = [];
foreach ($entries as $match) {
$index = (int) $match[1];
$code = $match[2];
$byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT));
$out[$code] = "\"\\x$byte\"";
}
ksort($out);
$i = 0;
foreach ($out as $index => $value) {
if ($index == $i) {
$key = "";
} else {
$key = "$index=>";
$i = $index;
}
$out[$index] = "$key$value";
$i++;
}
$enc[$index] = "$key$value";
$i++;
}
$dec_char = implode(",", $dec_char);
$dec_code = implode(",", $dec_code);
$enc = implode(",", $enc);
echo " const TABLE_DEC_CHAR = [$dec_char];\n";
echo " const TABLE_DEC_CODE = [$dec_code];\n";
echo " const TABLE_ENC = [$enc];\n";
return "[".implode(",", $out)."]";
}

Loading…
Cancel
Save