A set of dependency-free basic internationalization tools
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

55 lines
1.7 KiB

// Generates the source text of three PHP constant arrays from a WHATWG
// encoding index file and writes them to standard output:
//   TABLE_DEC_CHAR  index => character (as a "\u{...}" string escape)
//   TABLE_DEC_CODE  index => code point (as a decimal integer)
//   TABLE_ENC       code point => byte (as a "\x.." string escape)
// The index label (e.g. "windows-1252") is taken from the first command-line
// argument. Errors are reported on standard error with a non-zero exit status
// so that they never end up inside redirected output.

// retrieve the relevant index file
$label = trim(strtolower($argv[1] ?? ""));
// the label is interpolated into a URL, so reject anything that is empty or
// contains characters other than lowercase letters, digits, and dashes
if (!preg_match('/^[a-z0-9-]+$/', $label)) {
    fwrite(\STDERR, "usage: php ".($argv[0] ?? "mkindex.php")." <index-label>\n");
    exit(1);
}
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt");
if ($data === false) {
    fwrite(\STDERR, "index file for $label could not be retrieved from network.\n");
    exit(1);
}
// find lines that contain data
if (!preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER)) {
    // covers both a regex failure (false) and a file without any data lines (0)
    fwrite(\STDERR, "index file for $label does not contain any data lines.\n");
    exit(1);
}
// set up
$dec_char = [];
$dec_code = [];
$enc = [];
// $i is the key PHP would implicitly assign to the next element of the
// generated array literal; an explicit key is only written when it differs
$i = 0;
// loop through each line
foreach ($matches as $match) {
    // index is the byte value minus 128
    $index = (int) $match[1];
    // byte is a reconstruction of the hexadecimal value of the byte value, padded to two nybbles
    // NOTE: an index above 127 would yield more than two digits here
    $byte = sprintf("%02X", $index + 128);
    // code is the Unicode code point
    $code = (int) hexdec($match[2]);
    // hex is the code point in hexadecimal
    $hex = dechex($code);
    // missing indexes necessitate specifying keys explicitly
    if ($index === $i) {
        $key = "";
    } else {
        $key = "$index=>";
        $i = $index;
    }
    $dec_code[] = $key.$code;
    $dec_char[] = $key."\"\\u{".$hex."}\"";
    // the encoder table will be reprocessed later
    $enc[$code] = "\"\\x$byte\"";
    // the element after this one implicitly receives the next integer key
    $i++;
}
// sort the encoder table by keys to order it correctly
ksort($enc);
$i = 0;
foreach ($enc as $index => $value) {
    // same key elision as above, this time keyed by code point
    if ($index === $i) {
        $key = "";
    } else {
        $key = "$index=>";
        $i = $index;
    }
    $enc[$index] = "$key$value";
    $i++;
}
// join the entries and print the constant declarations
$dec_char = implode(",", $dec_char);
$dec_code = implode(",", $dec_code);
$enc = implode(",", $enc);
echo " const TABLE_DEC_CHAR = [$dec_char];\n";
echo " const TABLE_DEC_CODE = [$dec_code];\n";
echo " const TABLE_ENC = [$enc];\n";