A set of dependency-free basic internationalization tools
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

178 lines
118 KiB

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class Big5 extends AbstractEncoding implements Coder, Decoder {
const NAME = "Big5";
4 years ago
const LABELS = [
4 years ago
"big5",
"big5-hkscs",
"cn-big5",
"csbig5",
"x-x-big5",
4 years ago
];
// Pointers which decode to two code points instead of one, keyed by pointer.
// nextCode() returns element 0 and buffers element 1 for the following call;
// seekBack() re-buffers element 1 when it steps back over such a sequence.
const TABLE_DOUBLES = [
1133 => [0x00CA, 0x0304],
1135 => [0x00CA, 0x030C],
1164 => [0x00EA, 0x0304],
1166 => [0x00EA, 0x030C],
];
const TABLE_CODES_TW = [5024=>12288,65292,12289,12290,65294,8231,65307,65306,65311,65281,65072,8230,8229,65104,65105,65106,183,65108,65109,65110,65111,65372,8211,65073,8212,65075,9588,65076,65103,65288,65289,65077,65078,65371,65373,65079,65080,12308,12309,65081,65082,12304,12305,65083,65084,12298,12299,65085,65086,12296,12297,65087,65088,12300,12301,65089,65090,12302,12303,65091,65092,65113,65114,65115,65116,65117,65118,8216,8217,8220,8221,12317,12318,8245,8242,65283,65286,65290,8251,167,12291,9675,9679,9651,9650,9678,9734,9733,9671,9670,9633,9632,9661,9660,12963,8453,175,65507,65343,717,65097,65098,65101,65102,65099,65100,65119,65120,65121,65291,65293,215,247,177,8730,65308,65310,65309,8806,8807,8800,8734,8786,8801,65122,65123,65124,65125,65126,65374,8745,8746,8869,8736,8735,8895,13266,13265,8747,8750,8757,8756,9792,9794,8853,8857,8593,8595,8592,8594,8598,8599,8601,8600,8741,8739,65295,65340,8725,65128,65284,65509,12306,65504,65505,65285,65312,8451,8457,65129,65130,65131,13269,13212,13213,13214,13262,13217,13198,13199,13252,176,20825,20827,20830,20829,20833,20835,21991,29929,31950,9601,9602,9603,9604,9605,9606,9607,9608,9615,9614,9613,9612,9611,9610,9609,9532,9524,9516,9508,9500,9620,9472,9474,9621,9484,9488,9492,9496,9581,9582,9584,9583,9552,9566,9578,9569,9698,9699,9701,9700,9585,9586,9587,65296,65297,65298,65299,65300,65301,65302,65303,65304,65305,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,12321,12322,12323,12324,12325,12326,12327,12328,12329,21313,21316,21317,65313,65314,65315,65316,65317,65318,65319,65320,65321,65322,65323,65324,65325,65326,65327,65328,65329,65330,65331,65332,65333,65334,65335,65336,65337,65338,65345,65346,65347,65348,65349,65350,65351,65352,65353,65354,65355,65356,65357,65358,65359,65360,65361,65362,65363,65364,65365,65366,65367,65368,65369,65370,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,
12549,12550,12551,12552,12553,12554,12555,12556,12557,12558,12559,12560,12561,12562,12563,12564,12565,12566,12567,12568,12569,12570,12571,12572,12573,12574,12575,12576,12577,12578,12579,12580,12581,12582,12583,12584,12585,729,713,714,711,715,9216,9217,9218,9219,9220,9221,9222,9223,9224,9225,9226,9227,9228,9229,9230,9231,9232,9233,9234,9235,9236,9237,9238,9239,9240,9241,9242,9243,9244,9245,9246,9247,9249,8364,5495=>19968,20057,19969,19971,20035,20061,20102,20108,20154,20799,20837,20843,20960,20992,20993,21147,21269,21313,21340,21448,19977,19979,19976,19978,20011,20024,20961,20037,20040,20063,20062,20110,20129,20800,20995,21242,21315,21449,21475,22303,22763,22805,22823,22899,23376,23377,23379,23544,23567,23586,23608,23665,24029,24037,24049,24050,24051,24062,24178,24318,24331,24339,25165,19985,19984,19981,20013,20016,20025,20043,23609,20104,20113,20117,20114,20116,20130,20161,20160,20163,20166,20167,20173,20170,20171,20164,20803,20801,20839,20845,20846,20844,20887,20982,20998,20999,21000,21243,21246,21247,21270,21305,21320,21319,21317,21342,21380,21451,21450,21453,22764,22825,22827,22826,22829,23380,23569,23588,23610,23663,24052,24187,24319,24340,24341,24515,25096,25142,25163,25166,25903,25991,26007,26020,26041,26085,26352,26376,26408,27424,27490,27513,27595,27604,27611,27663,27700,28779,29226,29238,29243,29255,29273,29275,29356,29579,19993,19990,19989,19988,19992,20027,20045,20047,20046,20197,20184,20180,20181,20182,20183,20195,20196,20185,20190,20805,20804,20873,20874,20908,20985,20986,20984,21002,21152,21151,21253,21254,21271,21277,20191,21322,21321,21345,21344,21359,21358,21435,21487,21476,21491,21484,21486,21481,21480,21500,21496,21493,21483,21478,21482,21490,21489,21488,21477,21485,21499,22235,22234,22806,22830,22833,22900,22902,23381,23427,23612,24040,24039,24038,24066,24067,24179,24188,24321,24344,24343,24517,25098,25171,25172,25170,25169,26021,26086,26414,26412,26410,26411,26413,27491,27597,27665,27664,27704,27713,27712,27710,29359,29572,29577,29916,29926,2997
6,29983,29992,29993,30000,30001,30002,30003,30091,30333,30382,30399,30446,30683,30690,30707
const TABLE_CODES_HK = [942=>17392,19506,17923,17830,17784,160359,19831,17843,162993,19682,163013,15253,18230,18244,19527,19520,148159,144919,160594,159371,159954,19543,172881,18255,17882,19589,162924,19719,19108,18081,158499,29221,154196,137827,146950,147297,26189,22267,981=>32149,22813,166841,15860,38708,162799,23515,138590,23204,13861,171696,23249,23479,23804,26478,34195,170309,29793,29853,14453,138579,145054,155681,16108,153822,15093,31484,40855,147809,166157,143850,133770,143966,17162,33924,40854,37935,18736,34323,22678,38730,37400,31184,31282,26208,27177,34973,29772,31685,26498,31276,21071,36934,13542,29636,155065,29894,40903,22451,18735,21580,16689,145038,22552,31346,162661,35727,18094,159368,16769,155033,31662,140476,40904,140481,140489,140492,40905,34052,144827,16564,40906,17633,175615,25281,28782,40907,1099=>12736,12737,12738,12739,12740,131340,12741,131281,131277,12742,12743,131275,139240,12744,131274,12745,12746,12747,12748,131342,12749,12750,256,193,461,192,274,201,282,200,332,211,465,210,1134=>7870,1136=>7872,202,257,225,462,224,593,275,233,283,232,299,237,464,236,333,243,466,242,363,250,468,249,470,472,474,476,252,1165=>7871,1167=>7873,234,609,9178,9179,1256=>172969,135493,1259=>25866,1262=>20029,28381,40270,37343,1268=>161589,25745,20250,20264,20392,20822,20852,20892,20964,21153,21160,21307,21326,21457,21464,22242,22768,22788,22791,22834,22836,23398,23454,23455,23706,24198,24635,25993,26622,26628,26725,27982,28860,30005,32420,32428,32442,32455,32463,32479,32518,32567,33402,33487,33647,35270,35774,35810,36710,36711,36718,29713,31996,32205,26950,31433,21031,1329=>37260,30904,37214,32956,1334=>36107,33014,133607,1339=>32927,40647,19661,40393,40460,19518,171510,159758,40458,172339,13761,1351=>28314,33342,29977,1355=>18705,39532,39567,40857,31111,164972,138698,132560,142054,20004,20097,20096,20103,20159,20203,20279,13388,20413,15944,20483,20616,13437,13459,13477,20870,22789,20955,20988,20997,20105,21113,21136,21287,13767,21417,13649,21424,13651,21442,2153
9,13677,13682,13953,21651,21667,21684,21689,21712,21743,21784,21795,21800,13720,21823,13733,13759,21975,13765,163204,21797,1416=>134210,134421,151851,21904,142534,14828,131905,36422,150968,169189,16467,164030,30586,142392,14900,18389,164189,158194,151018,25821,134524,135092,134357,135412,25741,36478,134806,134155,135012,142505,164438,148691,1449=>134470,170573,164073,18420,151207,142530,39602,14951,169460,16365,13574,152263,169940,161992,142660,40302,38933,1467=>17369,155813,25780,21731,142668,142282,135287,14843,135279,157402,157462,162208,25834,151634,134211,36456,139681,166732,132913,1487=>18443,131497,16378,22643,142733,1493=>148936,132348,155799,134988,134550,21881,16571,17338,1502=>19124,141926,135325,33194,39157,134556,25465,14846,141173,36288,22177,25724,15939,1516=>173569,134665,142031,142537,1521=>135368,145858,14738,14854,164507,13688,155209,139463,22098,134961,142514,169760,13500,27709,151099,1538=>161140,142987,139784,173659,167117,134778,134196,157724,32659,135375,141315,141625,13819,152035,134796,135053,134826,16275,134960,134471,135503,134732,1561=>134827,134057,134472,135360,135485,16377,140950,25650,135085,144372,161337,142286,134526,134527,142417,142421,14872,134808,135367,134958,173618,158544,167122,167321,167114,38314,21708,33476,21945,1591=>171715,39974,39606,161630,142830,28992,33133,33004,23580,157042,33076,14231,21343,164029,37302,134906,134671,134775,134907,13789,151019,13833,134358,22191,141237,135369,134672,134776,135288,135496,164359,136277,134777,151120,142756,23124,135197,135198,135413,135414,22428,134673,161428,164557,135093,134779,151934,14083,135094,135552,152280,172733,149978,137274,147831,164476,22681,21096,13850,153405,31666,23400,18432,19244,40743,18919,39967,39821,154484,143677,22011,13810,22153,20008,22786,138177,194680,38737,131206,20059,20155,13630,23587,24401,24516,14586,25164,25909,27514,27701,27706,28780,29227,20012,29357,149737,32594,31035,31993,32595,156266,13505,1694=>156491,32770,32896,157202,158033,21341,34916,35265,
161970,35744,36125,38021,38264,38271,38376,167439,38886,39029,39118,39134,39267,170000,4006
// Code point => pointer entries which encode() consults first, before falling back to the flipped TABLE_CODES_TW.
// NOTE(review): presumably these pin the pointer for code points which occur more than once in TABLE_CODES_TW
// (array_flip() keeps only the last occurrence) -- confirm against the Big5 index.
const TABLE_POINTERS = [9581=>5243,5244,5246,5245,12291=>5104,20189=>10958,20521=>7007,20605=>7457,21253=>5681,21316=>5288,21375=>7045,23159=>8007,24181=>11425,24308=>10942,24408=>13805,25744=>9346,26080=>10946,26228=>8093,26462=>6119,27804=>11642,28189=>8161,28207=>8137,29038=>8178,29788=>8616,29929=>5213,30326=>10948,30729=>12389];
/** @var int The second code point of a double-character sequence, held until the next call to nextCode(); 0 when nothing is buffered */
protected $bufferedCode = 0;
/** @var array|null Code point => pointer map, built on first use by encode() by flipping TABLE_CODES_TW */
protected static $pointerCache;
/**
 * Creates a Big5 decoder over a byte string
 *
 * The name of the look-ahead buffer property is appended to the list of state
 * properties before the parent constructor runs.
 * NOTE(review): presumably this lets the parent class save and restore the buffer
 * together with the rest of the decoder state -- confirm in AbstractEncoding.
 *
 * @param string $string The byte string to decode
 * @param bool $fatal Forwarded unchanged to the parent constructor
 * @param bool $allowSurrogates Forwarded unchanged to the parent constructor
 */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
    $this->stateProps[] = 'bufferedCode';
    parent::__construct($string, $fatal, $allowSurrogates);
}
/**
 * Decodes the next character of the string and returns its code point
 *
 * The character position is advanced by one on every call, except when the end
 * of the string is reached cleanly, in which case it is restored and false is
 * returned. Four pointers (see TABLE_DOUBLES) decode to two characters: the first
 * is returned immediately with the byte position left at the start of the
 * sequence, and the second is returned by the following call, which then moves
 * the byte position past the two bytes.
 *
 * @return int|false The decoded code point, false at a clean end of string, or
 *                   for invalid input whatever errDec() returns
 */
public function nextCode() {
    $this->posChar++;
    if ($this->bufferedCode > 0) {
        // the second half of a double-character sequence is pending:
        // hand it out, clear the buffer, and only now step over the two bytes
        $code = $this->bufferedCode;
        $this->bufferedCode = 0;
        $this->posByte += 2;
        return $code;
    }
    $lead = 0x00;
    // reading past the end of the string yields "" through null coalescing,
    // so no warning is raised and nothing needs to be suppressed
    while (($b = $this->string[$this->posByte++] ?? "") !== "") {
        $b = ord($b);
        if ($lead === 0) {
            if ($b < 0x80) {
                // ASCII bytes stand for themselves
                return $b;
            } elseif ($b === 0x80 || $b === 0xFF) {
                // these two bytes are neither ASCII nor valid lead bytes
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
            }
            // anything else is a lead byte; fetch its trail byte
            $lead = $b;
            continue;
        }
        // a lead byte has been seen: $b is the candidate trail byte
        $pointer = null;
        if (($b >= 0x40 && $b <= 0x7E) || ($b >= 0xA1 && $b <= 0xFE)) {
            $offset = $b < 0x7F ? 0x40 : 0x62;
            $pointer = ($lead - 0x81) * 157 + ($b - $offset);
        }
        // only consult the tables with a real pointer, never with a null offset
        if ($pointer !== null) {
            if (isset(self::TABLE_DOUBLES[$pointer])) {
                // special double-character sequence:
                // unconsume both bytes, buffer the second character, and return the first
                $this->posByte -= 2;
                $this->bufferedCode = self::TABLE_DOUBLES[$pointer][1];
                return self::TABLE_DOUBLES[$pointer][0];
            }
            // the Hong Kong table takes precedence over the Taiwan table
            $code = self::TABLE_CODES_HK[$pointer] ?? self::TABLE_CODES_TW[$pointer] ?? null;
            if ($code !== null) {
                return $code;
            }
        }
        if ($b < 0x80) {
            // an ASCII byte after a lead byte is unconsumed so that it is decoded on its own by the next call
            return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
        }
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
    }
    // undo the increment performed by the failed read past the end of the string
    $this->posByte--;
    if ($lead === 0) {
        // clean EOF: no character was consumed, so restore the character position
        $this->posChar--;
        return false;
    }
    // dirty EOF: the string ended after a lead byte
    return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
/**
 * Returns the Big5 byte sequence for a Unicode code point
 *
 * Code points below 128 are returned as a single byte. Any other code point is
 * looked up first in TABLE_POINTERS and then in a flipped copy of TABLE_CODES_TW,
 * which is built on first use and kept in a static cache; the pointer found is
 * split into a lead byte and a trail byte.
 *
 * @param int $codePoint The Unicode code point to encode
 * @param bool $fatal Passed, negated, as the first argument of errEnc() when the code point has no pointer
 * @return string One or two bytes, or whatever errEnc() returns for a code point without a pointer
 * @throws EncoderException When $codePoint is negative or greater than 0x10FFFF
 */
public static function encode(int $codePoint, bool $fatal = true): string {
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
    }
    if ($codePoint < 128) {
        return chr($codePoint);
    }
    // explicit pointer assignments win over the generated reverse table
    $index = self::TABLE_POINTERS[$codePoint] ?? null;
    if ($index === null) {
        if (self::$pointerCache === null) {
            // build the reverse table only once, and only when it is actually needed
            self::$pointerCache = array_flip(self::TABLE_CODES_TW);
        }
        $index = self::$pointerCache[$codePoint] ?? null;
    }
    if ($index === null) {
        return self::errEnc(!$fatal, $codePoint);
    }
    $leadByte = (int) ($index / 157) + 0x81;
    $trailIndex = $index % 157;
    // the first 63 trail positions map to 0x40-0x7E, the rest to 0xA1-0xFE
    $trailByte = $trailIndex + (($trailIndex < 0x3F) ? 0x40 : 0x62);
    return chr($leadByte).chr($trailByte);
}
/**
 * Implements backward seeking $distance characters
 *
 * Each iteration steps back over one character, decrementing both $distance and
 * the character position. Since a byte in the range 0x40-0x7E may be either a
 * character by itself or the trail byte of a two-byte sequence, the bytes before
 * the current position are inspected to decide how many bytes the previous
 * character occupies. Stepping back over a double-character sequence (see
 * TABLE_DOUBLES) re-buffers its second code point, so that the decoder sits
 * between the two characters.
 *
 * @param int $distance The number of characters to move back
 * @return int The part of $distance left over when the loop stopped; non-zero only if the start of the string was reached first
 */
protected function seekBack(int $distance): int {
while ($distance > 0 && ($this->posByte > 0 || $this->bufferedCode > 0)) {
$distance--;
$this->posChar--;
if ($this->bufferedCode > 0) { // we're in the middle of a special double-character sequence
// clear the buffered character and continue
// (the byte position already points at the start of the sequence, so it is left alone)
$this->bufferedCode = 0;
continue;
}
if ($this->posByte === $this->errMark) { // the previous character was malformed
// move to the correct sync position, pop the error stack, and continue
$this->posByte = $this->errSync;
list($this->errMark, $this->errSync) = array_pop($this->errStack);
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x40 || $b1 == 0x7F || $this->posByte === $this->errMark || $this->posByte == 0) { // these bytes never appear in sequences, a byte coming after an error is necessarily its own character, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x80) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
continue;
} elseif ($b1 > 0x7F && $b2 > 0x7F) { // two non-ASCII bytes in a row with no error necessarily form a sequence
// the second byte is a character
// check if it's a double-character sequence
$pointer = ($b2 - 0x81) * 157 + ($b1 - ($b1 < 0x7F ? 0x40 : 0x62));
if (isset(self::TABLE_DOUBLES[$pointer])) {
$this->bufferedCode = self::TABLE_DOUBLES[$pointer][1];
}
continue;
} else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
// $start is the position just after the first byte examined ($b1); $pos begins at the second byte examined ($b2)
$start = $this->posByte + 2;
$pos = $this->posByte;
// go back bytes until an error mark, an ASCII byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
if ($b < 0x80) {
// step forward again so that $pos is the position just after the ASCII byte
$pos++;
break;
}
}
if (($start - $pos) % 2) { // the number of bytes is odd
// the first byte was a character
$this->posByte += 1;
continue;
} else { // the number of bytes is even
// the second byte was a character
// check if it's a double-character sequence
$pointer = ($b2 - 0x81) * 157 + ($b1 - ($b1 < 0x7F ? 0x40 : 0x62));
if (isset(self::TABLE_DOUBLES[$pointer])) {
$this->bufferedCode = self::TABLE_DOUBLES[$pointer][1];
}
continue;
}
}
}
return $distance;
}
}