Intl/lib/Encoding/GBCommon.php

212 lines
151 KiB
PHP
Raw Normal View History

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
2020-10-18 15:32:49 -04:00
abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
protected const TABLE_CODES = [19970,19972,19973,19974,19983,19986,19991,19999,20000,20001,20003,20006,20009,20014,20015,20017,20019,20021,20023,20028,20032,20033,20034,20036,20038,20042,20049,20053,20055,20058,20059,20066,20067,20068,20069,20071,20072,20074,20075,20076,20077,20078,20079,20082,20084,20085,20086,20087,20088,20089,20090,20091,20092,20093,20095,20096,20097,20098,20099,20100,20101,20103,20106,20112,20118,20119,20121,20124,20125,20126,20131,20138,20143,20144,20145,20148,20150,20151,20152,20153,20156,20157,20158,20168,20172,20175,20176,20178,20186,20187,20188,20192,20194,20198,20199,20201,20205,20206,20207,20209,20212,20216,20217,20218,20220,20222,20224,20226,20227,20228,20229,20230,20231,20232,20235,20236,20242,20243,20244,20245,20246,20252,20253,20257,20259,20264,20265,20268,20269,20270,20273,20275,20277,20279,20281,20283,20286,20287,20288,20289,20290,20292,20293,20295,20296,20297,20298,20299,20300,20306,20308,20310,20321,20322,20326,20328,20330,20331,20333,20334,20337,20338,20341,20343,20344,20345,20346,20349,20352,20353,20354,20357,20358,20359,20362,20364,20366,20368,20370,20371,20373,20374,20376,20377,20378,20380,20382,20383,20385,20386,20388,20395,20397,20400,20401,20402,20403,20404,20406,20407,20408,20409,20410,20411,20412,20413,20414,20416,20417,20418,20422,20423,20424,20425,20427,20428,20429,20434,20435,20436,20437,20438,20441,20443,20448,20450,20452,20453,20455,20459,20460,20464,20466,20468,20469,20470,20471,20473,20475,20476,20477,20479,20480,20481,20482,20483,20484,20485,20486,20487,20488,20489,20490,20491,20494,20496,20497,20499,20501,20502,20503,20507,20509,20510,20512,20514,20515,20516,20519,20523,20527,20528,20529,20530,20531,20532,20533,20534,20535,20536,20537,20539,20541,20543,20544,20545,20546,20548,20549,20550,20553,20554,20555,20557,20560,20561,20562,20563,20564,20566,20567,20568,20569,20571,20573,20574,20575,20576,20577,20578,20579,20580,20582,20583,20584,20585,20586,20587,20589,20590,20591,20592,20593,20594,20595,20596,20597,20600,2
0601,20602,20604,20605,20609,20610,20611,20612,20614,20615,20617,20618,20619,20620,20622,20623,20624,20625,20626,20627,20628,20629,20630,20631,20632,20633,20634,20635,20636,20637,20638,20639,20640,20641,20642,20644,20646,20650,20651,20653,20654,20655,20656,20657,20659,20660,20661,20662,20663,20664,20665,20668,20669,20670,20671,20672,20673,20674,20675,20676,20677,20678,20679,20680,20681,20682,20683,20684,20685,20686,20688,20689,20690,20691,20692,20693,20695,20696,20697,20699,20700,20701,20702,20703,20704,20705,20706,20707,20708,20709,20712,20713,20714,20715,20719,20720,20721,20722,20724,20726,20727,20728,20729,20730,20732,20733,20734,20735,20736,20737,20738,20739,20740,20741,20744,20745,20746,20748,20749,20750,20751,20752,20753,20755,20756,20757,20758,20759,20760,20761,20762,20763,20764,20765,20766,20767,20768,20770,20771,20772,20773,20774,20775,20776,20777,20778,20779,20780,20781,20782,20783,20784,20785,20786,20787,20788,20789,20790,20791,20792,20793,20794,20795,20796,20797,20798,20802,20807,20810,20812,20814,20815,20816,20818,20819,20823,20824,20825,20827,20829,20830,20831,20832,20833,20835,20836,20838,20839,20841,20842,20847,20850,20858,20862,20863,20867,20868,20870,20871,20874,20875,20878,20879,20880,20881,20883,20884,20888,20890,20893,20894,20895,20897,20899,20902,20903,20904,20905,20906,20909,20910,20916,20920,20921,20922,20926,20927,20929,20930,20931,20933,20936,20938,20941,20942,20944,20946,20947,20948,20949,20950,20951,20952,20953,20954,20956,20958,20959,20962,20963,20965,20966,20967,20968,20969,20970,20972,20974,20977,20978,20980,20983,20990,20996,20997,21001,21003,21004,21007,21008,21011,21012,21013,21020,21022,21023,21025,21026,21027,21029,21030,21031,21034,21036,21039,21041,21042,21044,21045,21052,21054,21060,21061,21062,21063,21064,21065,21067,21070,21071,21074,21075,21077,21079,21080,21081,21082,21083,21085,21087,21088,21090,21091,21092,21094,21096,21099,21100,21101,21102,21104,21105,21107,21108,21109,21110,21111,21112,21113,21114,21115,21116,21118,211
20,21123,21124,21125,21126,21127,21129,21130,21131,21132,21133,21134,21135,21137,21138,2114
/** @var array Map of code point to two-byte pointer; encode() consults it before falling back to the flipped TABLE_CODES table */
protected const TABLE_POINTERS = [12288=>6176];
/**
 * @var array Ascending four-byte pointer boundaries, paired by index with TABLE_OFFSETS
 *
 * nextCode() finds the first entry greater than the computed pointer and uses
 * the entry just before it as the start of the matching range; the final entry
 * therefore only serves as an upper bound.
 */
protected const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
/**
 * @var array First code point of each range in TABLE_RANGES (same index)
 *
 * A code point is obtained as offset + (pointer - range start). The null entry
 * marks a span of pointers with no code point: nextCode() tests the selected
 * offset with isset() and reports a decoding error when it is null.
 */
protected const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
2020-10-24 14:24:23 -04:00
/**
 * @var array $pointerCache A cached result of flipping the pointer-to-code-point table
 *
 * Left unset until encode() first needs it: when a code point is not found in
 * TABLE_POINTERS and the cache is still null, encode() stores
 * array_flip(self::TABLE_CODES) here and reuses it on later calls.
 */
protected static $pointerCache;
/**
 * Decodes the character at the current byte position and returns its code point.
 *
 * Bytes below 0x80 are returned unchanged and 0x80 yields U+20AC. A lead byte
 * in 0x81-0xFE starts either a two-byte sequence (looked up in TABLE_CODES) or,
 * when followed by an ASCII digit, a four-byte sequence (resolved through
 * TABLE_RANGES and TABLE_OFFSETS). For malformed input the byte position is
 * rewound so that bytes which did not belong to the bad sequence are read
 * again, and the error is handed to errDec().
 *
 * @return mixed The code point as an integer, false when the end of the string
 *   is reached with no pending bytes, or the result of errDec() for invalid input
 */
public function nextCode() {
    $first = 0;
    $second = 0;
    $third = 0;
    $this->posChar++;
    // reading past the end of the string yields "" and terminates the loop
    while (($b = $this->string[$this->posByte++] ?? "") !== "") {
        $b = ord($b);
        if ($first === 0) {
            if ($b < 0x80) {
                return $b;
            } elseif ($b === 0x80) {
                return 0x20AC;
            } elseif ($b > 0x80 && $b < 0xFF) {
                $first = $b;
                continue;
            } else {
                // 0xFF is never valid
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
            }
        } elseif ($second === 0) {
            if ($b > 0x2F && $b < 0x3A) {
                // an ASCII digit after the lead byte announces a four-byte sequence
                $second = $b;
                continue;
            }
            $codePoint = null;
            // Valid trail bytes are 0x40-0x7E and 0x80-0xFE. Bytes 0x3A-0x3F must be
            // rejected here: accepting them would make ($b - $offset) negative and
            // resolve to an entry belonging to the previous lead byte.
            if (($b > 0x3F && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) {
                $offset = ($b < 0x7F) ? 0x40 : 0x41;
                $pointer = ($first - 0x81) * 190 + ($b - $offset);
                $codePoint = self::TABLE_CODES[$pointer] ?? null;
            }
            if ($codePoint !== null) {
                return $codePoint;
            } elseif ($b < 0x80) {
                // an ASCII byte is not consumed by the error: step back so it is decoded next
                return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
            } else {
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
            }
        } elseif ($third === 0) {
            if ($b > 0x80 && $b < 0xFF) {
                $third = $b;
                continue;
            }
            // only the lead byte is consumed; the second byte and this one are read again
            $this->posByte -= 2;
            return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
        } else {
            if ($b > 0x2F && $b < 0x3A) {
                // look up code point
                $pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30;
                if ($pointer === 7457) {
                    return 0xE7C7;
                }
                $rangeCount = count(self::TABLE_RANGES);
                for ($a = 1; $a < $rangeCount; $a++) {
                    if ($pointer < self::TABLE_RANGES[$a]) {
                        $codePointOffset = self::TABLE_OFFSETS[$a - 1];
                        if ($codePointOffset === null) {
                            // the pointer lies in a span which maps to no code point
                            break;
                        }
                        return $codePointOffset + $pointer - self::TABLE_RANGES[$a - 1];
                    }
                }
                // either the null span was hit or the pointer is beyond the last boundary
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 4);
            }
            // only the lead byte is consumed; the remaining three bytes are read again
            $this->posByte -= 3;
            return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
        }
    }
    // the loop advanced one position past the end of the string
    $this->posByte--;
    if (($first + $second + $third) === 0) {
        // clean EOF
        $this->posChar--;
        return false;
    } else {
        // dirty EOF: the string ended in the middle of a multi-byte sequence
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - ($third ? 3 : ($second ? 2 : 1)));
    }
}
/**
 * Encodes a single Unicode code point as a byte string.
 *
 * ASCII is emitted as one byte. Other code points are looked up as two-byte
 * pointers (TABLE_POINTERS first, then the flipped TABLE_CODES table); when
 * that fails and static::GBK is false, a four-byte sequence is computed from
 * TABLE_OFFSETS and TABLE_RANGES. When static::GBK is true, U+20AC is
 * emitted as the single byte 0x80 and code points without a two-byte pointer
 * are reported through errEnc(). static::GBK is not defined in this part of
 * the file; presumably concrete subclasses provide it.
 *
 * @param int $codePoint The code point to encode
 * @param bool $fatal Passed negated as the first argument of errEnc() when the code point cannot be encoded
 * @return string The encoded bytes, or the result of errEnc()
 * @throws EncoderException When $codePoint is outside the range 0..0x10FFFF
 */
public static function encode(int $codePoint, bool $fatal = true): string {
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
    } elseif ($codePoint < 128) {
        return chr($codePoint);
    } elseif ($codePoint === 0xE5E5) {
        return self::errEnc(!$fatal, $codePoint);
    } elseif (static::GBK && $codePoint === 0x20AC) {
        return "\x80";
    }
    $pointer = self::TABLE_POINTERS[$codePoint] ?? (self::$pointerCache ?? (self::$pointerCache = array_flip(self::TABLE_CODES)))[$codePoint] ?? null;
    if (isset($pointer)) {
        // two-byte sequence: 190 trail positions per lead byte, skipping 0x7F
        $lead = intdiv($pointer, 190) + 0x81;
        $trail = $pointer % 190;
        $offset = ($trail < 0x3F) ? 0x40 : 0x41;
        return chr($lead).chr($trail + $offset);
    } elseif (static::GBK) {
        return self::errEnc(!$fatal, $codePoint);
    }
    if ($codePoint === 0xE7C7) {
        $pointer = 7457;
    } else {
        // Find the last range whose first code point is not greater than $codePoint.
        // The null entry of TABLE_OFFSETS has to be skipped explicitly: in PHP every
        // integer compares as >= null, so a plain comparison would select it and
        // produce wrong bytes for U+FFE6..U+FFFF.
        $index = 0;
        $offsetCount = count(self::TABLE_OFFSETS);
        for ($a = 1; $a < $offsetCount; $a++) {
            $rangeStart = self::TABLE_OFFSETS[$a];
            if ($rangeStart === null) {
                continue;
            }
            if ($codePoint < $rangeStart) {
                break;
            }
            $index = $a;
        }
        $pointer = self::TABLE_RANGES[$index] + $codePoint - self::TABLE_OFFSETS[$index];
    }
    // split the pointer into the four bytes of the sequence
    $byte1 = intdiv($pointer, 10 * 126 * 10) + 0x81;
    $pointer %= (10 * 126 * 10);
    $byte2 = intdiv($pointer, 10 * 126) + 0x30;
    $pointer %= (10 * 126);
    $byte3 = intdiv($pointer, 10) + 0x81;
    $byte4 = ($pointer % 10) + 0x30;
    return chr($byte1).chr($byte2).chr($byte3).chr($byte4);
}
/**
 * Steps the byte and character positions backwards over up to $distance characters.
 *
 * Because sequences cannot be recognised unambiguously from their last byte,
 * the bytes preceding the current position are inspected to decide whether
 * the previous character was one, two or four bytes long. Positions recorded
 * for earlier decoding errors (errMark, errSync, errStack) are honoured so that
 * a malformed sequence is stepped over as a single character.
 *
 * @param int $distance The number of characters to move back
 * @return int The number of characters still outstanding when the loop stopped
 *   (non-zero only if the start of the string was reached first)
 */
protected function seekBack(int $distance): int {
    while ($distance > 0 && $this->posByte > 0) {
        $distance--;
        $this->posChar--;

        // The previous character was malformed: jump to its recorded start
        // and restore the error state that was current before it
        if ($this->posByte === $this->errMark) {
            $this->posByte = $this->errSync;
            [$this->errMark, $this->errSync] = array_pop($this->errStack);
            continue;
        }

        // Examine the final byte of the previous character
        $tail = ord(@$this->string[--$this->posByte]);

        // A high final byte can only be the trail of a two-byte character,
        // so the byte before it is the lead
        if ($tail > 0x80) {
            $this->posByte--;
            continue;
        }

        // Bytes below the digits, a byte sitting on the error mark, and the
        // very first byte of the string all stand alone
        if ($tail < 0x30 || $this->errMark === $this->posByte || $this->posByte === 0) {
            continue;
        }

        // An ASCII digit (the lower bound is already established above) is
        // either a character of its own or the end of a four-byte sequence
        if ($tail <= 0x39) {
            // A four-byte sequence needs three more bytes with no error among them
            if ($this->posByte >= 3 && !($this->errMark > ($this->posByte - 3))) {
                $third = ord(@$this->string[$this->posByte - 1]);
                $second = ord(@$this->string[$this->posByte - 2]);
                if ($third > 0x80 && $second >= 0x30 && $second <= 0x39) {
                    // high byte preceded by a digit: the sequence starts three bytes back
                    $this->posByte -= 3;
                }
            }
            // otherwise the digit itself was the character
            continue;
        }

        // Remaining case: the byte may be a single-byte character or the
        // trail of a two-byte character; look at the byte in front of it
        $before = ord(@$this->string[--$this->posByte]);
        if ($before < 0x81) {
            // such bytes never lead a sequence, so the final byte stood alone
            $this->posByte++;
            continue;
        }

        // The preceding byte is high, but it might itself be a trail byte.
        // Walk back to the start of the run of high bytes (stopping at the
        // error mark or the start of the string) and decide by parity.
        $runEnd = $this->posByte + 2;
        $runStart = $this->posByte;
        while ($runStart > 0 && $runStart > $this->errMark) {
            if (ord(@$this->string[$runStart - 1]) < 0x81) {
                break;
            }
            $runStart--;
        }
        if (($runEnd - $runStart) % 2) {
            // odd count: the high bytes pair up among themselves and the final byte stood alone
            $this->posByte++;
        }
        // even count: the preceding byte leads a two-byte character and the position is already on it
    }
    return $distance;
}
}