A set of dependency-free basic internationalization tools
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
52 KiB

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
/** The canonical name of this encoding */
const NAME = "Shift_JIS";
/** The labels under which this encoding is known; every entry is written in lowercase */
const LABELS = [
"csshiftjis",
"ms932",
"ms_kanji",
"shift-jis",
"shift_jis",
"sjis",
"windows-31j",
"x-sjis",
];
const TABLE_CODES = [12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,12295,12540,8213,8208,65295,65340,65374,8741,65372,8230,8229,8216,8217,8220,8221,65288,65289,12308,12309,65339,65341,65371,65373,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,65291,65293,177,215,247,65309,8800,65308,65310,8806,8807,8734,8756,9794,9792,176,8242,8243,8451,65509,65284,65504,65505,65285,65283,65286,65290,65312,167,9734,9733,9675,9679,9678,9671,9670,9633,9632,9651,9650,9661,9660,8251,12306,8594,8592,8593,8595,12307,119=>8712,8715,8838,8839,8834,8835,8746,8745,135=>8743,8744,65506,8658,8660,8704,8707,153=>8736,8869,8978,8706,8711,8801,8786,8810,8811,8730,8765,8733,8757,8747,8748,175=>8491,8240,9839,9837,9834,8224,8225,182,187=>9711,203=>65296,65297,65298,65299,65300,65301,65302,65303,65304,65305,220=>65313,65314,65315,65316,65317,65318,65319,65320,65321,65322,65323,65324,65325,65326,65327,65328,65329,65330,65331,65332,65333,65334,65335,65336,65337,65338,252=>65345,65346,65347,65348,65349,65350,65351,65352,65353,65354,65355,65356,65357,65358,65359,65360,65361,65362,65363,65364,65365,65366,65367,65368,65369,65370,282=>12353,12354,12355,12356,12357,12358,12359,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369,12370,12371,12372,12373,12374,12375,12376,12377,12378,12379,12380,12381,12382,12383,12384,12385,12386,12387,12388,12389,12390,12391,12392,12393,12394,12395,12396,12397,12398,12399,12400,12401,12402,12403,12404,12405,12406,12407,12408,12409,12410,12411,12412,12413,12414,12415,12416,12417,12418,12419,12420,12421,12422,12423,12424,12425,12426,12427,12428,12429,12430,12431,12432,12433,12434,12435,376=>12449,12450,12451,12452,12453,12454,12455,12456,12457,12458,12459,12460,12461,12462,12463,12464,12465,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475,12476,12477,12478,12479,12480,12481,12482,12483,12484,12485,12486,12487,12488,12489,12490,12491,12492,124
93,12494,12495,12496,12497,12498,12499,12500,12501,12502,12503,12504,12505,12506,12507,12508,12509,12510,12511,12512,12513,12514,12515,12516,12517,12518,12519,12520,12521,12522,12523,12524,12525,12526,12527,12528,12529,12530,12531,12532,12533,12534,470=>913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,502=>945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,564=>1040,1041,1042,1043,1044,1045,1025,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,612=>1072,1073,1074,1075,1076,1077,1105,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,658=>9472,9474,9484,9488,9496,9492,9500,9516,9508,9524,9532,9473,9475,9487,9491,9499,9495,9507,9523,9515,9531,9547,9504,9519,9512,9527,9535,9501,9520,9509,9528,9538,1128=>9312,9313,9314,9315,9316,9317,9318,9319,9320,9321,9322,9323,9324,9325,9326,9327,9328,9329,9330,9331,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,1159=>13129,13076,13090,13133,13080,13095,13059,13110,13137,13143,13069,13094,13091,13099,13130,13115,13212,13213,13214,13198,13199,13252,13217,1190=>13179,12317,12319,8470,13261,8481,12964,12965,12966,12967,12968,12849,12850,12857,13182,13181,13180,8786,8801,8747,8750,8721,8730,8869,8736,8735,8895,8757,8745,8746,1410=>20124,21782,23043,38463,21696,24859,25384,23030,36898,33909,33564,31312,24746,25569,28197,26093,33894,33446,39925,26771,22311,26017,25201,23451,22992,34427,39156,32098,32190,39822,25110,31903,34999,23433,24245,25353,26263,26696,38343,38797,26447,20197,20234,20301,20381,20553,22258,22839,22996,23041,23561,24799,24847,24944,26131,26885,28858,30031,30064,31227,32173,32239,32963,33806,34915,35586,36949,36986,21307,20117,20133,22495,32946,37057,30959,19968,22769,28322,36920,31282,33576,33419,39983,20801,21360,21693,21729,22240,23035,24341,39154,28139,32996,34093,384
98,38512,38560,38907,21515,21491,23431,28879,32701,36802,38632,21359,40284,31418,19985,3086
const TABLE_CODES_EXTRA = [8272=>32394,35100,37704,37512,34012,20425,28859,26161,26824,37625,26363,24389,20008,20193,20220,20224,20227,20281,20310,20370,20362,20378,20372,20429,20544,20514,20479,20510,20550,20592,20546,20628,20724,20696,20810,20836,20893,20926,20972,21013,21148,21158,21184,21211,21248,21255,21284,21362,21395,21426,21469,64014,21660,21642,21673,21759,21894,22361,22373,22444,22472,22471,64015,64016,22686,22706,22795,22867,22875,22877,22883,22948,22970,23382,23488,29999,23512,23532,23582,23718,23738,23797,23847,23891,64017,23874,23917,23992,23993,24016,24353,24372,24423,24503,24542,24669,24709,24714,24798,24789,24864,24818,24849,24887,24880,24984,25107,25254,25589,25696,25757,25806,25934,26112,26133,26171,26121,26158,26142,26148,26213,26199,26201,64018,26227,26265,26272,26290,26303,26362,26382,63785,26470,26555,26706,26560,26625,26692,26831,64019,26984,64020,27032,27106,27184,27243,27206,27251,27262,27362,27364,27606,27711,27740,27782,27759,27866,27908,28039,28015,28054,28076,28111,28152,28146,28156,28217,28252,28199,28220,28351,28552,28597,28661,28677,28679,28712,28805,28843,28943,28932,29020,28998,28999,64021,29121,29182,29361,29374,29476,64022,29559,29629,29641,29654,29667,29650,29703,29685,29734,29738,29737,29742,29794,29833,29855,29953,30063,30338,30364,30366,30363,30374,64023,30534,21167,30753,30798,30820,30842,31024,64024,64025,64026,31124,64027,31131,31441,31463,64028,31467,31646,64029,32072,32092,32183,32160,32214,32338,32583,32673,64030,33537,33634,33663,33735,33782,33864,33972,34131,34137,34155,64031,34224,64032,64033,34823,35061,35346,35383,35449,35495,35518,35551,64034,35574,35667,35711,36080,36084,36114,36214,64035,36559,64036,64037,36967,37086,64038,37141,37159,37338,37335,37342,37357,37358,37348,37349,37382,37392,37386,37434,37440,37436,37454,37465,37457,37433,37479,37543,37495,37496,37607,37591,37593,37584,64039,37589,37600,37587,37669,37665,37627,64040,37662,37631,37661,37634,37744,37719,37796,37830,37854,37880,37937,37957,37960,38290
,63964,64041,38557,38575,38707,38715,38723,38733,38735,38737,38741,38999,39013,64042,64043,39207,64044,39326,39502,39641,39644,39797,39794,39823,39857,39867,39936,40304,40299,64045,40473,40657,8634=>8560,8561,8562,8563,8564,8565,8566,8567,8568,8569,65506,65508,65287,65282];
/** Code-point-to-pointer overrides which encode() consults before falling back to the flipped TABLE_CODES
*
* Several of the code points listed here (e.g. 8786 and 8801) visibly occur more than once in TABLE_CODES.
* array_flip() keeps only the last key for a repeated value, so this table presumably pins the pointer
* the encoder is meant to use for such code points instead -- confirm against the table generator
*/
const TABLE_POINTERS = [8470=>1193,8481=>1195,8544=>1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,8730=>162,8736=>153,8745=>126,125,166,8757=>165,8786=>159,8801=>158,8869=>154,12849=>1201,65506=>137];
/** Cache of array_flip(self::TABLE_CODES); it stays unset until encode() first needs a lookup which TABLE_POINTERS cannot answer */
protected static $pointerCache;
/** Decodes the next character from the string and returns its Unicode code point
*
* Returns false, without advancing, when the end of the string has already been reached.
*
* Invalid lead bytes, truncated sequences, and byte pairs without a table entry are handed
* to errDec() (defined outside this class body) along with the current error mode, the
* character offset and a byte offset; whatever errDec() returns is returned to the caller
*/
public function nextCode() {
    // reading past the end of the string yields the empty string without raising a warning
    $byte = $this->string[$this->posByte++] ?? "";
    if ($byte === "") {
        // clean EOF: undo the speculative advance
        $this->posByte--;
        return false;
    }
    $this->posChar++;
    $lead = ord($byte);
    if ($lead < 0x81) {
        // bytes 0x00 through 0x80 stand for the code point of the same value
        return $lead;
    } elseif ($lead >= 0xA1 && $lead <= 0xDF) {
        // single bytes 0xA1..0xDF map linearly onto U+FF61..U+FF9F
        return 0xFF61 - 0xA1 + $lead;
    } elseif ($lead === 0xA0 || $lead > 0xFC) {
        // the only remaining values which cannot start a two-byte sequence
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
    }
    // $lead is now within 0x81..0x9F or 0xE0..0xFC and requires a trail byte
    $byte = $this->string[$this->posByte++] ?? "";
    if ($byte === "") {
        // dirty EOF: the string ended in the middle of a sequence
        return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
    }
    $trail = ord($byte);
    $codePoint = null;
    if (($trail >= 0x40 && $trail <= 0x7E) || ($trail >= 0x80 && $trail <= 0xFC)) {
        // 0x7F is never a trail byte, so trails above it are shifted down by one more;
        // likewise the two lead ranges are joined into one contiguous run of rows
        $offset = ($trail < 0x7F) ? 0x40 : 0x41;
        $leadOffset = ($lead < 0xA0) ? 0x81 : 0xC1;
        $pointer = ($lead - $leadOffset) * 188 + $trail - $offset;
        if ($pointer >= 8836 && $pointer <= 10715) {
            // this pointer range is mapped arithmetically, starting at U+E000
            return 0xE000 - 8836 + $pointer;
        }
        $codePoint = self::TABLE_CODES[$pointer] ?? self::TABLE_CODES_EXTRA[$pointer] ?? null;
    }
    if ($codePoint !== null) {
        return $codePoint;
    }
    if ($trail < 0x80) {
        // a trail byte below 0x80 is not consumed by the error, so that it is read again as its own character
        $this->posByte--;
        $errOffset = 1;
    } else {
        $errOffset = 2;
    }
    return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $errOffset);
}
/** Encodes a single Unicode code point as a Shift_JIS byte string
*
* An EncoderException is thrown if $codePoint is less than 0 or greater than 1114111 (0x10FFFF)
*
* Code points without a Shift_JIS representation are passed to errEnc() together with the negation of $fatal.
* As originally documented, an exception is thrown when $fatal is true, and an HTML character reference
* is substituted otherwise; errEnc() itself is defined outside this class body
*/
public static function encode(int $codePoint, bool $fatal = true): string {
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
    }
    if ($codePoint <= 0x80) {
        // code points up to and including U+0080 are emitted as the byte of the same value
        return chr($codePoint);
    }
    if ($codePoint >= 0xFF61 && $codePoint <= 0xFF9F) {
        // U+FF61..U+FF9F map linearly onto the single bytes 0xA1..0xDF
        return chr($codePoint - 0xFF61 + 0xA1);
    }
    // two code points are emitted as fixed single bytes
    if ($codePoint === 0xA5) {
        return chr(0x5C);
    }
    if ($codePoint === 0x203E) {
        return chr(0x7E);
    }
    // U+2212 is looked up as though it were U+FF0D
    if ($codePoint === 0x2212) {
        $codePoint = 0xFF0D;
    }
    // the override table takes precedence; the flipped main table is only built when first needed
    $pointer = self::TABLE_POINTERS[$codePoint] ?? null;
    if ($pointer === null) {
        if (self::$pointerCache === null) {
            self::$pointerCache = array_flip(self::TABLE_CODES);
        }
        $pointer = self::$pointerCache[$codePoint] ?? null;
    }
    if ($pointer === null) {
        return self::errEnc(!$fatal, $codePoint);
    }
    // a pointer addresses a grid 188 cells wide: the row selects the lead byte and the cell the trail byte
    $row = intdiv($pointer, 188);
    $cell = $pointer % 188;
    $leadByte = $row + (($row < 0x1F) ? 0x81 : 0xC1);
    $trailByte = $cell + (($cell < 0x3F) ? 0x40 : 0x41);
    return chr($leadByte).chr($trailByte);
}
/** Implements backward seeking $distance characters
*
* Each pass of the outer loop steps the byte position back over one character and decrements
* both $distance and the character position. The loop stops once $distance reaches zero or the
* byte position reaches the start of the string.
*
* Because a byte in the range 0x40..0xFC (other than 0x7F) may be either a character by itself or the
* trail of a two-byte sequence, the bytes before it are examined to decide where the character starts.
*
* Returns what is left of $distance, i.e. the number of characters which could not be stepped over
*/
protected function seekBack(int $distance): int {
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;
if ($this->posByte === $this->errMark) { // the previous character was malformed
// move to the correct sync position, pop the error stack, and continue
$this->posByte = $this->errSync;
// the popped pair becomes the current error mark and sync position
list($this->errMark, $this->errSync) = array_pop($this->errStack);
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x40 || $b1 > 0xFC || $b1 === 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence
// the first byte was a character
// step forward again so that only the first byte examined is consumed
$this->posByte += 1;
continue;
} elseif ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character
// the two bytes form a character
continue;
} else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
// $start is the position just past the first byte examined; $pos walks backward from the second
$start = $this->posByte + 2;
$pos = $this->posByte;
// go back bytes until an error mark, a definite byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
// the same test as applied to $b2 above: this byte cannot be a lead, so the run of candidates starts just after it
if ($b < 0x81 || ($b >= 0xA0 && $b <= 0xDF) || $b > 0xFC) {
$pos++;
break;
}
}
// the bytes from $pos up to $start are taken in pairs, and the parity tells whether the first byte examined is left over
if (($start - $pos) % 2) { // the number of bytes is odd
// the first byte was a character
$this->posByte += 1;
continue;
} else { // the number of bytes is even
// the two bytes form a character, so the position stays at the second byte examined
continue;
}
}
}
return $distance;
}
}