A set of dependency-free basic internationalization tools
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

160 lines
116 KiB

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class ShiftJIS extends AbstractEncoding implements StatelessEncoding {
/** The name of this encoding */
const NAME = "Shift_JIS";
/** Lower-case labels associated with this encoding
*
* NOTE(review): presumably these are the aliases by which the encoding may be requested; the code consuming this list is not visible here, so confirm how matching (case, whitespace) is performed
*/
const LABELS = [
"csshiftjis",
"ms932",
"ms_kanji",
"shift-jis",
"shift_jis",
"sjis",
"windows-31j",
"x-sjis",
];
const TABLE_CODES_DEC = [12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,12295,12540,8213,8208,65295,65340,65374,8741,65372,8230,8229,8216,8217,8220,8221,65288,65289,12308,12309,65339,65341,65371,65373,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,65291,65293,177,215,247,65309,8800,65308,65310,8806,8807,8734,8756,9794,9792,176,8242,8243,8451,65509,65284,65504,65505,65285,65283,65286,65290,65312,167,9734,9733,9675,9679,9678,9671,9670,9633,9632,9651,9650,9661,9660,8251,12306,8594,8592,8593,8595,12307,119=>8712,8715,8838,8839,8834,8835,8746,8745,135=>8743,8744,65506,8658,8660,8704,8707,153=>8736,8869,8978,8706,8711,8801,8786,8810,8811,8730,8765,8733,8757,8747,8748,175=>8491,8240,9839,9837,9834,8224,8225,182,187=>9711,203=>65296,65297,65298,65299,65300,65301,65302,65303,65304,65305,220=>65313,65314,65315,65316,65317,65318,65319,65320,65321,65322,65323,65324,65325,65326,65327,65328,65329,65330,65331,65332,65333,65334,65335,65336,65337,65338,252=>65345,65346,65347,65348,65349,65350,65351,65352,65353,65354,65355,65356,65357,65358,65359,65360,65361,65362,65363,65364,65365,65366,65367,65368,65369,65370,282=>12353,12354,12355,12356,12357,12358,12359,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369,12370,12371,12372,12373,12374,12375,12376,12377,12378,12379,12380,12381,12382,12383,12384,12385,12386,12387,12388,12389,12390,12391,12392,12393,12394,12395,12396,12397,12398,12399,12400,12401,12402,12403,12404,12405,12406,12407,12408,12409,12410,12411,12412,12413,12414,12415,12416,12417,12418,12419,12420,12421,12422,12423,12424,12425,12426,12427,12428,12429,12430,12431,12432,12433,12434,12435,376=>12449,12450,12451,12452,12453,12454,12455,12456,12457,12458,12459,12460,12461,12462,12463,12464,12465,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475,12476,12477,12478,12479,12480,12481,12482,12483,12484,12485,12486,12487,12488,12489,12490,12491,12492
,12493,12494,12495,12496,12497,12498,12499,12500,12501,12502,12503,12504,12505,12506,12507,12508,12509,12510,12511,12512,12513,12514,12515,12516,12517,12518,12519,12520,12521,12522,12523,12524,12525,12526,12527,12528,12529,12530,12531,12532,12533,12534,470=>913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,502=>945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,564=>1040,1041,1042,1043,1044,1045,1025,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,612=>1072,1073,1074,1075,1076,1077,1105,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,658=>9472,9474,9484,9488,9496,9492,9500,9516,9508,9524,9532,9473,9475,9487,9491,9499,9495,9507,9523,9515,9531,9547,9504,9519,9512,9527,9535,9501,9520,9509,9528,9538,1128=>9312,9313,9314,9315,9316,9317,9318,9319,9320,9321,9322,9323,9324,9325,9326,9327,9328,9329,9330,9331,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,1159=>13129,13076,13090,13133,13080,13095,13059,13110,13137,13143,13069,13094,13091,13099,13130,13115,13212,13213,13214,13198,13199,13252,13217,1190=>13179,12317,12319,8470,13261,8481,12964,12965,12966,12967,12968,12849,12850,12857,13182,13181,13180,8786,8801,8747,8750,8721,8730,8869,8736,8735,8895,8757,8745,8746,1410=>20124,21782,23043,38463,21696,24859,25384,23030,36898,33909,33564,31312,24746,25569,28197,26093,33894,33446,39925,26771,22311,26017,25201,23451,22992,34427,39156,32098,32190,39822,25110,31903,34999,23433,24245,25353,26263,26696,38343,38797,26447,20197,20234,20301,20381,20553,22258,22839,22996,23041,23561,24799,24847,24944,26131,26885,28858,30031,30064,31227,32173,32239,32963,33806,34915,35586,36949,36986,21307,20117,20133,22495,32946,37057,30959,19968,22769,28322,36920,31282,33576,33419,39983,20801,21360,21693,21729,22240,23035,24341,39154,28139,32996,34093
,38498,38512,38560,38907,21515,21491,23431,28879,32701,36802,38632,21359,40284,31418,19985,
const TABLE_CODES_ENC = [167=>87,14,176=>74,61,180=>12,182=>182,215=>62,247=>63,913=>470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,931=>487,488,489,490,491,492,493,945=>502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,963=>519,520,521,522,523,524,525,1025=>570,1040=>564,565,566,567,568,569,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,612,613,614,615,616,617,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,1105=>618,8208=>29,8213=>28,8216=>37,38,8220=>39,40,8224=>180,181,8229=>36,35,8240=>176,8242=>75,76,8251=>101,8451=>77,8470=>1193,8481=>1195,8491=>175,8544=>1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,8560=>10716,10717,10718,10719,10720,10721,10722,10723,10724,10725,8592=>104,105,103,106,8658=>138,8660=>139,8704=>140,8706=>156,141,8711=>157,119,8715=>120,8721=>1211,8730=>162,8733=>164,70,1215,153,8741=>33,8743=>135,136,126,125,166,167,8750=>1210,8756=>71,165,8765=>163,8786=>159,8800=>65,158,8806=>68,69,8810=>160,161,8834=>123,124,8838=>121,122,8869=>154,8895=>1216,8978=>155,9312=>1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,9472=>658,669,659,670,9484=>660,9487=>671,661,9491=>672,663,9495=>674,662,9499=>673,664,685,9504=>680,9507=>675,666,687,9512=>682,9515=>677,665,9519=>681,686,9523=>676,667,9527=>683,688,9531=>678,668,9535=>684,9538=>689,9547=>679,9632=>96,95,9650=>98,97,9660=>100,99,9670=>94,93,9675=>90,9678=>92,91,9711=>187,9733=>89,88,9792=>73,9794=>72,9834=>179,9837=>178,9839=>177,12288=>0,1,2,22,12293=>24,25,26,49,50,51,52,53,54,55,56,57,58,102,107,43,44,12317=>1191,12319=>1192,12353=>282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,34
8,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,12443=>10,11,20,21,12449=>376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,12539=>5,27,18,19,12849=>1201,1202,12857=>1203,12964=>1196,1197,1198,1199,1200,13059=>1165,13069=>1169,13076=>1160,13080=>1163,13090=>1161,1171,13094=>1170,1164,13099=>1172,13110=>1166,13115=>1174,13129=>1159,1173,13133=>1162,13137=>1167,13143=>1168,13179=>1190,1206,1205,1204,13198=>1178,1179,13212=>1175,1176,1177,13217=>1181,13252=>1180,13261=>1194,19968=>1485,3285,19971=>2560,19975=>4039,2795,2459,2794,1625,19981=>3811,4166,19984=>4419,1518,19988=>1769,4420,2917,4639,1933,3879,19998=>2796,20001=>4259,20006=>3887,20008=>10756,20010=>4421,20013=>3265,20017=>4422,2063,20022=>4423,20024=>1846,3211,20027=>2608,4424,20031=>4425,20034=>4426,3592,20037=>1934,20043=>3594,20045=>3544,2196,3966,20053=>6808,4427,2797,4428,1618,20061=>2042,2241,4114,20066=>4987,20081=>4215,20083=>3570,20094=>1790,20096=>1900,20098=>4429,20101=>4430,4256,20104=>4164,3079,4432,2529,3560,20110=>4435,20113=>1533,2224,20116=>2223,1479,20120=>4366,4365,20123=>2368,1410,20126=>4436,4437,4438,3967,4439,20132=>2243,1480,4029,20136=>1971,20139=>1972,1973,3355,4257,20144=>4440,20147=>4441,20150=>4442,20154=>2864,20160=>2657,2865,4447,20164=>4445,20166=>4446,1935,20170=>2352,1677,20173=>4444,4443,3862,20180=>2481,2480,3131,4448,3812,2984,20189=>23,4449,4451,20193=>10757,20195=>3170,4294,1451,20205=>4450,1627,20208=>2009,20210=>3266,20214=>2146,4452,20219=>3575,10758,20224=>10759,1860,20227=>10760,20233=>4453,1452,20237=>2225,1861,3849,3683,1936,20250=>1678,20252=>4488,3416,20271=>3647,4455,20276=>3693,20278=>4295,20280=>2832,10761,2482,20284=>2530,1629,20291=>3338,20294=>3197,4
459,20301=>1453,3356,2658,2369,4139,20307=>3147,20309=>1628,10762,4458,20313=>4165,4454,445
/** Decodes the next character from the string and returns its code point
*
* Returns false, without advancing, if the end of the string has been reached
*
* Bytes below 0x81 are returned as-is, bytes 0xA1 through 0xDF map to U+FF61 through U+FF9F, and bytes 0x81-0x9F and 0xE0-0xFC start a two-byte sequence which is resolved via TABLE_CODES_DEC
*
* For any invalid or incomplete sequence the result of errDec() is returned; what that is depends on the error mode, which is handled outside this method
*/
public function nextCode() {
    if (($b = @$this->string[$this->posByte++]) === "") {
        // clean EOF: undo the speculative advance
        $this->posByte--;
        return false;
    }
    $this->posChar++;
    $b = ord($b);
    if ($b < 0x81) {
        // single byte returned unchanged
        return $b;
    }
    if ($b >= 0xA1 && $b <= 0xDF) {
        // single byte mapped into the U+FF61..U+FF9F block
        return 0xFF61 - 0xA1 + $b;
    }
    if (!(($b >= 0x81 && $b <= 0x9F) || ($b >= 0xE0 && $b <= 0xFC))) {
        // 0xA0 and 0xFD-0xFF are neither characters nor lead bytes
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
    }
    $lead = $b;
    if (($b = @$this->string[$this->posByte++]) === "") {
        // dirty EOF: a lead byte with nothing after it
        $this->posByte--;
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
    }
    $b = ord($b);
    $codePoint = null;
    // only 0x40-0x7E and 0x80-0xFC are valid trail bytes; both bounds of each
    //   range must be tested, otherwise 0x7F and 0xFD-0xFF would be accepted
    //   and silently decoded as some other character
    if (($b >= 0x40 && $b <= 0x7E) || ($b >= 0x80 && $b <= 0xFC)) {
        $offset = ($b < 0x7F) ? 0x40 : 0x41;
        $leadOffset = ($lead < 0xA0) ? 0x81 : 0xC1;
        $pointer = ($lead - $leadOffset) * 188 + $b - $offset;
        if ($pointer >= 8836 && $pointer <= 10715) {
            // this pointer range maps linearly onto code points starting at U+E000
            return 0xE000 - 8836 + $pointer;
        }
        $codePoint = self::TABLE_CODES_DEC[$pointer] ?? null;
    }
    if ($codePoint !== null) {
        return $codePoint;
    }
    if ($b < 0x80) {
        // the offending trail byte is below 0x80: put it back so that it is
        //   decoded on its own by the next call; only the lead byte is in error
        $this->posByte--;
        $errOffset = 1;
    } else {
        // both bytes are consumed by the error
        $errOffset = 2;
    }
    return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $errOffset);
}
/** Encodes a single Unicode code point as a Shift_JIS byte string
*
* An exception is thrown if $codePoint is negative or exceeds 1114111 (0x10FFFF)
*
* When the code point has no Shift_JIS representation the outcome is delegated to errEnc(): with $fatal set to true an exception is thrown, otherwise an HTML character reference is substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
    }
    // code points up to and including U+0080 are emitted as a single identical byte
    if ($codePoint <= 0x80) {
        return chr($codePoint);
    }
    // U+FF61..U+FF9F occupy the single bytes 0xA1..0xDF
    if ($codePoint >= 0xFF61 && $codePoint <= 0xFF9F) {
        return chr($codePoint - 0xFF61 + 0xA1);
    }
    // two code points have fixed single-byte encodings
    if ($codePoint === 0xA5) {
        return chr(0x5C);
    }
    if ($codePoint === 0x203E) {
        return chr(0x7E);
    }
    // U+2212 is looked up as though it were U+FF0D
    if ($codePoint === 0x2212) {
        $codePoint = 0xFF0D;
    }
    $index = self::TABLE_CODES_ENC[$codePoint] ?? null;
    if ($index === null) {
        return self::errEnc(!$fatal, $codePoint);
    }
    // split the table pointer into its row (lead) and column (trail)
    $row = intdiv($index, 188);
    $column = $index % 188;
    $rowBase = ($row < 0x1F) ? 0x81 : 0xC1;
    // columns from 0x3F upward are shifted by one extra, which steps over byte 0x7F
    $columnBase = ($column < 0x3F) ? 0x40 : 0x41;
    return chr($row + $rowBase).chr($column + $columnBase);
}
/** Implements backward seeking $distance characters
*
* The byte and character positions are stepped backward one character at a time until either $distance characters have been passed or the start of the string is reached
*
* Returns the number of characters which remained to be skipped when the loop stopped (zero if the full distance was covered)
*/
protected function seekBack(int $distance): int {
    while ($distance > 0 && $this->posByte > 0) {
        $distance--;
        $this->posChar--;
        if ($this->posByte === $this->errMark) { // the previous character was malformed
            // move to the correct sync position, pop the error stack, and continue
            $this->posByte = $this->errSync;
            list($this->errMark, $this->errSync) = array_pop($this->errStack);
            continue;
        }
        // go back one byte
        $b1 = ord(@$this->string[--$this->posByte]);
        if ($b1 < 0x40 || $b1 > 0xFC || $b1 == 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
            // the byte is a character
            continue;
        }
        // go back a second byte
        $b2 = ord(@$this->string[--$this->posByte]);
        // lead-byte eligibility must be tested before the position is considered:
        //   a byte at the start of the string (or just after an error) which cannot
        //   be a lead byte is a character by itself, so the byte after it is also
        //   a character by itself
        if ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence
            // the first byte was a character
            $this->posByte += 1;
            continue;
        }
        if ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character, and the byte there can be a lead
            // the two bytes form a character
            continue;
        }
        // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
        $start = $this->posByte + 2;
        $pos = $this->posByte;
        // go back bytes until an error mark, a definite byte, or start of string
        while ($pos > 0 && $pos > $this->errMark) {
            $b = ord(@$this->string[--$pos]);
            if ($b < 0x81 || ($b >= 0xA0 && $b <= 0xDF) || $b > 0xFC) {
                // this byte cannot be a lead, so a character starts right after it
                $pos++;
                break;
            }
        }
        // pair the bytes up from the known character start: an odd count leaves
        //   the first byte on its own, while an even count makes it a trail byte
        if (($start - $pos) % 2) {
            // the first byte was a character
            $this->posByte += 1;
        }
        // otherwise the two bytes form a character and the position is already correct
    }
    return $distance;
}
}