A set of dependency-free basic internationalization tools
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

272 lines
60 KiB

<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
/** Name of the encoding implemented by this class */
const NAME = "ISO-2022-JP";
/** Lower-case labels associated with this encoding (presumably used for label lookup elsewhere; confirm against callers) */
const LABELS = [
"csiso2022jp",
"iso-2022-jp",
];
const TABLE_JIS0208 = [12288,12289,12290,65292,65294,12539,65306,65307,65311,65281,12443,12444,180,65344,168,65342,65507,65343,12541,12542,12445,12446,12291,20189,12293,12294,12295,12540,8213,8208,65295,65340,65374,8741,65372,8230,8229,8216,8217,8220,8221,65288,65289,12308,12309,65339,65341,65371,65373,12296,12297,12298,12299,12300,12301,12302,12303,12304,12305,65291,65293,177,215,247,65309,8800,65308,65310,8806,8807,8734,8756,9794,9792,176,8242,8243,8451,65509,65284,65504,65505,65285,65283,65286,65290,65312,167,9734,9733,9675,9679,9678,9671,9670,9633,9632,9651,9650,9661,9660,8251,12306,8594,8592,8593,8595,12307,119=>8712,8715,8838,8839,8834,8835,8746,8745,135=>8743,8744,65506,8658,8660,8704,8707,153=>8736,8869,8978,8706,8711,8801,8786,8810,8811,8730,8765,8733,8757,8747,8748,175=>8491,8240,9839,9837,9834,8224,8225,182,187=>9711,203=>65296,65297,65298,65299,65300,65301,65302,65303,65304,65305,220=>65313,65314,65315,65316,65317,65318,65319,65320,65321,65322,65323,65324,65325,65326,65327,65328,65329,65330,65331,65332,65333,65334,65335,65336,65337,65338,252=>65345,65346,65347,65348,65349,65350,65351,65352,65353,65354,65355,65356,65357,65358,65359,65360,65361,65362,65363,65364,65365,65366,65367,65368,65369,65370,282=>12353,12354,12355,12356,12357,12358,12359,12360,12361,12362,12363,12364,12365,12366,12367,12368,12369,12370,12371,12372,12373,12374,12375,12376,12377,12378,12379,12380,12381,12382,12383,12384,12385,12386,12387,12388,12389,12390,12391,12392,12393,12394,12395,12396,12397,12398,12399,12400,12401,12402,12403,12404,12405,12406,12407,12408,12409,12410,12411,12412,12413,12414,12415,12416,12417,12418,12419,12420,12421,12422,12423,12424,12425,12426,12427,12428,12429,12430,12431,12432,12433,12434,12435,376=>12449,12450,12451,12452,12453,12454,12455,12456,12457,12458,12459,12460,12461,12462,12463,12464,12465,12466,12467,12468,12469,12470,12471,12472,12473,12474,12475,12476,12477,12478,12479,12480,12481,12482,12483,12484,12485,12486,12487,12488,12489,12490,12491,12492,1
2493,12494,12495,12496,12497,12498,12499,12500,12501,12502,12503,12504,12505,12506,12507,12508,12509,12510,12511,12512,12513,12514,12515,12516,12517,12518,12519,12520,12521,12522,12523,12524,12525,12526,12527,12528,12529,12530,12531,12532,12533,12534,470=>913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,931,932,933,934,935,936,937,502=>945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,963,964,965,966,967,968,969,564=>1040,1041,1042,1043,1044,1045,1025,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,612=>1072,1073,1074,1075,1076,1077,1105,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,658=>9472,9474,9484,9488,9496,9492,9500,9516,9508,9524,9532,9473,9475,9487,9491,9499,9495,9507,9523,9515,9531,9547,9504,9519,9512,9527,9535,9501,9520,9509,9528,9538,1128=>9312,9313,9314,9315,9316,9317,9318,9319,9320,9321,9322,9323,9324,9325,9326,9327,9328,9329,9330,9331,8544,8545,8546,8547,8548,8549,8550,8551,8552,8553,1159=>13129,13076,13090,13133,13080,13095,13059,13110,13137,13143,13069,13094,13091,13099,13130,13115,13212,13213,13214,13198,13199,13252,13217,1190=>13179,12317,12319,8470,13261,8481,12964,12965,12966,12967,12968,12849,12850,12857,13182,13181,13180,8786,8801,8747,8750,8721,8730,8869,8736,8735,8895,8757,8745,8746,1410=>20124,21782,23043,38463,21696,24859,25384,23030,36898,33909,33564,31312,24746,25569,28197,26093,33894,33446,39925,26771,22311,26017,25201,23451,22992,34427,39156,32098,32190,39822,25110,31903,34999,23433,24245,25353,26263,26696,38343,38797,26447,20197,20234,20301,20381,20553,22258,22839,22996,23041,23561,24799,24847,24944,26131,26885,28858,30031,30064,31227,32173,32239,32963,33806,34915,35586,36949,36986,21307,20117,20133,22495,32946,37057,30959,19968,22769,28322,36920,31282,33576,33419,39983,20801,21360,21693,21729,22240,23035,24341,39154,28139,32996,34093,3
8498,38512,38560,38907,21515,21491,23431,28879,32701,36802,38632,21359,40284,31418,19985,30
/**
 * Replacement code points for the 63 code points U+FF61 through U+FF9F
 *
 * The encoder indexes this list with (code point - 0xFF61) and substitutes
 * the result before looking up a JIS X 0208 pointer.
 */
const TABLE_KATAKANA = [12290,12300,12301,12289,12539,12530,12449,12451,12453,12455,12457,12515,12517,12519,12483,12540,12450,12452,12454,12456,12458,12459,12461,12463,12465,12467,12469,12471,12473,12475,12477,12479,12481,12484,12486,12488,12490,12491,12492,12493,12494,12495,12498,12501,12504,12507,12510,12511,12512,12513,12514,12516,12518,12520,12521,12522,12523,12524,12525,12527,12531,12443,12444];
const TABLE_POINTERS = [8470=>1193,8481=>1195,8544=>1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,8560=>8634,8635,8636,8637,8638,8639,8640,8641,8642,8643,8730=>162,8736=>153,8745=>126,125,166,8757=>165,8786=>159,8801=>158,8869=>154,12849=>1201,20008=>8284,20193=>8285,20220=>8286,20224=>8287,20227=>8288,20281=>8289,20310=>8290,20362=>8292,20370=>8291,20372=>8294,20378=>8293,20425=>8277,20429=>8295,20479=>8298,20510=>8299,20514=>8297,20544=>8296,20546=>8302,20550=>8300,20592=>8301,20628=>8303,20696=>8305,20724=>8304,20810=>8306,20836=>8307,20893=>8308,20926=>8309,20972=>8310,21013=>8311,21148=>8312,21158=>8313,21167=>8487,21184=>8314,21211=>8315,21248=>8316,21255=>8317,21284=>8318,21362=>8319,21395=>8320,21426=>8321,21469=>8322,21642=>8325,21660=>8324,21673=>8326,21759=>8327,21894=>8328,22361=>8329,22373=>8330,22444=>8331,22471=>8333,8332,22686=>8336,22706=>8337,22795=>8338,22867=>8339,22875=>8340,22877=>8341,22883=>8342,22948=>8343,22970=>8344,23382=>8345,23488=>8346,23512=>8348,23532=>8349,23582=>8350,23718=>8351,23738=>8352,23797=>8353,23847=>8354,23874=>8357,23891=>8355,23917=>8358,23992=>8359,8360,24016=>8361,24353=>8362,24372=>8363,24389=>8283,24423=>8364,24503=>8365,24542=>8366,24669=>8367,24709=>8368,24714=>8369,24789=>8371,24798=>8370,24818=>8373,24849=>8374,24864=>8372,24880=>8376,24887=>8375,24984=>8377,25107=>8378,25254=>8379,25589=>8380,25696=>8381,25757=>8382,25806=>8383,25934=>8384,26112=>8385,26121=>8388,26133=>8386,26142=>8390,26148=>8391,26158=>8389,26161=>8279,26171=>8387,26199=>8393,26201=>8394,26213=>8392,26227=>8396,26265=>8397,26272=>8398,26290=>8399,26303=>8400,26362=>8401,8282,26382=>8402,26470=>8404,26555=>8405,26560=>8407,26625=>8408,26692=>8409,26706=>8406,26824=>8280,26831=>8410,26984=>8412,27032=>8414,27106=>8415,27184=>8416,27206=>8418,27243=>8417,27251=>8419,27262=>8420,27362=>8421,27364=>8422,27606=>8423,27711=>8424,27740=>8425,27759=>8427,27782=>8426,27866=>8428,27908=>8429,28015=>8431,28039=>8430,28054=>8432,28076=>8433,28111=>84
34,28146=>8436,28152=>8435,28156=>8437,28199=>8440,28217=>8438,28220=>8441,28252=>8439,28351=>8442,28552=>8443,28597=>8444,28661=>8445,28677=>8446,28679=>8447,28712=>8448,28805=>8449,28843=>8450,28859=>8278,28932=>8452,28943=>8451,28998=>8454,8455,29020=>8453,29121=>8457,29182=>8458,29361=>8459,29374=>8460,29476=>8461,29559=>8463,29629=>8464,29641=>8465,29650=>8468,29654=>8466,29667=>8467,29685=>8470,29703=>8469,29734=>8471,29737=>8473,8472,29742=>8474,29794=>8475,29833=>8476,29855=>8477,29953=>8478,29999=>8347,30063=>8479,30338=>8480,30363=>8483,8481,30366=>8482,30374=>8484,30534=>8486,30753=>8488,30798=>8489,30820=>8490,30842=>8491,31024=>8492,31124=>8496,31131=>8498,31441=>8499,31463=>8500,31467=>8502,31646=>8503,32072=>8505,32092=>8506,32160=>8508,32183=>8507,32214=>8509,32338=>8510,32394=>8272,32583=>8511,32673=>8512,33537=>8514,33634=>8515,33663=>8516,33735=>8517,33782=>8518,33864=>8519,33972=>8520,34012=>8276,34131=>8521,34137=>8522,34155=>8523,34224=>8525,34823=>8528,35061=>8529,35100=>8273,35346=>8530,35383=>8531,35449=>8532,35495=>8533,35518=>8534,35551=>8535,35574=>8537,35667=>8538,35711=>8539,36080=>8540,36084=>8541,36114=>8542,36214=>8543,36559=>8545,36967=>8548,37086=>8549,37141=>8551,37159=>8552,37335=>8554,37338=>8553,37342=>8555,37348=>8558,8559,37357=>8556,8557,37382=>8560,37386=>8562,37392=>8561,37433=>8569,8563,37436=>8565,37440=>8564,37454=>8566,37457=>8568,37465=>8567,37479=>8570,37495=>8572,8573,37512=>8275,37543=>8571,37584=>8577,37587=>8581,37589=>8579,37591=>8575,37593=>8576,37600=>8580,37607=>8574,37625=>8281,37627=>8584,37631=>8587,37634=>8589,37661=>8588,8586,37665=>8583,37669=>8582,37704=>8274,37719=>8591,37744=>8590,37796=>8592,37830=>8593,37854=>8594,37880=>8595,37937=>8596,37957=>8597,37960=>8598,38290=>8599,38557=>8602,38575=>8603,38707=>8604,38715=>8605,38723=>8606,38733=>8607,38735=>8608,38737=>8609,38741=>8610,38999=>8611,39013=>8612,39207=>8615,39326=>8617,39502=>8618,39641=>8619,39644=>8620,39794=>8622,39797=>8621,39823=>8623,3
9857=>8624,39867=>8625,39936=>8626,40299=>8628,40304=>8627,40473=>8630,40657=>8631,63785=>8
// Decoder states used by nextCode(). The numeric order matters: every state
// below TRAIL_BYTE_STATE shares the same end-of-input, escape-byte and
// invalid-byte handling.
/** Bytes are returned unchanged as code points */
const ASCII_STATE = 0;
/** As ASCII, except 0x5C yields U+00A5 and 0x7E yields U+203E */
const ROMAN_STATE = 1;
/** Bytes 0x21 through 0x5F yield code points starting at U+FF61 */
const KATAKANA_STATE = 2;
/** Expecting the first byte of a two-byte sequence */
const LEAD_BYTE_STATE = 3;
/** Expecting the second byte of a two-byte sequence */
const TRAIL_BYTE_STATE = 4;
/** An escape byte (0x1B) was read; expecting 0x24 or 0x28 */
const ESCAPE_START_STATE = 5;
/** Expecting the final byte of a three-byte escape sequence */
const ESCAPE_STATE = 6;
/** The decoder mode in effect at the current position; changed via modeSet() */
protected $mode = self::ASCII_STATE;
/** Byte position recorded by the most recent modeSet() call; PHP_INT_MIN when no mode change has occurred */
protected $modeMark = \PHP_INT_MIN;
/** Stack of [modeMark, mode] pairs describing earlier modes, popped when seeking backwards */
protected $modeStack = [];
/** Number of bytes taken up by an escape sequence consumed at the very end of the string (3), or 0 when there is none */
protected $dirtyEOF = 0;
4 years ago
/** Lazily-built flipped copy of TABLE_JIS0208 (code point => pointer), shared by all calls to encode() */
protected static $pointerCache;
/**
 * Sets up the decoder for the given string
 *
 * The arguments are passed to the parent constructor unchanged; afterwards
 * the "dirtyEOF" property is added to the list of state properties.
 *
 * @param string $string The string to decode
 * @param bool $fatal Passed through to the parent constructor
 * @param bool $allowSurrogates Passed through to the parent constructor
 */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
    parent::__construct($string, $fatal, $allowSurrogates);
    // register the end-of-string marker alongside the parent's state properties
    $this->stateProps[] = "dirtyEOF";
}
/**
 * Decodes the next character and returns it as a UTF-8 string
 *
 * @return string The encoded character, or an empty string when nextCode() returns false
 */
public function nextChar(): string {
    $codePoint = $this->nextCode();
    return ($codePoint === false) ? "" : UTF8::encode($codePoint);
}
/**
 * Decodes the next character from the string and returns its code point
 *
 * Escape sequences which switch modes are consumed transparently; each one
 * is recorded via modeSet() so that seeking backwards can undo it.
 *
 * @return int|false The decoded code point, false at the end of the string, or the result of errDec() when the input is malformed
 */
public function nextCode() {
    $this->posChar++;
    $state = $this->mode;
    // first byte of a two-byte sequence, or the middle byte of an escape sequence
    $lead = 0;
    while (true) {
        // reading past the end of the string yields an empty string without raising a warning
        $b = $this->string[$this->posByte++] ?? "";
        $eof = ($b === "");
        // avoid calling ord() on an empty string; end of input is treated as byte 0
        $b = $eof ? 0 : ord($b);
        // unify handling of basic states where possible
        if ($state < self::TRAIL_BYTE_STATE) {
            if ($eof) {
                // nothing was consumed: undo both position increments
                $this->posByte--;
                $this->posChar--;
                return false;
            } elseif ($b === 0x1B) {
                $state = self::ESCAPE_START_STATE;
                continue;
            } elseif ($b > 0x7F || $b === 0x0E || $b === 0x0F) {
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
            }
        }
        if ($state === self::ASCII_STATE) {
            return $b;
        } elseif ($state === self::ROMAN_STATE) {
            if ($b === 0x5C) {
                return 0xA5;
            } elseif ($b === 0x7E) {
                return 0x203E;
            }
            return $b;
        } elseif ($state === self::KATAKANA_STATE) {
            if ($b >= 0x21 && $b <= 0x5F) {
                return 0xFF61 - 0x21 + $b;
            }
            return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
        } elseif ($state === self::LEAD_BYTE_STATE) {
            if ($b >= 0x21 && $b <= 0x7E) {
                $lead = $b;
                $state = self::TRAIL_BYTE_STATE;
                continue;
            }
            return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
        } elseif ($state === self::TRAIL_BYTE_STATE) {
            if ($eof || $b === 0x1B) {
                // step back so the escape byte (or end of input) is read again by the next call
                return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
            } elseif ($b >= 0x21 && $b <= 0x7E) {
                $pointer = (($lead - 0x21) * 94) + $b - 0x21;
                $codePoint = self::TABLE_JIS0208[$pointer] ?? null;
                if ($codePoint !== null) {
                    return $codePoint;
                }
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
            }
            return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
        } elseif ($state === self::ESCAPE_START_STATE) {
            if ($b === 0x24 || $b === 0x28) {
                $lead = $b;
                $state = self::ESCAPE_STATE;
                continue;
            }
            // not an escape sequence: only the escape byte itself is consumed
            return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
        } elseif ($state === self::ESCAPE_STATE) {
            if ($lead === 0x28 && $b === 0x42) {
                $newState = self::ASCII_STATE;
            } elseif ($lead === 0x28 && $b === 0x4A) {
                $newState = self::ROMAN_STATE;
            } elseif ($lead === 0x28 && $b === 0x49) {
                $newState = self::KATAKANA_STATE;
            } elseif ($lead === 0x24 && ($b === 0x40 || $b === 0x42)) {
                $newState = self::LEAD_BYTE_STATE;
            } else {
                // unrecognised sequence: only the escape byte itself is consumed
                $this->posByte -= 2;
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
            }
            if ($this->modeMark === ($this->posByte - 3)) {
                // this escape sequence immediately follows another one: switch modes, but report an error
                $this->modeSet($newState);
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 3);
            }
            $state = $this->modeSet($newState);
            $lead = 0;
            // if we're at the end of the string, mark the string as dirty
            if ($this->posByte === $this->lenByte) {
                $this->dirtyEOF = 3;
            }
            continue;
        }
    }
} // @codeCoverageIgnore
/**
 * Switches the decoder to a new mode, remembering the previous one
 *
 * The outgoing mode and its mark are pushed onto the mode stack, and the
 * current byte position becomes the new mode mark.
 *
 * @param int $mode The mode to switch to
 * @return int The mode which was just set
 */
protected function modeSet(int $mode): int {
    array_push($this->modeStack, [$this->modeMark, $this->mode]);
    $this->modeMark = $this->posByte;
    $this->mode = $mode;
    return $mode;
}
4 years ago
/**
 * Moves the pointer backwards through the string
 *
 * @param int $distance The number of characters to move back
 * @return int The number of characters which could not be skipped because the start of the string was reached
 */
protected function seekBack(int $distance): int {
    // an escape sequence consumed at the very end of the string must be undone first
    $atEnd = ($this->posByte === $this->lenByte);
    if ($this->dirtyEOF && $atEnd) {
        [$this->modeMark, $this->mode] = array_pop($this->modeStack);
        $this->posByte -= $this->dirtyEOF;
        $this->dirtyEOF = 0;
    }
    for (; $distance > 0 && $this->posByte > 0; $distance--) {
        $this->posChar--;
        if ($this->posByte !== $this->errMark) {
            // a well-formed character: two bytes in double-byte mode, one byte otherwise
            if ($this->mode === self::LEAD_BYTE_STATE) {
                $this->posByte -= 2;
            } else {
                $this->posByte -= 1;
            }
        } else {
            // the previous character was malformed;
            // if the position also marks a mode change, pop the mode stack
            if ($this->posByte === $this->modeMark) {
                [$this->modeMark, $this->mode] = array_pop($this->modeStack);
            }
            // move to the correct sync position, pop the error stack, and continue
            $this->posByte = $this->errSync;
            [$this->errMark, $this->errSync] = array_pop($this->errStack);
        }
        // check for a mode change that is not also an error character
        $atModeChange = ($this->posByte === $this->modeMark);
        if ($atModeChange && $this->posByte !== $this->errMark) {
            // skip over the three-byte escape sequence and restore the earlier mode
            $this->posByte -= 3;
            [$this->modeMark, $this->mode] = array_pop($this->modeStack);
        }
    }
    return $distance;
}
4 years ago
/**
 * Captures the decoder's state
 *
 * The parent's saved state is extended with a "modeCount" entry holding the
 * current depth of the mode stack.
 *
 * @return array The saved state
 */
protected function stateSave(): array {
    $state = parent::stateSave();
    $state['modeCount'] = count($this->modeStack);
    return $state;
}
4 years ago
/**
 * Restores a state previously produced by stateSave()
 *
 * Mode changes are unwound until the mode stack is no deeper than the saved
 * "modeCount"; the remaining entries are handed to the parent.
 *
 * @param array $state The state to restore
 */
protected function stateApply(array $state): void {
    $depth = $state['modeCount'];
    // the parent is not given the entry added by this class
    unset($state['modeCount']);
    while (count($this->modeStack) > $depth) {
        [$this->modeMark, $this->mode] = array_pop($this->modeStack);
    }
    parent::stateApply($state);
}
4 years ago
/**
 * Returns the decoder to the start of the string
 *
 * All mode-tracking properties are reset to their initial values before the
 * parent's own rewind logic runs.
 */
public function rewind(): void {
    $this->mode = self::ASCII_STATE;
    $this->modeMark = \PHP_INT_MIN;
    $this->modeStack = [];
    $this->dirtyEOF = 0;
    parent::rewind();
}
4 years ago
/**
 * Reports whether the end of the string has been reached
 *
 * Besides the byte position sitting at the end, the string also counts as
 * finished when exactly three bytes remain and peekCode() returns false for them.
 *
 * @return bool True when no further characters are available
 */
public function eof(): bool {
    if ($this->posByte === $this->lenByte) {
        return true;
    }
    if ($this->posByte !== ($this->lenByte - 3)) {
        return false;
    }
    return $this->peekCode() === false;
}
/**
 * Encodes a single code point as ISO-2022-JP bytes
 *
 * @param int|null $codePoint The code point to encode; null requests a return to ASCII mode, producing an escape sequence only if the encoder is not already in that mode
 * @param bool $fatal Whether to throw an exception for code points which cannot be encoded; when false a decimal character reference ("&#N;") is produced instead
 * @param mixed $mode The encoder mode, passed by reference and updated whenever an escape sequence is emitted; null is treated as ASCII mode
 * @return string The encoded bytes, preceded by an escape sequence when a mode switch is required
 * @throws EncoderException If the code point is outside the Unicode range, or cannot be encoded and $fatal is true
 */
public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string {
    $toAscii = "\x1B\x28\x42";
    $mode = $mode ?? self::MODE_ASCII;
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", Coder::E_INVALID_CODE_POINT);
    }
    // a null code point signals the end of input
    if ($codePoint === null) {
        if ($mode === self::MODE_ASCII) {
            return "";
        }
        $mode = self::MODE_ASCII;
        return $toAscii;
    }
    // shift-out, shift-in and escape can never be encoded
    if ($codePoint === 0xE || $codePoint === 0xF || $codePoint === 0x1B) {
        if ($fatal) {
            throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
        }
        $reference = "&#". 0xFFFD .";";
        if ($mode === self::MODE_JIS) {
            $mode = self::MODE_ASCII;
            return $toAscii.$reference;
        }
        return $reference;
    }
    if ($codePoint < 0x80) {
        // backslash and tilde require ASCII mode; other characters only need to leave double-byte mode
        $special = ($codePoint === 0x5C || $codePoint === 0x7E);
        $mustSwitch = $special ? ($mode !== self::MODE_ASCII) : ($mode === self::MODE_JIS);
        if ($mustSwitch) {
            $mode = self::MODE_ASCII;
            return $toAscii.chr($codePoint);
        }
        return chr($codePoint);
    }
    // the yen sign and overline take the place of backslash and tilde in Roman mode
    if ($codePoint === 0xA5 || $codePoint === 0x203E) {
        $byte = chr($codePoint === 0xA5 ? 0x5C : 0x7E);
        if ($mode === self::MODE_ROMAN) {
            return $byte;
        }
        $mode = self::MODE_ROMAN;
        return "\x1B\x28\x4A".$byte;
    }
    // everything else must be found in the double-byte table
    if ($codePoint >= 0xFF61 && $codePoint <= 0xFF9F) {
        $codePoint = self::TABLE_KATAKANA[$codePoint - 0xFF61];
    } elseif ($codePoint === 0x2212) {
        $codePoint = 0xFF0D;
    }
    $pointer = self::TABLE_POINTERS[$codePoint] ?? null;
    if ($pointer === null) {
        // build the reverse lookup table the first time it is needed
        if (self::$pointerCache === null) {
            self::$pointerCache = array_flip(self::TABLE_JIS0208);
        }
        $pointer = self::$pointerCache[$codePoint] ?? null;
    }
    if ($pointer !== null) {
        $bytes = chr(intdiv($pointer, 94) + 0x21).chr(($pointer % 94) + 0x21);
        if ($mode === self::MODE_JIS) {
            return $bytes;
        }
        $mode = self::MODE_JIS;
        return "\x1B\x24\x42".$bytes;
    }
    if ($fatal) {
        throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
    }
    $reference = "&#".$codePoint.";";
    if ($mode === self::MODE_JIS) {
        $mode = self::MODE_ASCII;
        return $toAscii.$reference;
    }
    return $reference;
}
}