Browse Source

Speed up encoding; make ISO 2022-JP more consistent

- The ISO 2022-JP encoder is now static as with all others; this is
slightly slower, but localises the encoder logic to its class
- Indexed encoders now cache pointer tables on first use, yielding
significant performance benefits
- Encoding multiple characters now uses fewer function calls, yielding
moderate performance benefits at the expense of slight complication
multi-byte
J. King 3 years ago
parent
commit
c234702cce
  1. 2
      CHANGELOG
  2. 4
      lib/Encoding/Big5.php
  3. 4
      lib/Encoding/EUCJP.php
  4. 4
      lib/Encoding/EUCKR.php
  5. 265
      lib/Encoding/Encoder.php
  6. 4
      lib/Encoding/GBCommon.php
  7. 73
      lib/Encoding/ISO2022JP.php
  8. 21
      lib/Encoding/ModalCoder.php
  9. 4
      lib/Encoding/ShiftJIS.php
  10. 109
      tests/cases/Encoding/TestISO2022JP.php
  11. 5
      tests/cases/Encoding/TestSingleByte.php
  12. 6
      tests/lib/CoderDecoderTest.php
  13. 3
      tools/test-iso2022jp.html

2
CHANGELOG

@ -18,6 +18,8 @@ Bug fixes:
Changes:
- Added new labels for UTF-8 and UTF-16
- Improved performance of Big5, gb18030, GBK, and EUC-KR encoders
- Corrected requirement of PHP 7.1
Version 0.7.1 (2020-10-05)
==========================

4
lib/Encoding/Big5.php

@ -27,6 +27,8 @@ class Big5 extends AbstractEncoding implements Coder, Decoder {
protected $bufferedCode = 0;
protected static $pointerCache;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
$this->stateProps[] = "bufferedCode";
parent::__construct($string, $fatal, $allowSurrogates);
@ -96,7 +98,7 @@ class Big5 extends AbstractEncoding implements Coder, Decoder {
} elseif ($codePoint < 128) {
return chr($codePoint);
} else {
$pointer = self::TABLE_POINTERS[$codePoint] ?? array_flip(self::TABLE_CODES_TW)[$codePoint] ?? null;
$pointer = self::TABLE_POINTERS[$codePoint] ?? (self::$pointerCache ?? (self::$pointerCache = array_flip(self::TABLE_CODES_TW)))[$codePoint] ?? null;
if (isset($pointer)) {
$lead = (int) ($pointer / 157) + 0x81;
$trail = $pointer % 157;

4
lib/Encoding/EUCJP.php

File diff suppressed because one or more lines are too long

4
lib/Encoding/EUCKR.php

File diff suppressed because one or more lines are too long

265
lib/Encoding/Encoder.php

@ -17,8 +17,6 @@ class Encoder {
protected $fatal = true;
protected $mode = self::MODE_ASCII;
protected $pointerCache;
public function __construct(string $label, bool $fatal = true) {
$l = Matcher::matchLabel($label);
if (!$l || !$l['encoder']) {
@ -30,24 +28,199 @@ class Encoder {
}
public function encode(iterable $codePoints): string {
$oldMode = $this->mode;
$this->mode = self::MODE_ASCII;
$out = "";
try {
foreach ($codePoints as $codePoint) {
$out .= $this->encodeChar($codePoint);
}
$out .= $this->finalize();
} finally {
$this->mode = $oldMode;
switch ($this->name) {
case "UTF-8":
foreach ($codePoints as $codePoint) {
$out .= UTF8::encode($codePoint, $this->fatal);
}
break;
case "Big5":
foreach ($codePoints as $codePoint) {
$out .= Big5::encode($codePoint, $this->fatal);
}
break;
case "EUC-JP":
foreach ($codePoints as $codePoint) {
$out .= EUCJP::encode($codePoint, $this->fatal);
}
break;
case "EUC-KR":
foreach ($codePoints as $codePoint) {
$out .= EUCKR::encode($codePoint, $this->fatal);
}
break;
case "gb18030":
foreach ($codePoints as $codePoint) {
$out .= GB18030::encode($codePoint, $this->fatal);
}
break;
case "GBK":
foreach ($codePoints as $codePoint) {
$out .= GBK::encode($codePoint, $this->fatal);
}
break;
case "IBM866":
foreach ($codePoints as $codePoint) {
$out .= IBM866::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-2":
foreach ($codePoints as $codePoint) {
$out .= ISO88592::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-3":
foreach ($codePoints as $codePoint) {
$out .= ISO88593::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-4":
foreach ($codePoints as $codePoint) {
$out .= ISO88594::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-5":
foreach ($codePoints as $codePoint) {
$out .= ISO88595::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-6":
foreach ($codePoints as $codePoint) {
$out .= ISO88596::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-7":
foreach ($codePoints as $codePoint) {
$out .= ISO88597::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-8":
foreach ($codePoints as $codePoint) {
$out .= ISO88598::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-8-I":
foreach ($codePoints as $codePoint) {
$out .= ISO88598I::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-10":
foreach ($codePoints as $codePoint) {
$out .= ISO885910::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-13":
foreach ($codePoints as $codePoint) {
$out .= ISO885913::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-14":
foreach ($codePoints as $codePoint) {
$out .= ISO885914::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-15":
foreach ($codePoints as $codePoint) {
$out .= ISO885915::encode($codePoint, $this->fatal);
}
break;
case "ISO-8859-16":
foreach ($codePoints as $codePoint) {
$out .= ISO885916::encode($codePoint, $this->fatal);
}
break;
case "KOI8-R":
foreach ($codePoints as $codePoint) {
$out .= KOI8R::encode($codePoint, $this->fatal);
}
break;
case "KOI8-U":
foreach ($codePoints as $codePoint) {
$out .= KOI8U::encode($codePoint, $this->fatal);
}
break;
case "macintosh":
foreach ($codePoints as $codePoint) {
$out .= Macintosh::encode($codePoint, $this->fatal);
}
break;
case "Shift_JIS":
foreach ($codePoints as $codePoint) {
$out .= ShiftJIS::encode($codePoint, $this->fatal);
}
break;
case "windows-1250":
foreach ($codePoints as $codePoint) {
$out .= Windows1250::encode($codePoint, $this->fatal);
}
break;
case "windows-1251":
foreach ($codePoints as $codePoint) {
$out .= Windows1251::encode($codePoint, $this->fatal);
}
break;
case "windows-1252":
foreach ($codePoints as $codePoint) {
$out .= Windows1252::encode($codePoint, $this->fatal);
}
break;
case "windows-1253":
foreach ($codePoints as $codePoint) {
$out .= Windows1253::encode($codePoint, $this->fatal);
}
break;
case "windows-1254":
foreach ($codePoints as $codePoint) {
$out .= Windows1254::encode($codePoint, $this->fatal);
}
break;
case "windows-1255":
foreach ($codePoints as $codePoint) {
$out .= Windows1255::encode($codePoint, $this->fatal);
}
break;
case "windows-1256":
foreach ($codePoints as $codePoint) {
$out .= Windows1256::encode($codePoint, $this->fatal);
}
break;
case "windows-1257":
foreach ($codePoints as $codePoint) {
$out .= Windows1257::encode($codePoint, $this->fatal);
}
break;
case "windows-1258":
foreach ($codePoints as $codePoint) {
$out .= Windows1258::encode($codePoint, $this->fatal);
}
break;
case "windows-874":
foreach ($codePoints as $codePoint) {
$out .= Windows874::encode($codePoint, $this->fatal);
}
break;
case "x-mac-cyrillic":
foreach ($codePoints as $codePoint) {
$out .= XMacCyrillic::encode($codePoint, $this->fatal);
}
break;
case "x-user-defined":
foreach ($codePoints as $codePoint) {
$out .= XUserDefined::encode($codePoint, $this->fatal);
}
break;
case "ISO-2022-JP":
foreach ($codePoints as $codePoint) {
$out .= ISO2022JP::encode($codePoint, $this->fatal, $mode);
}
$out .= ISO2022JP::encode(null, $this->fatal, $mode);
break;
}
return $out;
}
public function encodeChar(int $codePoint): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", Coder::E_INVALID_CODE_POINT);
}
switch ($this->name) {
case "UTF-8":
return UTF8::encode($codePoint, $this->fatal);
@ -122,71 +295,11 @@ class Encoder {
case "x-user-defined":
return XUserDefined::encode($codePoint, $this->fatal);
case "ISO-2022-JP":
if ($codePoint === 0xE || $codePoint === 0xF || $codePoint === 0x1B) {
if (!$this->fatal) {
$out = "&#".(string) 0xFFFD.";";
if ($this->mode === self::MODE_JIS) {
$this->mode = self::MODE_ASCII;
return "\x1B\x28\x42". $out;
}
return $out;
} else {
throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
}
} elseif ($codePoint === 0x5C || $codePoint === 0x7E) {
if ($this->mode !== self::MODE_ASCII) {
$this->mode = self::MODE_ASCII;
return "\x1B\x28\x42".chr($codePoint);
}
return chr($codePoint);
} elseif ($codePoint < 0x80) {
if ($this->mode === self::MODE_JIS) {
$this->mode = self::MODE_ASCII;
return "\x1B\x28\x42".chr($codePoint);
}
return chr($codePoint);
} elseif ($codePoint === 0xA5 || $codePoint === 0x203E) {
$ord = $codePoint === 0xA5 ? 0x5C : 0x7E;
if ($this->mode !== self::MODE_ROMAN) {
$this->mode = self::MODE_ROMAN;
return "\x1B\x28\x4A".chr($ord);
}
return chr($ord);
} else {
if ($codePoint >= 0xFF61 && $codePoint <= 0xFF9F) {
$codePoint = ISO2022JP::TABLE_KATAKANA[$codePoint - 0xFF61];
} elseif ($codePoint === 0x2212) {
$codePoint = 0xFF0D;
}
$pointer = ISO2022JP::TABLE_POINTERS[$codePoint] ?? ($this->pointerCache ?? ($this->pointerCache = array_flip(ISO2022JP::TABLE_JIS0208)))[$codePoint] ?? null;
if (!is_null($pointer)) {
$lead = chr((int) ($pointer / 94) + 0x21);
$trail = chr(($pointer % 94) + 0x21);
if ($this->mode !== self::MODE_JIS) {
$this->mode = self::MODE_JIS;
return "\x1B\x24\x42".$lead.$trail;
}
return $lead.$trail;
}
if (!$this->fatal) {
$out = "&#".(string) ($codePoint).";";
if ($this->mode === self::MODE_JIS) {
$this->mode = self::MODE_ASCII;
return "\x1B\x28\x42". $out;
}
return $out;
} else {
throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
}
}
return ISO2022JP::encode($codePoint, $this->fatal, $this->mode);
}
} // @codeCoverageIgnore
public function finalize(): string {
if ($this->mode !== self::MODE_ASCII) {
$this->mode = self::MODE_ASCII;
return "\x1B\x28\x42";
}
return "";
return ISO2022JP::encode(null, $this->fatal, $this->mode);
}
}

4
lib/Encoding/GBCommon.php

@ -12,6 +12,8 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
protected static $pointerCache;
public function nextCode() {
$first = 0;
$second = 0;
@ -103,7 +105,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
} elseif (static::GBK && $codePoint == 0x20AC) {
return "\x80";
} else {
$pointer = self::TABLE_POINTERS[$codePoint] ?? array_flip(self::TABLE_CODES)[$codePoint] ?? null;
$pointer = self::TABLE_POINTERS[$codePoint] ?? (self::$pointerCache ?? (self::$pointerCache = array_flip(self::TABLE_CODES)))[$codePoint] ?? null;
if (isset($pointer)) {
$lead = (int) ($pointer / 190) + 0x81;
$trail = $pointer % 190;

73
lib/Encoding/ISO2022JP.php

@ -6,7 +6,7 @@
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class ISO2022JP extends AbstractEncoding implements Decoder {
class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
const NAME = "ISO-2022-JP";
const LABELS = [
"csiso2022jp",
@ -28,6 +28,8 @@ class ISO2022JP extends AbstractEncoding implements Decoder {
protected $modeStack = [];
protected $dirtyEOF = 0;
protected static $pointerCache;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
parent::__construct($string, $fatal, $allowSurrogates);
$this->stateProps[] = "dirtyEOF";
@ -197,4 +199,73 @@ class ISO2022JP extends AbstractEncoding implements Decoder {
public function eof(): bool {
return $this->posByte === $this->lenByte || ($this->posByte === ($this->lenByte - 3) && $this->peekCode() === false);
}
/** Returns the ISO-2022-JP encoding of $codePoint as a byte string
 *
 * The encoding is stateful: $mode holds the current shift state (one of the
 * MODE_* constants) and is updated through the reference whenever an escape
 * sequence is emitted. Callers should pass the same variable for every
 * character of a string, then pass null as $codePoint to signal end of input
 * so that the output is returned to the ASCII state.
 *
 * If $codePoint is less than 0 or greater than 0x10FFFF an exception is thrown.
 *
 * If $fatal is true, an exception is thrown when the code point cannot be
 * encoded; otherwise an HTML character reference is substituted.
 */
public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string {
    // A mode which has never been set denotes the initial ASCII state
    $mode = $mode ?? self::MODE_ASCII;

    // End of input: return to ASCII if we are anywhere else
    if ($codePoint === null) {
        if ($mode === self::MODE_ASCII) {
            return "";
        }
        $mode = self::MODE_ASCII;
        return "\x1B\x28\x42";
    }

    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", Coder::E_INVALID_CODE_POINT);
    }

    if ($codePoint < 0x80) {
        // SO, SI and ESC can never be encoded; they are reported as U+FFFD
        if ($codePoint === 0xE || $codePoint === 0xF || $codePoint === 0x1B) {
            if ($fatal) {
                throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
            }
            $reference = "&#".(string) 0xFFFD.";";
            if ($mode !== self::MODE_JIS) {
                return $reference;
            }
            // the character reference is ASCII text, so leave the two-byte state first
            $mode = self::MODE_ASCII;
            return "\x1B\x28\x42". $reference;
        }
        // Bytes 0x5C and 0x7E stand for other characters in the Roman state, so a
        // backslash or tilde needs true ASCII; any other ASCII character only
        // requires that we not be in the two-byte JIS state
        $isBackslashOrTilde = ($codePoint === 0x5C || $codePoint === 0x7E);
        $mustShift = $isBackslashOrTilde ? ($mode !== self::MODE_ASCII) : ($mode === self::MODE_JIS);
        if (!$mustShift) {
            return chr($codePoint);
        }
        $mode = self::MODE_ASCII;
        return "\x1B\x28\x42".chr($codePoint);
    }

    // Yen sign and overline occupy the backslash and tilde positions of the Roman state
    if ($codePoint === 0xA5 || $codePoint === 0x203E) {
        $byte = chr($codePoint === 0xA5 ? 0x5C : 0x7E);
        if ($mode === self::MODE_ROMAN) {
            return $byte;
        }
        $mode = self::MODE_ROMAN;
        return "\x1B\x28\x4A".$byte;
    }

    // Everything else is looked up in the JIS X 0208 table, after
    // substituting the code points which have stand-ins there
    if ($codePoint === 0x2212) {
        $codePoint = 0xFF0D;
    } elseif ($codePoint >= 0xFF61 && $codePoint <= 0xFF9F) {
        $codePoint = self::TABLE_KATAKANA[$codePoint - 0xFF61];
    }
    $pointer = self::TABLE_POINTERS[$codePoint] ?? null;
    if ($pointer === null) {
        // build the reverse lookup table only once, the first time it is needed
        if (self::$pointerCache === null) {
            self::$pointerCache = array_flip(self::TABLE_JIS0208);
        }
        $pointer = self::$pointerCache[$codePoint] ?? null;
    }

    if ($pointer !== null) {
        $bytes = chr((int) ($pointer / 94) + 0x21).chr(($pointer % 94) + 0x21);
        if ($mode === self::MODE_JIS) {
            return $bytes;
        }
        $mode = self::MODE_JIS;
        return "\x1B\x24\x42".$bytes;
    }

    // No mapping exists for the code point
    if ($fatal) {
        throw new EncoderException("Code point $codePoint not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
    }
    $reference = "&#".(string) ($codePoint).";";
    if ($mode !== self::MODE_JIS) {
        return $reference;
    }
    // the character reference is ASCII text, so leave the two-byte state first
    $mode = self::MODE_ASCII;
    return "\x1B\x28\x42". $reference;
}
}

21
lib/Encoding/ModalCoder.php

@ -0,0 +1,21 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Contract for encoders whose output depends on a shift state ("mode") which persists from one character to the next
*
* The ISO2022JP class implements this interface; the notes on the constants below describe how that encoder uses them
*/
interface ModalCoder {
/** Initial state; the ISO-2022-JP encoder returns to it by emitting the bytes 1B 28 42 */
const MODE_ASCII = 0;
/** State entered by the ISO-2022-JP encoder with the bytes 1B 28 4A, in which U+00A5 and U+203E are written as the single bytes 5C and 7E */
const MODE_ROMAN = 1;
/** State entered by the ISO-2022-JP encoder with the bytes 1B 24 42, in which each character is written as two bytes */
const MODE_JIS = 2;
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown; if $codePoint is null this signals end-of-file
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*
* $mode is passed by reference and holds the current state as one of the MODE_* constants. The ISO-2022-JP implementation reads it on entry and updates it whenever it emits an escape sequence, so callers should pass the same variable for each character of a string in turn, and finish by passing null as $codePoint so that any bytes needed to return to the ASCII state are produced
*/
public static function encode(?int $codePoint, bool $fatal = true, &$mode = self::MODE_ASCII): string;
}

4
lib/Encoding/ShiftJIS.php

@ -22,6 +22,8 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
const TABLE_CODES_EXTRA = [8272=>32394,35100,37704,37512,34012,20425,28859,26161,26824,37625,26363,24389,20008,20193,20220,20224,20227,20281,20310,20370,20362,20378,20372,20429,20544,20514,20479,20510,20550,20592,20546,20628,20724,20696,20810,20836,20893,20926,20972,21013,21148,21158,21184,21211,21248,21255,21284,21362,21395,21426,21469,64014,21660,21642,21673,21759,21894,22361,22373,22444,22472,22471,64015,64016,22686,22706,22795,22867,22875,22877,22883,22948,22970,23382,23488,29999,23512,23532,23582,23718,23738,23797,23847,23891,64017,23874,23917,23992,23993,24016,24353,24372,24423,24503,24542,24669,24709,24714,24798,24789,24864,24818,24849,24887,24880,24984,25107,25254,25589,25696,25757,25806,25934,26112,26133,26171,26121,26158,26142,26148,26213,26199,26201,64018,26227,26265,26272,26290,26303,26362,26382,63785,26470,26555,26706,26560,26625,26692,26831,64019,26984,64020,27032,27106,27184,27243,27206,27251,27262,27362,27364,27606,27711,27740,27782,27759,27866,27908,28039,28015,28054,28076,28111,28152,28146,28156,28217,28252,28199,28220,28351,28552,28597,28661,28677,28679,28712,28805,28843,28943,28932,29020,28998,28999,64021,29121,29182,29361,29374,29476,64022,29559,29629,29641,29654,29667,29650,29703,29685,29734,29738,29737,29742,29794,29833,29855,29953,30063,30338,30364,30366,30363,30374,64023,30534,21167,30753,30798,30820,30842,31024,64024,64025,64026,31124,64027,31131,31441,31463,64028,31467,31646,64029,32072,32092,32183,32160,32214,32338,32583,32673,64030,33537,33634,33663,33735,33782,33864,33972,34131,34137,34155,64031,34224,64032,64033,34823,35061,35346,35383,35449,35495,35518,35551,64034,35574,35667,35711,36080,36084,36114,36214,64035,36559,64036,64037,36967,37086,64038,37141,37159,37338,37335,37342,37357,37358,37348,37349,37382,37392,37386,37434,37440,37436,37454,37465,37457,37433,37479,37543,37495,37496,37607,37591,37593,37584,64039,37589,37600,37587,37669,37665,37627,64040,37662,37631,37661,37634,37744,37719,37796,37830,37854,37880,37937,37957,37960,38290
,63964,64041,38557,38575,38707,38715,38723,38733,38735,38737,38741,38999,39013,64042,64043,39207,64044,39326,39502,39641,39644,39797,39794,39823,39857,39867,39936,40304,40299,64045,40473,40657,8634=>8560,8561,8562,8563,8564,8565,8566,8567,8568,8569,65506,65508,65287,65282];
const TABLE_POINTERS = [8470=>1193,8481=>1195,8544=>1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,8730=>162,8736=>153,8745=>126,125,166,8757=>165,8786=>159,8801=>158,8869=>154,12849=>1201,65506=>137];
protected static $pointerCache;
public function nextCode() {
if (($b = @$this->string[$this->posByte++]) === "") {
// clean EOF
@ -93,7 +95,7 @@ class ShiftJIS extends AbstractEncoding implements Coder, Decoder {
$codePoint = 0xFF0D;
// no break;
default:
$pointer = self::TABLE_POINTERS[$codePoint] ?? array_flip(self::TABLE_CODES)[$codePoint] ?? null;
$pointer = self::TABLE_POINTERS[$codePoint] ?? (self::$pointerCache ?? (self::$pointerCache = array_flip(self::TABLE_CODES)))[$codePoint] ?? null;
if (isset($pointer)) {
$lead = (int) ($pointer / 188);
$leadOffset = ($lead < 0x1F) ? 0x81 : 0xC1;

109
tests/cases/Encoding/TestISO2022JP.php

@ -34,52 +34,54 @@ class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
public function provideCodePoints() {
return [
'U+0020 (HTML)' => [false, [0x20], "20"],
'U+0020 (fatal)' => [true, [0x20], "20"],
'U+005C (HTML)' => [false, [0x5C], "5C"],
'U+005C (fatal)' => [true, [0x5C], "5C"],
'U+007E (HTML)' => [false, [0x7E], "7E"],
'U+007E (fatal)' => [true, [0x7E], "7E"],
'U+00A5 (HTML)' => [false, [0xA5], "1B 28 4A 5C 1B 28 42"],
'U+00A5 (fatal)' => [true, [0xA5], "1B 28 4A 5C 1B 28 42"],
'U+203E (HTML)' => [false, [0x203E], "1B 28 4A 7E 1B 28 42"],
'U+203E (fatal)' => [true, [0x203E], "1B 28 4A 7E 1B 28 42"],
'U+FF61 (HTML)' => [false, [0xFF61], "1B 24 42 21 23 1B 28 42"],
'U+FF61 (fatal)' => [true, [0xFF61], "1B 24 42 21 23 1B 28 42"],
'U+FF9F (HTML)' => [false, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
'U+FF9F (fatal)' => [true, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
'U+2212 (HTML)' => [false, [0x2212], "1B 24 42 21 5D 1B 28 42"],
'U+2212 (fatal)' => [true, [0x2212], "1B 24 42 21 5D 1B 28 42"],
'U+2116 (HTML)' => [false, [0x2116], "1B 24 42 2D 62 1B 28 42"],
'U+2116 (fatal)' => [true, [0x2116], "1B 24 42 2D 62 1B 28 42"],
'U+FFE2 (HTML)' => [false, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
'U+FFE2 (fatal)' => [true, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
'U+00C6 (HTML)' => [false, [0xC6], "26 23 31 39 38 3B"],
'U+00C6 (fatal)' => [true, [0xC6], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'U+FFFD (HTML)' => [false, [0xFFFD], "26 23 36 35 35 33 33 3B"],
'U+FFFD (fatal)' => [true, [0xFFFD], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'Roman (HTML)' => [false, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
'Roman (fatal)' => [true, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
'Roman to ASCII (HTML)' => [false, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
'Roman to ASCII (fatal)' => [true, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
'Roman to error (HTML)' => [false, [0xA5, 0x80], "1B 28 4A 5C 26 23 31 32 38 3B 1B 28 42"],
'Roman to error (fatal)' => [true, [0xA5, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'JIS (HTML)' => [false, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
'JIS (fatal)' => [true, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
'JIS to Roman (HTML)' => [false, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
'JIS to Roman (fatal)' => [true, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
'JIS to ASCII 1 (HTML)' => [false, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
'JIS to ASCII 1 (fatal)' => [true, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
'JIS to ASCII 2 (HTML)' => [false, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
'JIS to ASCII 2 (fatal)' => [true, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
'JIS to error (HTML)' => [false, [0x2116, 0x80], "1B 24 42 2D 62 1B 28 42 26 23 31 32 38 3B"],
'JIS to error (fatal)' => [true, [0x2116, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'Escape characters (HTML)' => [false, [0x1B, 0xE, 0xF], "26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B"],
'Escape characters (fatal)' => [true, [0x1B, 0xE, 0xF], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'-1 (HTML)' => [false, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'0x110000 (fatal)' => [true, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'U+0020 (HTML)' => [false, [0x20], "20"],
'U+0020 (fatal)' => [true, [0x20], "20"],
'U+005C (HTML)' => [false, [0x5C], "5C"],
'U+005C (fatal)' => [true, [0x5C], "5C"],
'U+007E (HTML)' => [false, [0x7E], "7E"],
'U+007E (fatal)' => [true, [0x7E], "7E"],
'U+00A5 (HTML)' => [false, [0xA5], "1B 28 4A 5C 1B 28 42"],
'U+00A5 (fatal)' => [true, [0xA5], "1B 28 4A 5C 1B 28 42"],
'U+203E (HTML)' => [false, [0x203E], "1B 28 4A 7E 1B 28 42"],
'U+203E (fatal)' => [true, [0x203E], "1B 28 4A 7E 1B 28 42"],
'U+FF61 (HTML)' => [false, [0xFF61], "1B 24 42 21 23 1B 28 42"],
'U+FF61 (fatal)' => [true, [0xFF61], "1B 24 42 21 23 1B 28 42"],
'U+FF9F (HTML)' => [false, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
'U+FF9F (fatal)' => [true, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
'U+2212 (HTML)' => [false, [0x2212], "1B 24 42 21 5D 1B 28 42"],
'U+2212 (fatal)' => [true, [0x2212], "1B 24 42 21 5D 1B 28 42"],
'U+2116 (HTML)' => [false, [0x2116], "1B 24 42 2D 62 1B 28 42"],
'U+2116 (fatal)' => [true, [0x2116], "1B 24 42 2D 62 1B 28 42"],
'U+FFE2 (HTML)' => [false, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
'U+FFE2 (fatal)' => [true, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
'U+00C6 (HTML)' => [false, [0xC6], "26 23 31 39 38 3B"],
'U+00C6 (fatal)' => [true, [0xC6], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'U+FFFD (HTML)' => [false, [0xFFFD], "26 23 36 35 35 33 33 3B"],
'U+FFFD (fatal)' => [true, [0xFFFD], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'Roman (HTML)' => [false, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
'Roman (fatal)' => [true, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
'Roman to ASCII (HTML)' => [false, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
'Roman to ASCII (fatal)' => [true, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
'Roman to error (HTML)' => [false, [0xA5, 0x80], "1B 28 4A 5C 26 23 31 32 38 3B 1B 28 42"],
'Roman to error (fatal)' => [true, [0xA5, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'JIS (HTML)' => [false, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
'JIS (fatal)' => [true, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
'JIS to Roman (HTML)' => [false, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
'JIS to Roman (fatal)' => [true, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
'JIS to ASCII 1 (HTML)' => [false, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
'JIS to ASCII 1 (fatal)' => [true, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
'JIS to ASCII 2 (HTML)' => [false, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
'JIS to ASCII 2 (fatal)' => [true, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
'JIS to error 1 (HTML)' => [false, [0x2116, 0x80], "1B 24 42 2D 62 1B 28 42 26 23 31 32 38 3B"],
'JIS to error 1 (fatal)' => [true, [0x2116, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'JIS to error 2 (HTML)' => [false, [0x2116, 0x1B], "1B 24 42 2D 62 1B 28 42 26 23 36 35 35 33 33 3B"],
'JIS to error 2 (fatal)' => [true, [0x2116, 0x1B], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'Escape characters (HTML)' => [false, [0x1B, 0xE, 0xF], "26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B"],
'Escape characters (fatal)' => [true, [0x1B, 0xE, 0xF], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
'-1 (HTML)' => [false, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
'0x110000 (fatal)' => [true, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
];
}
@ -116,10 +118,21 @@ class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* @dataProvider provideCodePoints
* @coversNothing
* @covers MensBeam\Intl\Encoding\ISO2022JP::encode
*/
public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
return parent::testEncodeCodePointsStatically($fatal, $input, $exp);
$out = "";
if ($exp instanceof \Throwable) {
$this->expectException(get_class($exp));
$this->expectExceptionCode($exp->getCode());
} else {
$exp = strtolower(str_replace(" ", "", $exp));
}
foreach ($input as $char) {
$out .= ISO2022JP::encode($char, $fatal, $mode);
}
$out .= ISO2022JP::encode(null, $fatal, $mode);
$this->assertSame($exp, bin2hex($out));
}
/**

5
tests/cases/Encoding/TestSingleByte.php

@ -88,9 +88,8 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::encode
*/
public function testEncodeCodePoints(bool $fatal, $input, $exp, string $class = SingleByteEncoding::class) {
$e = new Encoder($class::NAME, $fatal);
$out = $e->encode($input);
$this->assertSame(bin2hex($exp), bin2hex($out));
$this->testedClass = $class;
return parent::testEncodeCodePoints($fatal, $input, bin2hex($exp));
}
/**

6
tests/lib/CoderDecoderTest.php

@ -22,6 +22,12 @@ abstract class CoderDecoderTest extends DecoderTest {
}
$out = $e->encode($input);
$this->assertSame($exp, bin2hex($out));
$out = "";
foreach ($input as $c) {
$out .= $e->encodeChar($c);
}
$out .= $e->finalize();
$this->assertSame($exp, bin2hex($out));
}
public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {

3
tools/test-iso2022jp.html

@ -36,7 +36,8 @@ var sampleCharacters = {
'JIS to Roman': [0x2116, 0xA5],
'JIS to ASCII 1': [0x2116, 0x20],
'JIS to ASCII 2': [0x2116, 0x5C],
'JIS to error': [0x2116, 0x80],
'JIS to error 1': [0x2116, 0x80],
'JIS to error 2': [0x2116, 0x1B], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158
'Escape characters': [0x1B, 0xE, 0xF], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158
'-1': [-1],
'0x110000': [0x110000],

Loading…
Cancel
Save