From 7de6d7a6fc5eddbd9ccbb385516b1fd3f8e1d505 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 11 Aug 2018 14:39:58 -0400 Subject: [PATCH] Implement ISO-8859-6 single-byte encoding Other single-byte encodings to follow --- lib/Encoding/Encoding.php | 1 + lib/Encoding/GenericEncoding.php | 8 +- lib/Encoding/ISO88596.php | 31 ++ lib/Encoding/SingleByteEncoding.php | 107 ++++++ tests/cases/Encoding/TestSingleByte.php | 459 ++++++++++++++++++++++++ tests/phpunit.xml | 3 +- tools/mkindex.php | 54 +++ 7 files changed, 658 insertions(+), 5 deletions(-) create mode 100644 lib/Encoding/ISO88596.php create mode 100644 lib/Encoding/SingleByteEncoding.php create mode 100644 tests/cases/Encoding/TestSingleByte.php create mode 100644 tools/mkindex.php diff --git a/lib/Encoding/Encoding.php b/lib/Encoding/Encoding.php index 15a9616..8cf7561 100644 --- a/lib/Encoding/Encoding.php +++ b/lib/Encoding/Encoding.php @@ -16,6 +16,7 @@ interface Encoding { const E_INVALID_CODE_POINT = 1; const E_INVALID_BYTE = 2; const E_INVALID_MODE = 3; + const E_UNAVAILABLE_CODE_POINT = 4; /** Constructs a new decoder * diff --git a/lib/Encoding/GenericEncoding.php b/lib/Encoding/GenericEncoding.php index 3bfc31b..b184552 100644 --- a/lib/Encoding/GenericEncoding.php +++ b/lib/Encoding/GenericEncoding.php @@ -124,15 +124,15 @@ trait GenericEncoding { case self::MODE_REPLACE: // standard "replace" mode return 0xFFFD; - case self::MODE_HTML: // @codeCoverageIgnore + case self::MODE_HTML: // the "html" replacement mode; not applicable to Unicode transformation formats - return "&#".(string) $data.";"; // @codeCoverageIgnore + return "&#".(string) $data.";"; case self::MODE_FATAL_DEC: // fatal replacement mode for decoders throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); - case self::MODE_FATAL_ENC: // @codeCoverageIgnore + case self::MODE_FATAL_ENC: // fatal replacement mode for decoders; not applicable to Unicode transformation formats - throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore + throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); default: // indicative of internal bug; should never be triggered throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore diff --git a/lib/Encoding/ISO88596.php b/lib/Encoding/ISO88596.php new file mode 100644 index 0000000..bc4d332 --- /dev/null +++ b/lib/Encoding/ISO88596.php @@ -0,0 +1,31 @@ +"\u{a4}",44=>"\u{60c}","\u{ad}",59=>"\u{61b}",63=>"\u{61f}",65=>"\u{621}","\u{622}","\u{623}","\u{624}","\u{625}","\u{626}","\u{627}","\u{628}","\u{629}","\u{62a}","\u{62b}","\u{62c}","\u{62d}","\u{62e}","\u{62f}","\u{630}","\u{631}","\u{632}","\u{633}","\u{634}","\u{635}","\u{636}","\u{637}","\u{638}","\u{639}","\u{63a}",96=>"\u{640}","\u{641}","\u{642}","\u{643}","\u{644}","\u{645}","\u{646}","\u{647}","\u{648}","\u{649}","\u{64a}","\u{64b}","\u{64c}","\u{64d}","\u{64e}","\u{64f}","\u{650}","\u{651}","\u{652}"]; + const TABLE_DEC_CODE = [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,36=>164,44=>1548,173,59=>1563,63=>1567,65=>1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591,1592,1593,1594,96=>1600,1601,1602,1603,1604,1605,1606,1607,1608,1609,1610,1611,1612,1613,1614,1615,1616,1617,1618]; + const TABLE_ENC = [128=>"\x80","\x81","\x82","\x83","\x84","\x85","\x86","\x87","\x88","\x89","\x8A","\x8B","\x8C","\x8D","\x8E","\x8F","\x90","\x91","\x92","\x93","\x94","\x95","\x96","\x97","\x98","\x99","\x9A","\x9B","\x9C","\x9D","\x9E","\x9F","\xA0",164=>"\xA4",173=>"\xAD",1548=>"\xAC",1563=>"\xBB",1567=>"\xBF",1569=>"\xC1","\xC2","\xC3","\xC4","\xC5","\xC6","\xC7","\xC8","\xC9","\xCA","\xCB","\xCC","\xCD","\xCE","\xCF","\xD0","\xD1","\xD2","\xD3","\xD4","\xD5","\xD6","\xD7","\xD8","\xD9","\xDA",1600=>"\xE0","\xE1","\xE2","\xE3","\xE4","\xE5","\xE6","\xE7","\xE8","\xE9","\xEA","\xEB","\xEC","\xED","\xEE","\xEF","\xF0","\xF1","\xF2"]; +} diff --git a/lib/Encoding/SingleByteEncoding.php b/lib/Encoding/SingleByteEncoding.php new file mode 100644 index 0000000..ee7593c --- /dev/null +++ b/lib/Encoding/SingleByteEncoding.php @@ -0,0 +1,107 @@ +string[$this->posChar]; + if ($b === "") { + return ""; + } + $this->posChar++; + $p = ord($b); + if ($p < 0x80) { + // if the byte is an ASCII character or end of input, simply return it + return $b; + } else { + return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode(static::err($this->errMode, [$this->posChar, $this->posChar])); + } + } + + /** Decodes the next character from the string and returns its code point number + * + * If the end of the string has been reached, false is returned + * + * @return int|bool + */ + public function nextCode() { + // get the byte at the current position + $b = @$this->string[$this->posChar]; + if ($b === "") { + return false; + } + $this->posChar++; + $p = ord($b); + if ($p < 0x80) { + // if the byte is an ASCII character or end of input, simply return it + return $p; + } else { + return static::TABLE_DEC_CODE[$p - 128] ?? static::err($this->errMode, [$this->posChar, $this->posChar]); + } + } + + /** Returns the encoding of $codePoint as a byte string + * + * If $codePoint is less than 0 or greater than 1114111, an exception is thrown + * + * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted + */ + public static function encode(int $codePoint, bool $fatal = true): string { + if ($codePoint < 0 || $codePoint > 0x10FFFF) { + throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); + } elseif ($codePoint < 128) { + return chr($codePoint); + } else { + return static::TABLE_ENC[$codePoint] ?? static::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + } + } + + /** Advance $distance characters through the string + * + * If $distance is negative, the operation will be performed in reverse + * + * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned + */ + public function seek(int $distance): int { + if ($distance > 0) { + while ($this->posChar < $this->lenByte && $distance > 0) { + $this->nextCode(); + $distance--; + } + return $distance; + } elseif ($distance < 0) { + $distance = abs($distance); + while ($this->posChar > 0 && $distance > 0) { + $this->posChar--; + $distance--; + } + return $distance; + } else { + return 0; + } + } + + /** Returns the current byte position of the decoder */ + public function posByte(): int { + return $this->posChar; + } + + /** Calculates the length of the string in code points + * + * Note that this may involve processing to the end of the string + */ + public function len(): int { + return $this->lenByte; + } +} diff --git a/tests/cases/Encoding/TestSingleByte.php b/tests/cases/Encoding/TestSingleByte.php new file mode 100644 index 0000000..e14fcb4 --- /dev/null +++ b/tests/cases/Encoding/TestSingleByte.php @@ -0,0 +1,459 @@ + [1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,9617,9618,9619,9474,9508,9569,9570,9558,9557,9571,9553,9559,9565,9564,9563,9488,9492,9524,9516,9500,9472,9532,9566,9567,9562,9556,9577,9574,9568,9552,9580,9575,9576,9572,9573,9561,9560,9554,9555,9579,9578,9496,9484,9608,9604,9612,9616,9600,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,1025,1105,1028,1108,1031,1111,1038,1118,176,8729,183,8730,8470,164,9632,160], + "ISO-8859-2" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,728,321,164,317,346,167,168,352,350,356,377,173,381,379,176,261,731,322,180,318,347,711,184,353,351,357,378,733,382,380,340,193,194,258,196,313,262,199,268,201,280,203,282,205,206,270,272,323,327,211,212,336,214,215,344,366,218,368,220,221,354,223,341,225,226,259,228,314,263,231,269,233,281,235,283,237,238,271,273,324,328,243,244,337,246,247,345,367,250,369,252,253,355,729], + "ISO-8859-3" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,294,728,163,164,null,292,167,168,304,350,286,308,173,null,379,176,295,178,179,180,181,293,183,184,305,351,287,309,189,null,380,192,193,194,null,196,266,264,199,200,201,202,203,204,205,206,207,null,209,210,211,212,288,214,215,284,217,218,219,220,364,348,223,224,225,226,null,228,267,265,231,232,233,234,235,236,237,238,239,null,241,242,243,244,289,246,247,285,249,250,251,252,365,349,729], + "ISO-8859-4" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,312,342,164,296,315,167,168,352,274,290,358,173,381,175,176,261,731,343,180,297,316,711,184,353,275,291,359,330,382,331,256,193,194,195,196,197,198,302,268,201,280,203,278,205,206,298,272,325,332,310,212,213,214,215,216,370,218,219,220,360,362,223,257,225,226,227,228,229,230,303,269,233,281,235,279,237,238,299,273,326,333,311,244,245,246,247,248,371,250,251,252,361,363,729], + "ISO-8859-5" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,1025,1026,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036,173,1038,1039,1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,8470,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,167,1118,1119], + "ISO-8859-6" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,null,null,null,164,null,null,null,null,null,null,null,1548,173,null,null,null,null,null,null,null,null,null,null,null,null,null,1563,null,null,null,1567,null,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591,1592,1593,1594,null,null,null,null,null,1600,1601,1602,1603,1604,1605,1606,1607,1608,1609,1610,1611,1612,1613,1614,1615,1616,1617,1618,null,null,null,null,null,null,null,null,null,null,null,null,null], + "ISO-8859-7" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,8216,8217,163,8364,8367,166,167,168,169,890,171,172,173,null,8213,176,177,178,179,900,901,902,183,904,905,906,187,908,189,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,null,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,null], + "ISO-8859-8" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,null,162,163,164,165,166,167,168,169,215,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,247,187,188,189,190,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,8215,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514,null,null,8206,8207,null], + "ISO-8859-8-I" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,null,162,163,164,165,166,167,168,169,215,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,247,187,188,189,190,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,8215,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514,null,null,8206,8207,null], + "ISO-8859-10" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,274,290,298,296,310,167,315,272,352,358,381,173,362,330,176,261,275,291,299,297,311,183,316,273,353,359,382,8213,363,331,256,193,194,195,196,197,198,302,268,201,280,203,278,205,206,207,208,325,332,211,212,213,214,360,216,370,218,219,220,221,222,223,257,225,226,227,228,229,230,303,269,233,281,235,279,237,238,239,240,326,333,243,244,245,246,361,248,371,250,251,252,253,254,312], + "ISO-8859-13" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,8221,162,163,164,8222,166,167,216,169,342,171,172,173,174,198,176,177,178,179,8220,181,182,183,248,185,343,187,188,189,190,230,260,302,256,262,196,197,280,274,268,201,377,278,290,310,298,315,352,323,325,211,332,213,214,215,370,321,346,362,220,379,381,223,261,303,257,263,228,229,281,275,269,233,378,279,291,311,299,316,353,324,326,243,333,245,246,247,371,322,347,363,252,380,382,8217], + "ISO-8859-14" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,7682,7683,163,266,267,7690,167,7808,169,7810,7691,7922,173,174,376,7710,7711,288,289,7744,7745,182,7766,7809,7767,7811,7776,7923,7812,7813,7777,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,372,209,210,211,212,213,214,7786,216,217,218,219,220,221,374,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,373,241,242,243,244,245,246,7787,248,249,250,251,252,253,375,255], + "ISO-8859-15" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,8364,165,352,167,353,169,170,171,172,173,174,175,176,177,178,179,381,181,182,183,382,185,186,187,338,339,376,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255], + "ISO-8859-16" => [128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,261,321,8364,8222,352,167,353,169,536,171,377,173,378,379,176,177,268,322,381,8221,182,183,382,269,537,187,338,339,376,380,192,193,194,258,196,262,198,199,200,201,202,203,204,205,206,207,272,323,210,211,212,336,214,346,368,217,218,219,220,280,538,223,224,225,226,259,228,263,230,231,232,233,234,235,236,237,238,239,273,324,242,243,244,337,246,347,369,249,250,251,252,281,539,255], + "KOI8-R" => [9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,9555,9556,9557,9558,9559,9560,9561,9562,9563,9564,9565,9566,9567,9568,9569,1025,9570,9571,9572,9573,9574,9575,9576,9577,9578,9579,9580,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066], + "KOI8-U" => [9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,1108,9556,1110,1111,9559,9560,9561,9562,9563,1169,1118,9566,9567,9568,9569,1025,1028,9571,1030,1031,9574,9575,9576,9577,9578,1168,1038,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066], + "macintosh" => [196,197,199,201,209,214,220,225,224,226,228,227,229,231,233,232,234,235,237,236,238,239,241,243,242,244,246,245,250,249,251,252,8224,176,162,163,167,8226,182,223,174,169,8482,180,168,8800,198,216,8734,177,8804,8805,165,181,8706,8721,8719,960,8747,170,186,937,230,248,191,161,172,8730,402,8776,8710,171,187,8230,160,192,195,213,338,339,8211,8212,8220,8221,8216,8217,247,9674,255,376,8260,8364,8249,8250,64257,64258,8225,183,8218,8222,8240,194,202,193,203,200,205,206,207,204,211,212,63743,210,218,219,217,305,710,732,175,728,729,730,184,733,731,711], + "windows-874" => [8364,129,130,131,132,8230,134,135,136,137,138,139,140,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,152,153,154,155,156,157,158,159,160,3585,3586,3587,3588,3589,3590,3591,3592,3593,3594,3595,3596,3597,3598,3599,3600,3601,3602,3603,3604,3605,3606,3607,3608,3609,3610,3611,3612,3613,3614,3615,3616,3617,3618,3619,3620,3621,3622,3623,3624,3625,3626,3627,3628,3629,3630,3631,3632,3633,3634,3635,3636,3637,3638,3639,3640,3641,3642,null,null,null,null,3647,3648,3649,3650,3651,3652,3653,3654,3655,3656,3657,3658,3659,3660,3661,3662,3663,3664,3665,3666,3667,3668,3669,3670,3671,3672,3673,3674,3675,null,null,null,null], + "windows-1250" => [8364,129,8218,131,8222,8230,8224,8225,136,8240,352,8249,346,356,381,377,144,8216,8217,8220,8221,8226,8211,8212,152,8482,353,8250,347,357,382,378,160,711,728,321,164,260,166,167,168,169,350,171,172,173,174,379,176,177,731,322,180,181,182,183,184,261,351,187,317,733,318,380,340,193,194,258,196,313,262,199,268,201,280,203,282,205,206,270,272,323,327,211,212,336,214,215,344,366,218,368,220,221,354,223,341,225,226,259,228,314,263,231,269,233,281,235,283,237,238,271,273,324,328,243,244,337,246,247,345,367,250,369,252,253,355,729], + "windows-1251" => [1026,1027,8218,1107,8222,8230,8224,8225,8364,8240,1033,8249,1034,1036,1035,1039,1106,8216,8217,8220,8221,8226,8211,8212,152,8482,1113,8250,1114,1116,1115,1119,160,1038,1118,1032,164,1168,166,167,1025,169,1028,171,172,173,174,1031,176,177,1030,1110,1169,181,182,183,1105,8470,1108,187,1112,1029,1109,1111,1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103], + "windows-1252" => [8364,129,8218,402,8222,8230,8224,8225,710,8240,352,8249,338,141,381,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,353,8250,339,157,382,376,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255], + "windows-1253" => [8364,129,8218,402,8222,8230,8224,8225,136,8240,138,8249,140,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,152,8482,154,8250,156,157,158,159,160,901,902,163,164,165,166,167,168,169,null,171,172,173,174,8213,176,177,178,179,900,181,182,183,904,905,906,187,908,189,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,null,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,null], + "windows-1254" => [8364,129,8218,402,8222,8230,8224,8225,710,8240,352,8249,338,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,353,8250,339,157,158,376,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,286,209,210,211,212,213,214,215,216,217,218,219,220,304,350,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,287,241,242,243,244,245,246,247,248,249,250,251,252,305,351,255], + "windows-1255" => [8364,129,8218,402,8222,8230,8224,8225,710,8240,138,8249,140,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,154,8250,156,157,158,159,160,161,162,163,8362,165,166,167,168,169,215,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,247,187,188,189,190,191,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1520,1521,1522,1523,1524,null,null,null,null,null,null,null,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514,null,null,8206,8207,null], + "windows-1256" => [8364,1662,8218,402,8222,8230,8224,8225,710,8240,1657,8249,338,1670,1688,1672,1711,8216,8217,8220,8221,8226,8211,8212,1705,8482,1681,8250,339,8204,8205,1722,160,1548,162,163,164,165,166,167,168,169,1726,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,1563,187,188,189,190,1567,1729,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,215,1591,1592,1593,1594,1600,1601,1602,1603,224,1604,226,1605,1606,1607,1608,231,232,233,234,235,1609,1610,238,239,1611,1612,1613,1614,244,1615,1616,247,1617,249,1618,251,252,8206,8207,1746], + "windows-1257" => [8364,129,8218,131,8222,8230,8224,8225,136,8240,138,8249,140,168,711,184,144,8216,8217,8220,8221,8226,8211,8212,152,8482,154,8250,156,175,731,159,160,null,162,163,164,null,166,167,216,169,342,171,172,173,174,198,176,177,178,179,180,181,182,183,248,185,343,187,188,189,190,230,260,302,256,262,196,197,280,274,268,201,377,278,290,310,298,315,352,323,325,211,332,213,214,215,370,321,346,362,220,379,381,223,261,303,257,263,228,229,281,275,269,233,378,279,291,311,299,316,353,324,326,243,333,245,246,247,371,322,347,363,252,380,382,729], + "windows-1258" => [8364,129,8218,402,8222,8230,8224,8225,710,8240,138,8249,338,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,732,8482,154,8250,339,157,158,376,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,258,196,197,198,199,200,201,202,203,768,205,206,207,272,209,777,211,212,416,214,215,216,217,218,219,220,431,771,223,224,225,226,259,228,229,230,231,232,233,234,235,769,237,238,239,273,241,803,243,244,417,246,247,248,249,250,251,252,432,8363,255], + "x-mac-cyrillic" => [1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,8224,176,1168,163,167,8226,182,1030,174,169,8482,1026,1106,8800,1027,1107,8734,177,8804,8805,1110,181,1169,1032,1028,1108,1031,1111,1033,1113,1034,1114,1112,1029,172,8730,402,8776,8710,171,187,8230,160,1035,1115,1036,1116,1109,8211,8212,8220,8221,8216,8217,247,8222,1038,1118,1039,1119,8470,1025,1105,1103,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,8364], + ]; + protected static $classes = [ + "IBM866" => "IBM866", + "ISO-8859-2" => "ISO88592", + "ISO-8859-3" => "ISO88593", + "ISO-8859-4" => "ISO88594", + "ISO-8859-5" => "ISO88595", + "ISO-8859-6" => "ISO88596", + "ISO-8859-7" => "ISO88597", + "ISO-8859-8" => "ISO88598", + "ISO-8859-8-I" => "ISO88598I", + "ISO-8859-10" => "ISO85910", + "ISO-8859-13" => "ISO885913", + "ISO-8859-14" => "ISO885914", + "ISO-8859-15" => "ISO885915", + "ISO-8859-16" => "ISO885916", + "KOI8-R" => "KOI8R", + "KOI8-U" => "KOI8U", + "macintosh" => "Macintosh", + "windows-874" => "Windows874", + "windows-1250" => "Windows1250", + "windows-1251" => "Windows1251", + "windows-1252" => "Windows1252", + "windows-1253" => "Windows1253", + "windows-1254" => "Windows1254", + "windows-1255" => "Windows1255", + "windows-1256" => "Windows1256", + "windows-1257" => "Windows1257", + "windows-1258" => "Windows1258", + "x-mac-cyrillic" => "XMacCyrillic", + ]; + + protected function checkClass($class) { + $class = '\MensBeam\Intl\Encoding\\'.$class; + if (!class_exists($class)) { + $this->markTestSkipped("Not implemented"); + } + return $class; + } + + /** + * @dataProvider provideCodePoints + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::encode + */ + public function testEncodeCodePoints(string $class, array $input, string $exp) { + $class = $this->checkClass($class); + $out = ""; + foreach ($input as $code) { + $out .= $class::encode($code); + } + $this->assertSame(bin2hex($exp), bin2hex($out)); + } + + /** + * @dataProvider provideInvalids + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::encode + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::err + */ + public function testEncodeInvalidCodePoints(string $class, bool $mode, int $input, $exp) { + $class = $this->checkClass($class); + if ($exp instanceof \Throwable) { + $this->expectException(get_class($exp)); + $this->expectExceptionCode($exp->getCode()); + } + $out = $class::encode($input, $mode); + $this->assertSame($exp, $out); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::__construct + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::nextCode + */ + public function testDecodeMultipleCharactersAsCodePoints(string $class, string $input, array $exp) { + $class = $this->checkClass($class); + $s = new $class($input); + $out = []; + while (($p = $s->nextCode()) !== false) { + $out[] = $p; + } + $this->assertEquals($exp, $out); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::__construct + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::nextChar + */ + public function testDecodeMultipleCharactersAsStrings(string $class, string $input, array $exp) { + $class = $this->checkClass($class); + $exp = array_map(function($v) { + return \IntlChar::chr($v); + }, $exp); + $s = new $class($input); + $out = []; + while (($p = $s->nextChar()) !== "") { + $out[] = $p; + } + $this->assertEquals($exp, $out); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::rewind + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::chars + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::codes + */ + public function testIterateThroughAString(string $class, string $input, array $exp) { + $out = []; + $class = $this->checkClass($class); + $s = new $class($input); + $a = 0; + $this->assertTrue(true); // prevent risky test of empty string + foreach ($s->codes() as $index => $p) { + $this->assertSame($a, $index, "Character key at index $a reported incorrectly"); + $this->assertSame($exp[$a], $p, "Character at index $a decoded incorrectly"); + $a++; + } + $a = 0; + foreach ($s->codes() as $p) { + $a++; + } + $this->assertSame(0, $a); + $s->rewind(); + foreach ($s->codes() as $p) { + $a++; + } + $this->assertSame(sizeof($exp), $a); + + $exp = array_map(function($v) { + return \IntlChar::chr($v); + }, $exp); + + foreach ($s->chars() as $index => $p) { + $this->assertSame($a, $index, "Character key at index $a reported incorrectly"); + $this->assertSame(bin2hex($exp[$a]), bin2hex($p), "Character at index $a decoded incorrectly"); + $a++; + } + $a = 0; + foreach ($s->chars() as $p) { + $a++; + } + $this->assertSame(0, $a); + $s->rewind(); + foreach ($s->chars() as $p) { + $a++; + } + $this->assertSame(sizeof($exp), $a); + } + + /** + * @dataProvider provideClasses + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::seek + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posChar + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posByte + */ + public function testSeekThroughAString(string $class) { + $class = $this->checkClass($class); + $input = "0123456"; + $s = new $class($input); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + + $this->assertSame(0, $s->seek(0)); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + + $this->assertSame(1, $s->seek(-1)); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + + $this->assertSame(0, $s->seek(1)); + $this->assertSame(1, $s->posChar()); + $this->assertSame(1, $s->posByte()); + + $this->assertSame(0, $s->seek(2)); + $this->assertSame(3, $s->posChar()); + $this->assertSame(3, $s->posByte()); + + $this->assertSame(0, $s->seek(4)); + $this->assertSame(7, $s->posChar()); + $this->assertSame(7, $s->posByte()); + + $this->assertSame(1, $s->seek(1)); + $this->assertSame(7, $s->posChar()); + $this->assertSame(7, $s->posByte()); + + $this->assertSame(0, $s->seek(-3)); + $this->assertSame(4, $s->posChar()); + $this->assertSame(4, $s->posByte()); + + $this->assertSame(6, $s->seek(-10)); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + } + + /** + * @dataProvider provideClasses + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posChar + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::posByte + */ + public function testTraversePastTheEndOfAString(string $class) { + $class = $this->checkClass($class); + $input = "a"; + $s = new $class($input); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + + $this->assertSame("a", $s->nextChar()); + $this->assertSame(1, $s->posChar()); + $this->assertSame(1, $s->posByte()); + + $this->assertSame("", $s->nextChar()); + $this->assertSame(1, $s->posChar()); + $this->assertSame(1, $s->posByte()); + + $s = new $class($input); + $this->assertSame(0, $s->posChar()); + $this->assertSame(0, $s->posByte()); + + $this->assertSame(ord("a"), $s->nextCode()); + $this->assertSame(1, $s->posChar()); + $this->assertSame(1, $s->posByte()); + + $this->assertSame(false, $s->nextCode()); + $this->assertSame(1, $s->posChar()); + $this->assertSame(1, $s->posByte()); + } + + /** + * @dataProvider provideClasses + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::peekChar + */ + public function testPeekAtCharacters(string $class) { + $class = $this->checkClass($class); + $input = "0123456"; + $s = new $class($input); + $s->seek(2); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $this->assertSame(bin2hex("2"), bin2hex($s->peekChar())); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $this->assertSame(bin2hex("23"), bin2hex($s->peekChar(2))); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $s->seek(3); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + + $this->assertSame(bin2hex("56"), bin2hex($s->peekChar(3))); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + + $this->assertSame("", $s->peekChar(-5)); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + } + + /** + * @dataProvider provideClasses + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::peekCode + */ + public function testPeekAtCodePoints(string $class) { + $class = $this->checkClass($class); + $input = "0123456"; + $s = new $class($input); + $s->seek(2); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $this->assertSame([0x32], $s->peekCode()); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $this->assertSame([0x32, 0x33], $s->peekCode(2)); + $this->assertSame(2, $s->posChar()); + $this->assertSame(2, $s->posByte()); + + $s->seek(3); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + + $this->assertSame([0x35, 0x36], $s->peekCode(3)); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + + $this->assertSame([], $s->peekCode(-5)); + $this->assertSame(5, $s->posChar()); + $this->assertSame(5, $s->posByte()); + } + + /** + * @dataProvider provideClasses + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::len + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateSave + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::stateApply + */ + public function testGetStringLength(string $class) { + $class = $this->checkClass($class); + $input = "0123456"; + $s = new $class($input); + $s->seek(1); + $posChar = $s->posChar(); + $posByte = $s->posByte(); + + $this->assertSame(strlen($input), $s->len()); + $this->assertSame($posChar, $s->posChar()); + $this->assertSame($posByte, $s->posByte()); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\SingleByteEncoding::err + */ + public function testReplacementModes(string $class, string $input, array $points) { + $class = $this->checkClass($class); + if (($bump = array_search(0xFFFD, $points, true)) === false) { + // if the encoding uses all 128 high byte values, this test is non-operative + $this->assertTrue(true); + return; + } + // officially test replacement characters and null replacement (already effectively tested by other tests) + $s = new $class($input, false); + $s->seek($bump); + $this->assertSame(0xFFFD, $s->nextCode()); + $s->seek(-2); + // test fatal mode + $s = new $class($input, true); + $s->seek($bump); + try { + $p = $s->nextCode(); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame($bump + 1, $s->posChar()); + $s->seek(-2); + $this->assertSame($bump - 1, $s->posChar()); + try { + $p = $s->peekCode(2); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame($bump - 1, $s->posChar()); + try { + $p = $s->peekChar(2); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame($bump - 1, $s->posChar()); + } + + public function provideClasses() { + foreach (self::$classes as $name => $class) { + yield $name => [$class]; + } + } + + public function provideInvalids() { + $exc1 = new EncoderException("", SingleByteEncoding::E_INVALID_CODE_POINT); + $exc2 = new EncoderException("", SingleByteEncoding::E_UNAVAILABLE_CODE_POINT); + foreach (self::$classes as $name => $class) { + yield "$name point < 0 (fatal mode)" => [$class, true, -1, $exc1]; + yield "$name point > 0x10FFFF (fatal mode)" => [$class, true, 0x110000, $exc1]; + yield "$name point unavailable (fatal mode)" => [$class, true, 0xFFFD, $exc2]; + yield "$name point < 0 (HTML mode)" => [$class, false, -1, $exc1]; + yield "$name point > 0x10FFFF (HTML mode)" => [$class, false, 0x110000, $exc1]; + yield "$name point unavailable (HTML mode)" => [$class, false, 0xFFFD, "�"]; + } + } + + public function provideCodePoints() { + foreach (self::$classes as $name => $class) { + $bytes = ""; + $codes = []; + for ($a = 0; $a < 128; $a++) { + $bytes .= chr($a); + $codes[] = $a; + } + for ($a = 0; $a < 128; $a++) { + if (is_null(self::$maps[$name][$a])) { + continue; + } + $bytes .= chr($a + 128); + $codes[] = self::$maps[$name][$a]; + } + yield $name => [$class, $codes, $bytes]; + } + } + + public function provideStrings() { + $bytes = (function() { + $out = ""; + for ($a = 0; $a < 256; $a++) { + $out .= chr($a); + } + return $out; + })(); + foreach (self::$classes as $name => $class) { + $codes = array_merge(range(0,127),array_map(function($v) { + return $v ?? 0xFFFD; + }, self::$maps[$name])); + yield $name => [$class, $bytes, $codes]; + } + } +} diff --git a/tests/phpunit.xml b/tests/phpunit.xml index 7201cb2..7cc127d 100644 --- a/tests/phpunit.xml +++ b/tests/phpunit.xml @@ -17,8 +17,9 @@ - + cases/Encoding/TestUTF8.php + cases/Encoding/TestSingleByte.php diff --git a/tools/mkindex.php b/tools/mkindex.php new file mode 100644 index 0000000..939dee5 --- /dev/null +++ b/tools/mkindex.php @@ -0,0 +1,54 @@ +"; + $i = $index; + } + $dec_code[] = $key."$code"; + $dec_char[] = $key."\"\\u{".$hex."}\""; + // the encoder table will be reprocessed later + $enc[$code] = "\"\\x$byte\""; + $i++; +} +// sort the encoder table by keys to order it correctly +ksort($enc); +$i = 0; +foreach ($enc as $index => $value) { + if ($index == $i) { + $key = ""; + } else { + $key = "$index=>"; + $i = $index; + } + $enc[$index] = "$key$value"; + $i++; +} +$dec_char = implode(",", $dec_char); +$dec_code = implode(",", $dec_code); +$enc = implode(",", $enc); +echo " const TABLE_DEC_CHAR = [$dec_char];\n"; +echo " const TABLE_DEC_CODE = [$dec_code];\n"; +echo " const TABLE_ENC = [$enc];\n";