diff --git a/tests/cases/Encoding/TestBig5.php b/tests/cases/Encoding/TestBig5.php
index 77d486d..3652391 100644
--- a/tests/cases/Encoding/TestBig5.php
+++ b/tests/cases/Encoding/TestBig5.php
@@ -7,6 +7,7 @@ declare(strict_types=1);
 namespace MensBeam\Intl\TestCase\Encoding;
 
 use MensBeam\Intl\Encoding\Big5;
+use MensBeam\Intl\Encoding\Encoding;
 use MensBeam\Intl\Encoding\EncoderException;
 
 class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php
index f4a97d0..fee1755 100644
--- a/tests/cases/Encoding/TestGB18030.php
+++ b/tests/cases/Encoding/TestGB18030.php
@@ -8,6 +8,7 @@ namespace MensBeam\Intl\TestCase\Encoding;
 
 use MensBeam\Intl\Encoding\GBK;
 use MensBeam\Intl\Encoding\GB18030;
+use MensBeam\Intl\Encoding\Encoding;
 use MensBeam\Intl\Encoding\EncoderException;
 
 class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
@@ -136,48 +137,53 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
 
     public function provideCodePoints() {
         // bytes confirmed using Firefox
-        $series = [
-            "GBK ASCII (fatal)" => [GBK::class, true, 0x64, "64"],
-            "GBK 0x20AC (fatal)" => [GBK::class, true, 0x20AC, "80"],
-            "GBK 0x2164 (fatal)" => [GBK::class, true, 0x2164, "A2 F5"],
-            "GBK 0x3A74 (fatal)" => [GBK::class, true, 0x3A74, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
-            "GBK 0xE7C7 (fatal)" => [GBK::class, true, 0xE7C7, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
-            "GBK 0x1D11E (fatal)" => [GBK::class, true, 0x1D11E, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
-            "GBK 0xE5E5 (fatal)" => [GBK::class, true, 0xE5E5, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
-            "GBK -1 (fatal)" => [GBK::class, true, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
-            "GBK 0x110000 (fatal)" => [GBK::class, true, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
-            "GB18030 ASCII (fatal)" => [GB18030::class, true, 0x64, "64"],
-            "GB18030 0x20AC (fatal)" => [GB18030::class, true, 0x20AC, "A2 E3"],
-            "GB18030 0x2164 (fatal)" => [GB18030::class, true, 0x2164, "A2 F5"],
-            "GB18030 0x3A74 (fatal)" => [GB18030::class, true, 0x3A74, "82 31 97 30"],
-            "GB18030 0xE7C7 (fatal)" => [GB18030::class, true, 0xE7C7, "81 35 F4 37"],
-            "GB18030 0x1D11E (fatal)" => [GB18030::class, true, 0x1D11E, "94 32 BE 34"],
-            "GB18030 0xE5E5 (fatal)" => [GB18030::class, true, 0xE5E5, new EncoderException("", GB18030::E_UNAVAILABLE_CODE_POINT)],
-            "GB18030 -1 (fatal)" => [GB18030::class, true, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
-            "GB18030 0x110000 (fatal)" => [GB18030::class, true, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
-            "GBK ASCII (HTML)" => [GBK::class, false, 0x64, "64"],
-            "GBK 0x20AC (HTML)" => [GBK::class, false, 0x20AC, "80"],
-            "GBK 0x2164 (HTML)" => [GBK::class, false, 0x2164, "A2 F5"],
-            "GBK 0x3A74 (HTML)" => [GBK::class, false, 0x3A74, bin2hex("&#".(0x3A74).";")],
-            "GBK 0xE7C7 (HTML)" => [GBK::class, false, 0xE7C7, bin2hex("&#".(0xE7C7).";")],
-            "GBK 0x1D11E (HTML)" => [GBK::class, false, 0x1D11E, bin2hex("&#".(0x1D11E).";")],
-            "GBK 0xE5E5 (HTML)" => [GBK::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
-            "GBK -1 (HTML)" => [GBK::class, false, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
-            "GBK 0x110000 (HTML)" => [GBK::class, false, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
-            "GB18030 ASCII (HTML)" => [GB18030::class, false, 0x64, "64"],
-            "GB18030 0x20AC (HTML)" => [GB18030::class, false, 0x20AC, "A2 E3"],
-            "GB18030 0x2164 (HTML)" => [GB18030::class, false, 0x2164, "A2 F5"],
-            "GB18030 0x3A74 (HTML)" => [GB18030::class, false, 0x3A74, "82 31 97 30"],
-            "GB18030 0xE7C7 (HTML)" => [GB18030::class, false, 0xE7C7, "81 35 F4 37"],
-            "GB18030 0x1D11E (HTML)" => [GB18030::class, false, 0x1D11E, "94 32 BE 34"],
-            "GB18030 0xE5E5 (HTML)" => [GB18030::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
-            "GB18030 -1 (HTML)" => [GB18030::class, false, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
-            "GB18030 0x110000 (HTML)" => [GB18030::class, false, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
+        $series_gb18030 = [
+            'U+0064 (HTML)' => [false, 0x64, "64"],
+            'U+0064 (fatal)' => [true, 0x64, "64"],
+            'U+20AC (HTML)' => [false, 0x20AC, "A2 E3"],
+            'U+20AC (fatal)' => [true, 0x20AC, "A2 E3"],
+            'U+2164 (HTML)' => [false, 0x2164, "A2 F5"],
+            'U+2164 (fatal)' => [true, 0x2164, "A2 F5"],
+            'U+3A74 (HTML)' => [false, 0x3A74, "82 31 97 30"],
+            'U+3A74 (fatal)' => [true, 0x3A74, "82 31 97 30"],
+            'U+E7C7 (HTML)' => [false, 0xE7C7, "81 35 F4 37"],
+            'U+E7C7 (fatal)' => [true, 0xE7C7, "81 35 F4 37"],
+            'U+1D11E (HTML)' => [false, 0x1D11E, "94 32 BE 34"],
+            'U+1D11E (fatal)' => [true, 0x1D11E, "94 32 BE 34"],
+            'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("&#58853;")],
+            'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
+            '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+        ];
+        $series_gbk = [
+            'U+0064 (HTML)' => [false, 0x64, "64"],
+            'U+0064 (fatal)' => [true, 0x64, "64"],
+            'U+20AC (HTML)' => [false, 0x20AC, "80"],
+            'U+20AC (fatal)' => [true, 0x20AC, "80"],
+            'U+2164 (HTML)' => [false, 0x2164, "A2 F5"],
+            'U+2164 (fatal)' => [true, 0x2164, "A2 F5"],
+            'U+3A74 (HTML)' => [false, 0x3A74, bin2hex("&#14964;")],
+            'U+3A74 (fatal)' => [true, 0x3A74, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
+            'U+E7C7 (HTML)' => [false, 0xE7C7, bin2hex("&#59335;")],
+            'U+E7C7 (fatal)' => [true, 0xE7C7, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
+            'U+1D11E (HTML)' => [false, 0x1D11E, bin2hex("&#119070;")],
+            'U+1D11E (fatal)' => [true, 0x1D11E, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
+            'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("&#58853;")],
+            'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
+            '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
         ];
-        foreach ($series as $name => $test) {
-            $class = array_shift($test);
-            array_push($test, $class);
-            yield $name => $test;
+        foreach ($series_gb18030 as $name => $test) {
+            array_push($test, GB18030::class);
+            yield "gb18030 $name" => $test;
+        }
+        foreach ($series_gbk as $name => $test) {
+            array_push($test, GBK::class);
+            yield "GBK $name" => $test;
         }
     }
 
@@ -232,7 +238,6 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
             'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]],
             'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]],
             'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]],
-
         ];
     }
 
diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php
index 96b2f76..eaf0ab9 100644
--- a/tests/cases/Encoding/TestUTF8.php
+++ b/tests/cases/Encoding/TestUTF8.php
@@ -7,6 +7,7 @@ declare(strict_types=1);
 namespace MensBeam\Intl\TestCase\Encoding;
 
 use MensBeam\Intl\Encoding\UTF8;
+use MensBeam\Intl\Encoding\Encoding;
 use MensBeam\Intl\Encoding\EncoderException;
 
 class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
@@ -127,21 +128,26 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
     }
 
     public function provideCodePoints() {
-        $series = [
-            "122" => [122, "7A"],
-            "162" => [162, "C2 A2"],
-            "27700" => [27700, "E6 B0 B4"],
-            "119070" => [119070, "F0 9D 84 9E"],
-            "63743" => [63743, "EF A3 BF"],
-            "1114109" => [1114109, "F4 8F BF BD"],
-            "65534" => [65534, "EF BF BE"],
-            "-1" => [-1, new EncoderException("", UTF8::E_INVALID_CODE_POINT)],
-            "1114112" => [1114112, new EncoderException("", UTF8::E_INVALID_CODE_POINT)],
+        return [
+            'U+007A (HTML)' => [false, 0x7A, "7A"],
+            'U+007A (fatal)' => [true, 0x7A, "7A"],
+            'U+00A2 (HTML)' => [false, 0xA2, "C2 A2"],
+            'U+00A2 (fatal)' => [true, 0xA2, "C2 A2"],
+            'U+6C34 (HTML)' => [false, 0x6C34, "E6 B0 B4"],
+            'U+6C34 (fatal)' => [true, 0x6C34, "E6 B0 B4"],
+            'U+1D11E (HTML)' => [false, 0x1D11E, "F0 9D 84 9E"],
+            'U+1D11E (fatal)' => [true, 0x1D11E, "F0 9D 84 9E"],
+            'U+F8FF (HTML)' => [false, 0xF8FF, "EF A3 BF"],
+            'U+F8FF (fatal)' => [true, 0xF8FF, "EF A3 BF"],
+            'U+10FFFD (HTML)' => [false, 0x10FFFD, "F4 8F BF BD"],
+            'U+10FFFD (fatal)' => [true, 0x10FFFD, "F4 8F BF BD"],
+            'U+FFFE (HTML)' => [false, 0xFFFE, "EF BF BE"],
+            'U+FFFE (fatal)' => [true, 0xFFFE, "EF BF BE"],
+            '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
+            '0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
         ];
-        foreach ($series as $name => $test) {
-            yield "$name (fatal)" => array_merge([true], $test);
-            yield "$name (HTML)" => array_merge([false], $test);
-        }
     }
 
     public function provideStrings() {
diff --git a/tools/mktestbig5.html b/tools/mktestbig5.html
deleted file mode 100644
index 034ff95..0000000
--- a/tools/mktestbig5.html
+++ /dev/null
@@ -1,62 +0,0 @@
-
-
-
-
-
diff --git a/tools/mktestgbk.html b/tools/mktestgbk.html
deleted file mode 100644
index 03534dc..0000000
--- a/tools/mktestgbk.html
+++ /dev/null
@@ -1,121 +0,0 @@
-
-
-
-
-
diff --git a/tools/test-big5.html b/tools/test-big5.html
new file mode 100644
index 0000000..59d8869
--- /dev/null
+++ b/tools/test-big5.html
@@ -0,0 +1,14 @@
+
+
+
+
diff --git a/tools/test-gb18030.html b/tools/test-gb18030.html
new file mode 100644
index 0000000..9722f3d
--- /dev/null
+++ b/tools/test-gb18030.html
@@ -0,0 +1,76 @@
+
+
+
+
diff --git a/tools/test-gbk.html b/tools/test-gbk.html
new file mode 100644
index 0000000..2b02d4b
--- /dev/null
+++ b/tools/test-gbk.html
@@ -0,0 +1,16 @@
+
+
+
+
diff --git a/tools/test-utf16.html b/tools/test-utf16.html
new file mode 100644
index 0000000..75b7b5e
--- /dev/null
+++ b/tools/test-utf16.html
@@ -0,0 +1,20 @@
+
+
+
+
diff --git a/tools/test-utf8.html b/tools/test-utf8.html
new file mode 100644
index 0000000..06ba251
--- /dev/null
+++ b/tools/test-utf8.html
@@ -0,0 +1,70 @@
+
+
+
+
diff --git a/tools/test.js b/tools/test.js
new file mode 100644
index 0000000..2b65348
--- /dev/null
+++ b/tools/test.js
@@ -0,0 +1,187 @@
+"use strict";
+// set up the pre-formatted text element which receives all output
+window.out = document.createElement("pre");
+document.documentElement.appendChild(out);
+
+// Use the encoding the browser actually applied to the document: it governs
+// the query-string encoding relied upon by encodeCodePoint(). Named properties
+// of an HTMLCollection match only id and name attributes, so the collection of
+// <meta> elements has no usable "charset" property to read instead
+var encoding = document.characterSet;
+
+// ASCII characters which cannot survive a round trip through a URL query:
+// "#" starts the fragment, "%" is passed through unescaped, and "&" is
+// indistinguishable from the start of a numeric character reference
+var urlSyntaxCharacters = "#%&";
+
+/** Formats an integer from 0 to 255 as two uppercase hexadecimal digits */
+function hexByte(value) {
+    return value.toString(16).padStart(2, "0").toUpperCase();
+}
+
+/**
+ * Encodes a single code point using the document's encoding
+ *
+ * The browser does the actual work: the character is placed in the query
+ * string of a link, and the percent-encoded result is read back
+ *
+ * @param {number} code - The code point to encode
+ * @param {boolean} fatal - Whether an unencodable character yields an exception rather than a numeric character reference
+ * @returns {string[]|string} An array of hexadecimal bytes on success;
+ *   otherwise either PHP source text for the expected exception or, in
+ *   non-fatal mode, the numeric character reference (e.g. "&#14964;")
+ */
+function encodeCodePoint(code, fatal) {
+    if (code < 0 || code > 0x10FFFF) {
+        return 'new EncoderException("", Encoding::E_INVALID_CODE_POINT)';
+    }
+    var char = String.fromCodePoint(code);
+    var unit = char.charCodeAt(0);
+    if (unit <= 0x20 || urlSyntaxCharacters.includes(char)) {
+        // C0 controls and the space are stripped from the end of a URL, and the
+        // other characters are URL syntax; all of these encode as themselves in
+        // any ASCII-compatible encoding, so they are output directly.
+        // NOTE: ISO-2022-JP rejects U+000E, U+000F, and U+001B; these would
+        // need special handling if that encoding is ever tested
+        return [hexByte(unit)];
+    }
+    var l = document.createElement("a");
+    l.href = "http://example.com/?" + char;
+    var url = l.search.substring(1);
+    var bytes = [];
+    for (let a = 0; a < url.length; a++) {
+        let c = url.charAt(a);
+        if (c == "&" || url.startsWith("%26%23", a)) {
+            // character cannot be encoded
+            if (fatal) {
+                return 'new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)';
+            } else {
+                return decodeURIComponent(url);
+            }
+        } else if (c == "%") {
+            bytes.push(url.substring(a + 1, a + 3).toUpperCase());
+            a = a + 2;
+        } else {
+            bytes.push(hexByte(url.charCodeAt(a)));
+        }
+    }
+    return bytes;
+}
+
+/**
+ * Formats the result of encodeCodePoint() as a PHP expression
+ *
+ * @param {number} code - The code point to encode
+ * @param {boolean} fatal - Whether the encoder is in fatal mode
+ * @returns {string} PHP source text for the expected test output
+ */
+function wrapCodePoint(code, fatal) {
+    var result = encodeCodePoint(code, fatal);
+    if (Array.isArray(result)) {
+        return '"' + result.join(" ") + '"';
+    } else if (result.charAt(0) == "&") {
+        return 'bin2hex("' + result + '")';
+    } else {
+        return result;
+    }
+}
+
+// decode each sample byte string and output a data-provider entry for it
+if (typeof sampleStrings != 'undefined') {
+    var decoder = new TextDecoder(encoding);
+    for (let name in sampleStrings) {
+        let input = sampleStrings[name].replace(/\s/g, "");
+        let bytes = [];
+        for (let a = 0; a < input.length; a = a + 2) {
+            bytes.push(parseInt(input.substring(a, a + 2), 16));
+        }
+        let text = decoder.decode(new Uint8Array(bytes));
+        let codes = [];
+        for (let a = 0; a < text.length; a++) {
+            let point = text.codePointAt(a);
+            if (point >= 55296 && point <= 57343) {
+                // non-BMP characters have trailing low surrogates in JavaScript strings
+                continue;
+            }
+            codes.push(point);
+        }
+        codes = codes.join(", ");
+        bytes = sampleStrings[name];
+        let line = "'" + name + "' => [" + '"' + bytes + '", [' + codes + "]],\n";
+        out.appendChild(document.createTextNode(line));
+    }
+    out.appendChild(document.createTextNode("\n\n"));
+}
+
+// output HTML-mode and fatal-mode data-provider entries for each sample code point
+if (typeof sampleCharacters != 'undefined') {
+    for (let name in sampleCharacters) {
+        let code = sampleCharacters[name];
+        // valid code points are printed in hexadecimal; anything else is printed as-is
+        let label = (code > -1 && code % 1 == 0) ? "0x" + code.toString(16).toUpperCase() : code;
+        let line1 = "'" + name + " (HTML)' => [false, " + label + ", " + wrapCodePoint(code, false) + "],\n";
+        let line2 = "'" + name + " (fatal)' => [true, " + label + ", " + wrapCodePoint(code, true) + "],\n";
+        out.appendChild(document.createTextNode(line1));
+        out.appendChild(document.createTextNode(line2));
+    }
+    out.appendChild(document.createTextNode("\n\n"));
+}
+
+// output the properties used by seek tests
+if (typeof seekCodePoints != 'undefined') {
+    // first gather statistics on the encoding of the specified array of code points
+    var stats = [];
+    var a = 0;
+    var offset = 0;
+    for (let b = 0; b < seekCodePoints.length; b++) {
+        let code = seekCodePoints[b];
+        stats[a] = {
+            'code': code,
+            'offset': offset,
+            'length': 0,
+            'bytes': "",
+        };
+        let bytes = encodeCodePoint(code, true);
+        if (Array.isArray(bytes)) {
+            stats[a].length = bytes.length;
+            stats[a].bytes = bytes.join("").toUpperCase();
+            offset = offset + bytes.length;
+        } else {
+            stats[a].length = 1;
+            stats[a].bytes = "()";
+            offset = offset + 1;
+        }
+        a++;
+    }
+    var end = [a, offset];
+    // summarize the statistics in a comment
+    var comment = "/*\n";
+    for (let a = 0; a < stats.length; a++) {
+        let length = (stats[a].length == 1) ? "(1 byte) " : "(" + stats[a].length + " bytes)";
+        comment = comment + " Char " + a + " U+" + stats[a].code.toString(16).padStart(4, "0").padEnd(6, " ").toUpperCase() + " " + length + " Offset " + stats[a].offset + "\n";
+    }
+    comment = comment + " End of string at char " + end[0] + ", offset " + end[1] + "\n";
+    comment = comment + "*/\n";
+    // build the encoded byte string
+    var bytes = [];
+    for (let char of stats) {
+        bytes.push(char.bytes);
+    }
+    bytes = 'protected $seekString = "' + bytes.join(" ") + '";' + "\n";
+    // build the array of code points
+    var codes = [];
+    for (let char of stats) {
+        codes.push("0x" + char.code.toString(16).toUpperCase());
+    }
+    codes = 'protected $seekCodes = [' + codes.join(", ") + "];\n";
+    // build the array of offsets
+    var offs = [];
+    for (let char of stats) {
+        offs.push(char.offset);
+    }
+    offs = 'protected $seekOffsets = [' + offs.join(", ") + "];\n";
+    // output the results
+    out.appendChild(document.createTextNode(comment));
+    out.appendChild(document.createTextNode(bytes));
+    out.appendChild(document.createTextNode(codes));
+    out.appendChild(document.createTextNode(offs));
+}