diff --git a/tests/cases/Encoding/TestBig5.php b/tests/cases/Encoding/TestBig5.php index 77d486d..3652391 100644 --- a/tests/cases/Encoding/TestBig5.php +++ b/tests/cases/Encoding/TestBig5.php @@ -7,6 +7,7 @@ declare(strict_types=1); namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\Big5; +use MensBeam\Intl\Encoding\Encoding; use MensBeam\Intl\Encoding\EncoderException; class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest { diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php index f4a97d0..fee1755 100644 --- a/tests/cases/Encoding/TestGB18030.php +++ b/tests/cases/Encoding/TestGB18030.php @@ -8,6 +8,7 @@ namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\GBK; use MensBeam\Intl\Encoding\GB18030; +use MensBeam\Intl\Encoding\Encoding; use MensBeam\Intl\Encoding\EncoderException; class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { @@ -136,48 +137,53 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { public function provideCodePoints() { // bytes confirmed using Firefox - $series = [ - "GBK ASCII (fatal)" => [GBK::class, true, 0x64, "64"], - "GBK 0x20AC (fatal)" => [GBK::class, true, 0x20AC, "80"], - "GBK 0x2164 (fatal)" => [GBK::class, true, 0x2164, "A2 F5"], - "GBK 0x3A74 (fatal)" => [GBK::class, true, 0x3A74, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)], - "GBK 0xE7C7 (fatal)" => [GBK::class, true, 0xE7C7, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)], - "GBK 0x1D11E (fatal)" => [GBK::class, true, 0x1D11E, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)], - "GBK 0xE5E5 (fatal)" => [GBK::class, true, 0xE5E5, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)], - "GBK -1 (fatal)" => [GBK::class, true, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)], - "GBK 0x110000 (fatal)" => [GBK::class, true, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)], - "GB18030 ASCII (fatal)" => [GB18030::class, true, 
0x64, "64"], - "GB18030 0x20AC (fatal)" => [GB18030::class, true, 0x20AC, "A2 E3"], - "GB18030 0x2164 (fatal)" => [GB18030::class, true, 0x2164, "A2 F5"], - "GB18030 0x3A74 (fatal)" => [GB18030::class, true, 0x3A74, "82 31 97 30"], - "GB18030 0xE7C7 (fatal)" => [GB18030::class, true, 0xE7C7, "81 35 F4 37"], - "GB18030 0x1D11E (fatal)" => [GB18030::class, true, 0x1D11E, "94 32 BE 34"], - "GB18030 0xE5E5 (fatal)" => [GB18030::class, true, 0xE5E5, new EncoderException("", GB18030::E_UNAVAILABLE_CODE_POINT)], - "GB18030 -1 (fatal)" => [GB18030::class, true, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)], - "GB18030 0x110000 (fatal)" => [GB18030::class, true, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)], - "GBK ASCII (HTML)" => [GBK::class, false, 0x64, "64"], - "GBK 0x20AC (HTML)" => [GBK::class, false, 0x20AC, "80"], - "GBK 0x2164 (HTML)" => [GBK::class, false, 0x2164, "A2 F5"], - "GBK 0x3A74 (HTML)" => [GBK::class, false, 0x3A74, bin2hex("&#".(0x3A74).";")], - "GBK 0xE7C7 (HTML)" => [GBK::class, false, 0xE7C7, bin2hex("&#".(0xE7C7).";")], - "GBK 0x1D11E (HTML)" => [GBK::class, false, 0x1D11E, bin2hex("&#".(0x1D11E).";")], - "GBK 0xE5E5 (HTML)" => [GBK::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")], - "GBK -1 (HTML)" => [GBK::class, false, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)], - "GBK 0x110000 (HTML)" => [GBK::class, false, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)], - "GB18030 ASCII (HTML)" => [GB18030::class, false, 0x64, "64"], - "GB18030 0x20AC (HTML)" => [GB18030::class, false, 0x20AC, "A2 E3"], - "GB18030 0x2164 (HTML)" => [GB18030::class, false, 0x2164, "A2 F5"], - "GB18030 0x3A74 (HTML)" => [GB18030::class, false, 0x3A74, "82 31 97 30"], - "GB18030 0xE7C7 (HTML)" => [GB18030::class, false, 0xE7C7, "81 35 F4 37"], - "GB18030 0x1D11E (HTML)" => [GB18030::class, false, 0x1D11E, "94 32 BE 34"], - "GB18030 0xE5E5 (HTML)" => [GB18030::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")], - 
"GB18030 -1 (HTML)" => [GB18030::class, false, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)], - "GB18030 0x110000 (HTML)" => [GB18030::class, false, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)], + $series_gb18030 = [ + 'U+0064 (HTML)' => [false, 0x64, "64"], + 'U+0064 (fatal)' => [true, 0x64, "64"], + 'U+20AC (HTML)' => [false, 0x20AC, "A2 E3"], + 'U+20AC (fatal)' => [true, 0x20AC, "A2 E3"], + 'U+2164 (HTML)' => [false, 0x2164, "A2 F5"], + 'U+2164 (fatal)' => [true, 0x2164, "A2 F5"], + 'U+3A74 (HTML)' => [false, 0x3A74, "82 31 97 30"], + 'U+3A74 (fatal)' => [true, 0x3A74, "82 31 97 30"], + 'U+E7C7 (HTML)' => [false, 0xE7C7, "81 35 F4 37"], + 'U+E7C7 (fatal)' => [true, 0xE7C7, "81 35 F4 37"], + 'U+1D11E (HTML)' => [false, 0x1D11E, "94 32 BE 34"], + 'U+1D11E (fatal)' => [true, 0x1D11E, "94 32 BE 34"], + 'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("&#58853;")], + 'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + 'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + 'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + ]; + $series_gbk = [ + 'U+0064 (HTML)' => [false, 0x64, "64"], + 'U+0064 (fatal)' => [true, 0x64, "64"], + 'U+20AC (HTML)' => [false, 0x20AC, "80"], + 'U+20AC (fatal)' => [true, 0x20AC, "80"], + 'U+2164 (HTML)' => [false, 0x2164, "A2 F5"], + 'U+2164 (fatal)' => [true, 0x2164, "A2 F5"], + 'U+3A74 (HTML)' => [false, 0x3A74, bin2hex("&#14964;")], + 'U+3A74 (fatal)' => [true, 0x3A74, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + 'U+E7C7 (HTML)' => [false, 0xE7C7, bin2hex("&#59335;")], + 'U+E7C7 (fatal)' => [true, 0xE7C7, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + 'U+1D11E (HTML)' => [false, 
0x1D11E, bin2hex("&#119070;")], + 'U+1D11E (fatal)' => [true, 0x1D11E, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + 'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("&#58853;")], + 'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)], + '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + 'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + 'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], ]; - foreach ($series as $name => $test) { - $class = array_shift($test); - array_push($test, $class); - yield $name => $test; + foreach ($series_gb18030 as $name => $test) { + array_push($test, GB18030::class); + yield "gb18030 $name" => $test; + } + foreach ($series_gbk as $name => $test) { + array_push($test, GBK::class); + yield "GBK $name" => $test; } } @@ -232,7 +238,6 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { 'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]], 'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]], 'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]], - ]; } diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index 96b2f76..eaf0ab9 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -7,6 +7,7 @@ declare(strict_types=1); namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\UTF8; +use MensBeam\Intl\Encoding\Encoding; use MensBeam\Intl\Encoding\EncoderException; class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { @@ -127,21 +128,26 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { } public function 
provideCodePoints() { - $series = [ - "122" => [122, "7A"], - "162" => [162, "C2 A2"], - "27700" => [27700, "E6 B0 B4"], - "119070" => [119070, "F0 9D 84 9E"], - "63743" => [63743, "EF A3 BF"], - "1114109" => [1114109, "F4 8F BF BD"], - "65534" => [65534, "EF BF BE"], - "-1" => [-1, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], - "1114112" => [1114112, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], + return [ + 'U+007A (HTML)' => [false, 0x7A, "7A"], + 'U+007A (fatal)' => [true, 0x7A, "7A"], + 'U+00A2 (HTML)' => [false, 0xA2, "C2 A2"], + 'U+00A2 (fatal)' => [true, 0xA2, "C2 A2"], + 'U+6C34 (HTML)' => [false, 0x6C34, "E6 B0 B4"], + 'U+6C34 (fatal)' => [true, 0x6C34, "E6 B0 B4"], + 'U+1D11E (HTML)' => [false, 0x1D11E, "F0 9D 84 9E"], + 'U+1D11E (fatal)' => [true, 0x1D11E, "F0 9D 84 9E"], + 'U+F8FF (HTML)' => [false, 0xF8FF, "EF A3 BF"], + 'U+F8FF (fatal)' => [true, 0xF8FF, "EF A3 BF"], + 'U+10FFFD (HTML)' => [false, 0x10FFFD, "F4 8F BF BD"], + 'U+10FFFD (fatal)' => [true, 0x10FFFD, "F4 8F BF BD"], + 'U+FFFE (HTML)' => [false, 0xFFFE, "EF BF BE"], + 'U+FFFE (fatal)' => [true, 0xFFFE, "EF BF BE"], + '-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], + '0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)], ]; - foreach ($series as $name => $test) { - yield "$name (fatal)" => array_merge([true], $test); - yield "$name (HTML)" => array_merge([false], $test); - } } public function provideStrings() { diff --git a/tools/mktestbig5.html b/tools/mktestbig5.html deleted file mode 100644 index 034ff95..0000000 --- a/tools/mktestbig5.html +++ /dev/null @@ -1,62 +0,0 @@ - - - -

-
diff --git a/tools/mktestgbk.html b/tools/mktestgbk.html
deleted file mode 100644
index 03534dc..0000000
--- a/tools/mktestgbk.html
+++ /dev/null
@@ -1,121 +0,0 @@
-
-
-
-

-
diff --git a/tools/test-big5.html b/tools/test-big5.html
new file mode 100644
index 0000000..59d8869
--- /dev/null
+++ b/tools/test-big5.html
@@ -0,0 +1,14 @@
+
+
+
+
diff --git a/tools/test-gb18030.html b/tools/test-gb18030.html
new file mode 100644
index 0000000..9722f3d
--- /dev/null
+++ b/tools/test-gb18030.html
@@ -0,0 +1,76 @@
+
+
+
+
diff --git a/tools/test-gbk.html b/tools/test-gbk.html
new file mode 100644
index 0000000..2b02d4b
--- /dev/null
+++ b/tools/test-gbk.html
@@ -0,0 +1,16 @@
+
+
+
+
diff --git a/tools/test-utf16.html b/tools/test-utf16.html
new file mode 100644
index 0000000..75b7b5e
--- /dev/null
+++ b/tools/test-utf16.html
@@ -0,0 +1,20 @@
+
+
+
+
diff --git a/tools/test-utf8.html b/tools/test-utf8.html
new file mode 100644
index 0000000..06ba251
--- /dev/null
+++ b/tools/test-utf8.html
@@ -0,0 +1,70 @@
+
+
+
+
diff --git a/tools/test.js b/tools/test.js
new file mode 100644
index 0000000..2b65348
--- /dev/null
+++ b/tools/test.js
@@ -0,0 +1,151 @@
+"use strict";
+// Generates PHP test fixtures (sample strings, sample characters and seek data)
+// using the browser's own encoder and decoder as the reference implementation.
+
+// set out the output pre-formatted text element
+window.out = document.createElement("pre");
+document.documentElement.appendChild(out);
+
+// The encoding label is read from the page's <meta charset> declaration. Looking up
+// ".charset" on the collection returned by getElementsByTagName() is a named lookup
+// by element id or name, not an attribute read, so the attribute is fetched explicitly;
+// the document's own encoding is the fallback when no such element exists.
+var charsetMeta = document.querySelector("meta[charset]");
+var encoding = charsetMeta ? charsetMeta.getAttribute("charset") : document.characterSet;
+
+/**
+ * Encodes a single code point by letting the browser serialize it into the query
+ * string of a URL and reading the percent-encoded result back.
+ *
+ * NOTE(review): U+0023 and U+0026 are URL delimiters and presumably cannot be
+ * sampled reliably with this approach; confirm before adding them as samples.
+ *
+ * @param {number} code - code point to encode
+ * @param {boolean} fatal - when true an unencodable character yields the PHP
+ *     exception expression; when false it yields the browser's replacement text
+ * @returns {string[]|string} the encoded bytes as two-digit uppercase hexadecimal
+ *     strings, or a string (PHP exception expression or replacement text) otherwise
+ */
+function encodeCodePoint(code, fatal) {
+    if (code < 0 || code > 0x10FFFF) {
+        return 'new EncoderException("", Encoding::E_INVALID_CODE_POINT)';
+    }
+    var l = document.createElement("a");
+    l.href = "http://example.com/?" + String.fromCodePoint(code);
+    var bytes = [];
+    let url = l.search.substr(1);
+    for (let a = 0; a < url.length; a++) {
+        if (url.startsWith("%26%23", a) || url.charAt(a) == "&") {
+            // character cannot be encoded: the query holds a numeric character reference
+            if (fatal) {
+                return 'new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)';
+            }
+            return decodeURIComponent(url);
+        } else if (url.charAt(a) == "%") {
+            // percent-encoded byte: take the two hexadecimal digits and skip past them
+            bytes.push((url.charAt(a + 1) + url.charAt(a + 2)).toUpperCase());
+            a = a + 2;
+        } else {
+            // literal character: upper-cased so it matches the percent-encoded bytes
+            bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0").toUpperCase());
+        }
+    }
+    return bytes;
+}
+
+/**
+ * Formats the result of encodeCodePoint() as a PHP expression: a quoted string of
+ * space-separated bytes, a bin2hex() call around replacement text starting with
+ * "&", or the PHP exception expression unchanged.
+ *
+ * @param {number} code - code point to encode
+ * @param {boolean} fatal - passed through to encodeCodePoint()
+ * @returns {string} PHP source text for the expected value
+ */
+function wrapCodePoint(code, fatal) {
+    var encoded = encodeCodePoint(code, fatal);
+    if (Array.isArray(encoded)) {
+        return '"' + encoded.join(" ") + '"';
+    } else if (encoded.charAt(0) == "&") {
+        return 'bin2hex("' + encoded + '")';
+    }
+    return encoded;
+}
+
+// sample strings: decode each hexadecimal byte string and list the resulting code points
+if(typeof sampleStrings != 'undefined') {
+    var decoder = new TextDecoder(encoding);
+    for (let name in sampleStrings) {
+        let input = sampleStrings[name].replace(/\s/g, "");
+        let bytes = [];
+        for (let a = 0; a < input.length; a = a + 2) {
+            bytes.push(parseInt(input.substr(a, 2), 16));
+        }
+        let text = decoder.decode(new Uint8Array(bytes));
+        let codes = [];
+        for (let a = 0; a < text.length; a++) {
+            let point = text.codePointAt(a);
+            if (point >= 0xD800 && point <= 0xDFFF) {
+                // non-BMP characters have trailing low surrogates in JavaScript strings
+                continue;
+            }
+            codes.push(point);
+        }
+        let line = "'" + name + "' => [" + '"' + sampleStrings[name] + '", [' + codes.join(", ") + "]],\n";
+        out.appendChild(document.createTextNode(line));
+    }
+    out.appendChild(document.createTextNode("\n\n"));
+}
+
+// sample characters: encode each code point in both HTML and fatal modes
+if(typeof sampleCharacters != 'undefined') {
+    // "name" is declared with let so the loop does not assign to window.name
+    for (let name in sampleCharacters) {
+        let code = sampleCharacters[name];
+        // non-negative integers are printed in hexadecimal; anything else is printed as given
+        let label = (code > -1 && code % 1 == 0) ? "0x" + code.toString(16).toUpperCase() : code;
+        let line1 = "'" + name + " (HTML)'  => [false, " + label + ", " + wrapCodePoint(code, false) + "],\n";
+        let line2 = "'" + name + " (fatal)' => [true,  " + label + ", " + wrapCodePoint(code, true) + "],\n";
+        out.appendChild(document.createTextNode(line1));
+        out.appendChild(document.createTextNode(line2));
+    }
+    out.appendChild(document.createTextNode("\n\n"));
+}
+
+// seek data: encode a sequence of code points and record where each one starts
+if(typeof seekCodePoints != 'undefined') {
+    // first gather statistics on the encoding of the specified array of code points
+    var stats = [];
+    var offset = 0;
+    for (let code of seekCodePoints) {
+        let encoded = encodeCodePoint(code, true);
+        // an unencodable code point is recorded as a one-byte "()" placeholder
+        let known = Array.isArray(encoded);
+        let length = known ? encoded.length : 1;
+        stats.push({
+            'code': code,
+            'offset': offset,
+            'length': length,
+            'bytes': known ? encoded.join("").toUpperCase() : "()",
+        });
+        offset = offset + length;
+    }
+    var end = [stats.length, offset];
+    // summarize the statistics in a comment
+    var comment = "/*\n";
+    for (let a = 0; a < stats.length; a++) {
+        let length = (stats[a].length == 1) ? "(1 byte) " : "(" + stats[a].length + " bytes)";
+        comment = comment + "    Char " + a + " U+" + stats[a].code.toString(16).padStart(4, "0").padEnd(6, " ").toUpperCase() + " " + length + " Offset " + stats[a].offset + "\n";
+    }
+    comment = comment + "    End of string at char " + end[0] + ", offset " + end[1] + "\n";
+    comment = comment + "*/\n";
+    // build the encoded byte string, the array of code points and the array of offsets
+    var bytes = 'protected $seekString = "' + stats.map((char) => char.bytes).join(" ") + '";' + "\n";
+    var codes = 'protected $seekCodes = [' + stats.map((char) => "0x" + char.code.toString(16).toUpperCase()).join(", ") + "];\n";
+    var offs = 'protected $seekOffsets = [' + stats.map((char) => char.offset).join(", ") + "];\n";
+    // output the results
+    out.appendChild(document.createTextNode(comment));
+    out.appendChild(document.createTextNode(bytes));
+    out.appendChild(document.createTextNode(codes));
+    out.appendChild(document.createTextNode(offs));
+}