Browse Source

Refactor HTML-based test generators

span
J. King 6 years ago
parent
commit
55cbc915c3
  1. 1
      tests/cases/Encoding/TestBig5.php
  2. 89
      tests/cases/Encoding/TestGB18030.php
  3. 34
      tests/cases/Encoding/TestUTF8.php
  4. 62
      tools/mktestbig5.html
  5. 121
      tools/mktestgbk.html
  6. 14
      tools/test-big5.html
  7. 76
      tools/test-gb18030.html
  8. 16
      tools/test-gbk.html
  9. 20
      tools/test-utf16.html
  10. 70
      tools/test-utf8.html
  11. 141
      tools/test.js

1
tests/cases/Encoding/TestBig5.php

@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\Big5;
use MensBeam\Intl\Encoding\Encoding;
use MensBeam\Intl\Encoding\EncoderException;
class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {

89
tests/cases/Encoding/TestGB18030.php

@ -8,6 +8,7 @@ namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\GBK;
use MensBeam\Intl\Encoding\GB18030;
use MensBeam\Intl\Encoding\Encoding;
use MensBeam\Intl\Encoding\EncoderException;
class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
@ -136,48 +137,53 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
public function provideCodePoints() {
// bytes confirmed using Firefox
$series = [
"GBK ASCII (fatal)" => [GBK::class, true, 0x64, "64"],
"GBK 0x20AC (fatal)" => [GBK::class, true, 0x20AC, "80"],
"GBK 0x2164 (fatal)" => [GBK::class, true, 0x2164, "A2 F5"],
"GBK 0x3A74 (fatal)" => [GBK::class, true, 0x3A74, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0xE7C7 (fatal)" => [GBK::class, true, 0xE7C7, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0x1D11E (fatal)" => [GBK::class, true, 0x1D11E, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0xE5E5 (fatal)" => [GBK::class, true, 0xE5E5, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK -1 (fatal)" => [GBK::class, true, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GBK 0x110000 (fatal)" => [GBK::class, true, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GB18030 ASCII (fatal)" => [GB18030::class, true, 0x64, "64"],
"GB18030 0x20AC (fatal)" => [GB18030::class, true, 0x20AC, "A2 E3"],
"GB18030 0x2164 (fatal)" => [GB18030::class, true, 0x2164, "A2 F5"],
"GB18030 0x3A74 (fatal)" => [GB18030::class, true, 0x3A74, "82 31 97 30"],
"GB18030 0xE7C7 (fatal)" => [GB18030::class, true, 0xE7C7, "81 35 F4 37"],
"GB18030 0x1D11E (fatal)" => [GB18030::class, true, 0x1D11E, "94 32 BE 34"],
"GB18030 0xE5E5 (fatal)" => [GB18030::class, true, 0xE5E5, new EncoderException("", GB18030::E_UNAVAILABLE_CODE_POINT)],
"GB18030 -1 (fatal)" => [GB18030::class, true, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
"GB18030 0x110000 (fatal)" => [GB18030::class, true, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
"GBK ASCII (HTML)" => [GBK::class, false, 0x64, "64"],
"GBK 0x20AC (HTML)" => [GBK::class, false, 0x20AC, "80"],
"GBK 0x2164 (HTML)" => [GBK::class, false, 0x2164, "A2 F5"],
"GBK 0x3A74 (HTML)" => [GBK::class, false, 0x3A74, bin2hex("&#".(0x3A74).";")],
"GBK 0xE7C7 (HTML)" => [GBK::class, false, 0xE7C7, bin2hex("&#".(0xE7C7).";")],
"GBK 0x1D11E (HTML)" => [GBK::class, false, 0x1D11E, bin2hex("&#".(0x1D11E).";")],
"GBK 0xE5E5 (HTML)" => [GBK::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
"GBK -1 (HTML)" => [GBK::class, false, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GBK 0x110000 (HTML)" => [GBK::class, false, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GB18030 ASCII (HTML)" => [GB18030::class, false, 0x64, "64"],
"GB18030 0x20AC (HTML)" => [GB18030::class, false, 0x20AC, "A2 E3"],
"GB18030 0x2164 (HTML)" => [GB18030::class, false, 0x2164, "A2 F5"],
"GB18030 0x3A74 (HTML)" => [GB18030::class, false, 0x3A74, "82 31 97 30"],
"GB18030 0xE7C7 (HTML)" => [GB18030::class, false, 0xE7C7, "81 35 F4 37"],
"GB18030 0x1D11E (HTML)" => [GB18030::class, false, 0x1D11E, "94 32 BE 34"],
"GB18030 0xE5E5 (HTML)" => [GB18030::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
"GB18030 -1 (HTML)" => [GB18030::class, false, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
"GB18030 0x110000 (HTML)" => [GB18030::class, false, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
$series_gb18030 = [
'U+0064 (HTML)' => [false, 0x64, "64"],
'U+0064 (fatal)' => [true, 0x64, "64"],
'U+20AC (HTML)' => [false, 0x20AC, "A2 E3"],
'U+20AC (fatal)' => [true, 0x20AC, "A2 E3"],
'U+2164 (HTML)' => [false, 0x2164, "A2 F5"],
'U+2164 (fatal)' => [true, 0x2164, "A2 F5"],
'U+3A74 (HTML)' => [false, 0x3A74, "82 31 97 30"],
'U+3A74 (fatal)' => [true, 0x3A74, "82 31 97 30"],
'U+E7C7 (HTML)' => [false, 0xE7C7, "81 35 F4 37"],
'U+E7C7 (fatal)' => [true, 0xE7C7, "81 35 F4 37"],
'U+1D11E (HTML)' => [false, 0x1D11E, "94 32 BE 34"],
'U+1D11E (fatal)' => [true, 0x1D11E, "94 32 BE 34"],
'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")],
'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
];
$series_gbk = [
'U+0064 (HTML)' => [false, 0x64, "64"],
'U+0064 (fatal)' => [true, 0x64, "64"],
'U+20AC (HTML)' => [false, 0x20AC, "80"],
'U+20AC (fatal)' => [true, 0x20AC, "80"],
'U+2164 (HTML)' => [false, 0x2164, "A2 F5"],
'U+2164 (fatal)' => [true, 0x2164, "A2 F5"],
'U+3A74 (HTML)' => [false, 0x3A74, bin2hex("㩴")],
'U+3A74 (fatal)' => [true, 0x3A74, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+E7C7 (HTML)' => [false, 0xE7C7, bin2hex("")],
'U+E7C7 (fatal)' => [true, 0xE7C7, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+1D11E (HTML)' => [false, 0x1D11E, bin2hex("𝄞")],
'U+1D11E (fatal)' => [true, 0x1D11E, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'U+E5E5 (HTML)' => [false, 0xE5E5, bin2hex("")],
'U+E5E5 (fatal)' => [true, 0xE5E5, new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'U+110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'U+110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
];
foreach ($series as $name => $test) {
$class = array_shift($test);
array_push($test, $class);
yield $name => $test;
foreach ($series_gb18030 as $name => $test) {
array_push($test, GB18030::class);
yield "gb18030 $name" => $test;
}
foreach ($series_gbk as $name => $test) {
array_push($test, GBK::class);
yield "GBK $name" => $test;
}
}
@ -232,7 +238,6 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
'seek test 6 (padded)' => ["00 00 00 00 81 30 81 81 00 00 00 00", [0, 0, 0, 0, 65533, 48, 20118, 0, 0, 0, 0]],
'seek test 7 (padded)' => ["00 00 00 00 30 30 81 81 00 00 00 00", [0, 0, 0, 0, 48, 48, 20118, 0, 0, 0, 0]],
'seek test 8 (padded)' => ["00 00 00 00 F8 83 FE 80 00 00 00 00", [0, 0, 0, 0, 40229, 18211, 0, 0, 0, 0]],
];
}

34
tests/cases/Encoding/TestUTF8.php

@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF8;
use MensBeam\Intl\Encoding\Encoding;
use MensBeam\Intl\Encoding\EncoderException;
class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
@ -127,21 +128,26 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
}
public function provideCodePoints() {
$series = [
"122" => [122, "7A"],
"162" => [162, "C2 A2"],
"27700" => [27700, "E6 B0 B4"],
"119070" => [119070, "F0 9D 84 9E"],
"63743" => [63743, "EF A3 BF"],
"1114109" => [1114109, "F4 8F BF BD"],
"65534" => [65534, "EF BF BE"],
"-1" => [-1, new EncoderException("", UTF8::E_INVALID_CODE_POINT)],
"1114112" => [1114112, new EncoderException("", UTF8::E_INVALID_CODE_POINT)],
return [
'U+007A (HTML)' => [false, 0x7A, "7A"],
'U+007A (fatal)' => [true, 0x7A, "7A"],
'U+00A2 (HTML)' => [false, 0xA2, "C2 A2"],
'U+00A2 (fatal)' => [true, 0xA2, "C2 A2"],
'U+6C34 (HTML)' => [false, 0x6C34, "E6 B0 B4"],
'U+6C34 (fatal)' => [true, 0x6C34, "E6 B0 B4"],
'U+1D11E (HTML)' => [false, 0x1D11E, "F0 9D 84 9E"],
'U+1D11E (fatal)' => [true, 0x1D11E, "F0 9D 84 9E"],
'U+F8FF (HTML)' => [false, 0xF8FF, "EF A3 BF"],
'U+F8FF (fatal)' => [true, 0xF8FF, "EF A3 BF"],
'U+10FFFD (HTML)' => [false, 0x10FFFD, "F4 8F BF BD"],
'U+10FFFD (fatal)' => [true, 0x10FFFD, "F4 8F BF BD"],
'U+FFFE (HTML)' => [false, 0xFFFE, "EF BF BE"],
'U+FFFE (fatal)' => [true, 0xFFFE, "EF BF BE"],
'-1 (HTML)' => [false, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'-1 (fatal)' => [true, -1, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
'0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Encoding::E_INVALID_CODE_POINT)],
];
foreach ($series as $name => $test) {
yield "$name (fatal)" => array_merge([true], $test);
yield "$name (HTML)" => array_merge([false], $test);
}
}
public function provideStrings() {

62
tools/mktestbig5.html

@ -1,62 +0,0 @@
<!DOCTYPE html>
<meta charset=big5>
<!-- Correct results are provided by Firefox -->
<pre style="font-family: 'Consolas', monospace;"></pre>
<script>
"use strict";
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+00A2 (2 bytes) Offset 1
Char 2 U+6C34 (3 bytes) Offset 3
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (3 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 13
Char 6 U+FFFE (3 bytes) Offset 17
End of string at char 7, offset 20
*/
// For each sample code point, place the character in the query part of a URL,
// read back what the browser serialized, and print the resulting bytes as one
// line of space-separated upper-case hexadecimal.
for (const sample of [0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]) {
    const anchor = document.createElement("a");
    anchor.href = "http://example.com/?" + String.fromCodePoint(sample);
    const query = anchor.search.slice(1);
    const hex = [];
    let pos = 0;
    while (pos < query.length) {
        if (query.charAt(pos) == "%") {
            // a percent-encoded byte: take the two characters after the "%"
            hex.push(query.charAt(pos + 1) + query.charAt(pos + 2));
            pos += 3;
        } else {
            // a literal character: use its character code as the byte value
            hex.push(query.charCodeAt(pos).toString(16).padStart(2, "0"));
            pos += 1;
        }
    }
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(hex.join(" ").toUpperCase() + "\n"));
}
// blank lines separate this section of output from the next
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n"));
// Decode every two-byte combination in the tested lead/trail ranges and print
// one line per pair, holding the input bytes and the decoded code points.
var dec = new TextDecoder("big5");
for (let lead = 0x87; lead < 0xFF; lead++) {
    for (let trail = 0x40; trail < 0xFF; trail++) {
        // trail bytes 0x7F through 0xA0 are not tried
        if (trail == 0x7F) trail = 0xA1;
        const pair = [lead, trail].map(function(octet) {
            return octet.toString(16).padStart(2, "0").toUpperCase();
        }).join(" ");
        const decoded = dec.decode(new Uint8Array([lead, trail]));
        const points = [];
        for (let index = 0; index < decoded.length; index++) {
            const point = decoded.codePointAt(index);
            // values in the surrogate range are the second halves of non-BMP
            // characters in JavaScript strings and are not reported separately
            if (point < 55296 || point > 57343) {
                points.push(point);
            }
        }
        var line = '["' + pair + '", [' + points.join(", ") + "]],\n";
        document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
    }
}
</script>

121
tools/mktestgbk.html

@ -1,121 +0,0 @@
<!DOCTYPE html>
<meta charset=gb18030>
<!-- Correct results are provided by Firefox -->
<pre style="font-family: 'Consolas', monospace;"></pre>
<script>
"use strict";
// Samples to decode. Each entry supplies the encoding label passed to
// TextDecoder, the input as an array of byte values, and the name under which
// the resulting line is printed.
var data = [
{ encoding: 'gb18030', input: [], name: 'empty string' },
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' },
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' },
{ encoding: 'gb18030', input: [0xA8, 0x4E], name: 'two-byte character' },
{ encoding: 'gb18030', input: [0x82, 0x31, 0xA2, 0x37], name: 'four-byte character' },
{ encoding: 'gb18030', input: [0x82], name: 'EOF after first byte' },
{ encoding: 'gb18030', input: [0x82, 0x30], name: 'EOF after second byte' },
{ encoding: 'gb18030', input: [0x82, 0x30, 0x81], name: 'EOF after third byte' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37], name: 'bad first byte' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37], name: 'bad second byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37], name: 'bad third byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF], name: 'bad fourth byte' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37], name: 'control first byte' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' },
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte (padded)' },
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x30], name: 'seek test 1' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x80], name: 'seek test 2' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x00], name: 'seek test 3' },
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x00], name: 'seek test 4' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x30, 0x30], name: 'seek test 5' },
{ encoding: 'gb18030', input: [0x81, 0x30, 0x81, 0x81], name: 'seek test 6' },
{ encoding: 'gb18030', input: [0x30, 0x30, 0x81, 0x81], name: 'seek test 7' },
{ encoding: 'gb18030', input: [0xF8, 0x83, 0xFE, 0x80], name: 'seek test 8' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 1 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 2 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 3 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 4 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 5 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 6 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 7 (padded)' },
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0xF8, 0x83, 0xFE, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 8 (padded)' },
];
// Decode each sample and print a line holding its name, its input bytes in
// upper-case hexadecimal, and the code points the decoder produced.
for (const sample of data) {
    const hex = sample.input.map(function(octet) {
        return octet.toString(16).padStart(2, "0").toUpperCase();
    }).join(" ");
    const decoded = new TextDecoder(sample.encoding).decode(new Uint8Array(sample.input));
    const points = [];
    for (let index = 0; index < decoded.length; index++) {
        const point = decoded.codePointAt(index);
        // values in the surrogate range are the second halves of non-BMP
        // characters in JavaScript strings and are not reported separately
        if (point < 55296 || point > 57343) {
            points.push(point);
        }
    }
    const line = "'" + sample.name + "' => [\"" + hex + '", [' + points.join(", ") + "]],\n";
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
}
// blank lines separate this section of output from the next
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n"));
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+00A2 (2 bytes) Offset 1
Char 2 U+6C34 (3 bytes) Offset 3
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (3 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 13
Char 6 U+FFFE (3 bytes) Offset 17
End of string at char 7, offset 20
*/
// For each sample code point, place the character in the query part of a URL,
// read back what the browser serialized, and print the resulting bytes as one
// line of space-separated upper-case hexadecimal.
for (const sample of [0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]) {
    const anchor = document.createElement("a");
    anchor.href = "http://example.com/?" + String.fromCodePoint(sample);
    const query = anchor.search.slice(1);
    const hex = [];
    let pos = 0;
    while (pos < query.length) {
        if (query.charAt(pos) == "%") {
            // a percent-encoded byte: take the two characters after the "%"
            hex.push(query.charAt(pos + 1) + query.charAt(pos + 2));
            pos += 3;
        } else {
            // a literal character: use its character code as the byte value
            hex.push(query.charCodeAt(pos).toString(16).padStart(2, "0"));
            pos += 1;
        }
    }
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(hex.join(" ").toUpperCase() + "\n"));
}
// blank lines separate this section of output from the next
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n"));
// Same URL-based encoding as above, but each output line is prefixed with the
// code point itself so that the bytes can be matched to their input.
for (const sample of [0x64, 0x20AC, 0x2164, 0x3A74, 0xE7C7, 0x1D11E]) {
    const anchor = document.createElement("a");
    anchor.href = "http://example.com/?" + String.fromCodePoint(sample);
    const query = anchor.search.slice(1);
    const hex = [];
    let pos = 0;
    while (pos < query.length) {
        if (query.charAt(pos) == "%") {
            // a percent-encoded byte: take the two characters after the "%"
            hex.push(query.charAt(pos + 1) + query.charAt(pos + 2));
            pos += 3;
        } else {
            // a literal character: use its character code as the byte value
            hex.push(query.charCodeAt(pos).toString(16).padStart(2, "0"));
            pos += 1;
        }
    }
    const prefix = "0x" + sample.toString(16).toUpperCase() + ", ";
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(prefix + hex.join(" ").toUpperCase() + "\n"));
}
</script>

14
tools/test-big5.html

@ -0,0 +1,14 @@
<!DOCTYPE html>
<meta charset=big5>
<!--
  Data page for the shared test.js generator, which is loaded by the final
  script element and processes whichever of these variables are defined:
  sampleStrings (hexadecimal byte strings to decode), sampleCharacters (code
  points to encode) and seekCodePoints (code points used to build a seek
  string). The character and seek lists are still empty on this page.
-->
<script>
var sampleStrings = {
'empty string': "",
// valid single characters
'sanity check': "40",
};
var sampleCharacters = {
};
var seekCodePoints = [
];
</script>
<script src="test.js"></script>

76
tools/test-gb18030.html

@ -0,0 +1,76 @@
<!DOCTYPE html>
<meta charset=gb18030>
<!--
  Data page for the shared test.js generator, which is loaded by the final
  script element and processes whichever of these variables are defined:
  sampleStrings (hexadecimal byte strings to decode), sampleCharacters (code
  points to encode) and seekCodePoints (code points used to build a seek
  string).
-->
<script>
var sampleStrings = {
'empty string': "",
// valid single characters
'sanity check': "40",
'special case for 0x80': "80",
'four-byte special case': "81 35 F4 37",
'two-byte character': "A8 4E",
'four-byte character': "82 31 A2 37",
// cut sequences
'EOF after first byte': "82",
'EOF after second byte': "82 30",
'EOF after third byte': "82 30 81",
// invalid sequences
'bad first byte': "FF 35 F4 37",
'bad second byte': "81 FF F4 37",
'bad third byte': "81 35 FF 37",
'bad fourth byte': "81 35 F4 FF",
'control first byte': "00 35 F4 37",
'control second byte': "81 00 F4 37",
'control third byte': "81 35 00 37",
'control fourth byte': "81 35 F4 00",
// invalid sequences with clean EOF
'bad first byte (padded)': "FF 35 F4 37 00 00 00 00",
'bad second byte (padded)': "81 FF F4 37 00 00 00 00",
'bad third byte (padded)': "81 35 FF 37 00 00 00 00",
'bad fourth byte (padded)': "81 35 F4 FF 00 00 00 00",
'control first byte (padded)': "00 35 F4 37 00 00 00 00",
'control second byte (padded)': "81 00 F4 37 00 00 00 00",
'control third byte (padded)': "81 35 00 37 00 00 00 00",
'control fourth byte (padded)': "81 35 F4 00 00 00 00 00",
// out-of-range sequences
'void sequence': "84 32 A4 39",
'void sequence 2': "FE 39 FE 39",
// backward seeking tests
'seek test 1': "81 81 81 30",
'seek test 2': "81 81 80",
'seek test 3': "81 81 00",
'seek test 4': "81 81 81 00",
'seek test 5': "81 30 30 30",
'seek test 6': "81 30 81 81",
'seek test 7': "30 30 81 81",
'seek test 8': "F8 83 FE 80",
'seek test 1 (padded)': "00 00 00 00 81 81 81 30 00 00 00 00",
'seek test 2 (padded)': "00 00 00 00 81 81 80 00 00 00 00",
'seek test 3 (padded)': "00 00 00 00 81 81 00 00 00 00 00",
'seek test 4 (padded)': "00 00 00 00 81 81 81 00 00 00 00 00",
'seek test 5 (padded)': "00 00 00 00 81 30 30 30 00 00 00 00",
'seek test 6 (padded)': "00 00 00 00 81 30 81 81 00 00 00 00",
'seek test 7 (padded)': "00 00 00 00 30 30 81 81 00 00 00 00",
'seek test 8 (padded)': "00 00 00 00 F8 83 FE 80 00 00 00 00",
};
var sampleCharacters = {
'U+0064': 0x64,
'U+20AC': 0x20AC,
'U+2164': 0x2164,
'U+3A74': 0x3A74,
'U+E7C7': 0xE7C7,
'U+1D11E': 0x1D11E,
'U+E5E5': 0xE5E5,
'-1': -1,
'0x110000': 0x110000,
};
var seekCodePoints = [
0x007A,
0x00A2,
0x6C34,
0x1D11E,
0xF8FF,
0x10FFFD,
0xFFFE,
];
</script>
<script src="test.js"></script>

16
tools/test-gbk.html

@ -0,0 +1,16 @@
<!DOCTYPE html>
<meta charset=gbk>
<!--
  Data page for the shared test.js generator, which is loaded by the final
  script element. Only sampleCharacters (code points to encode) is defined
  here, so the generator's string-decoding and seek sections are not run.
-->
<script>
var sampleCharacters = {
'U+0064': 0x64,
'U+20AC': 0x20AC,
'U+2164': 0x2164,
'U+3A74': 0x3A74,
'U+E7C7': 0xE7C7,
'U+1D11E': 0x1D11E,
'U+E5E5': 0xE5E5,
'-1': -1,
'0x110000': 0x110000,
};
</script>
<script src="test.js"></script>

20
tools/test-utf16.html

@ -0,0 +1,20 @@
<!DOCTYPE html>
<meta charset=utf-16>
<!--
  Data page for the shared test.js generator, which is loaded by the final
  script element. Only sampleStrings is defined here. The samples are written
  in groups of four hexadecimal digits for readability; the generator removes
  all whitespace and reads the digits two at a time, so each group is two bytes
  in the order written.
-->
<script>
var sampleStrings = {
// control samples
'empty string': "",
'sanity check': "6100 6200 6300 3100 3200 3300",
'mixed sample': "7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF",
// unexpected EOF
'EOF in BMP character': "0000 FF",
'EOF after lead surrogate': "0000 34D8",
'EOF in trail surrogate': "0000 34D8 1E",
// invalid UTF-16 surrogates
'lead surrogate without trail': "34D8 0000",
'trail surrogate without lead': "1EDD 0000",
'double lead surrogate': "34D8 34D8 1EDD",
'double trail surrogate': "34D8 1EDD 1EDD",
};
</script>
<script src="test.js"></script>

70
tools/test-utf8.html

@ -0,0 +1,70 @@
<!DOCTYPE html>
<meta charset=utf-8>
<!--
  Data page for the shared test.js generator, which is loaded by the final
  script element and processes whichever of these variables are defined:
  sampleStrings (hexadecimal byte strings to decode), sampleCharacters (code
  points to encode) and seekCodePoints (code points used to build a seek
  string).
-->
<script>
var sampleStrings = {
// control samples
'empty string': "",
'sanity check': "61 62 63 31 32 33",
'multibyte control': "E5 8F A4 E6 B1 A0 E3 82 84 E8 9B 99 E9 A3 9B E3 81 B3 E8 BE BC E3 82 80 E6 B0 B4 E3 81 AE E9 9F B3",
'mixed sample': "7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE",
// various invalid sequences
'invalid code': "FF",
'ends early': "C0",
'ends early 2': "E0",
'invalid trail': "C0 00",
'invalid trail 2': "C0 C0",
'invalid trail 3': "E0 00",
'invalid trail 4': "E0 C0",
'invalid trail 5': "E0 80 00",
'invalid trail 6': "E0 80 C0",
'> 0x10FFFF': "FC 80 80 80 80 80",
'obsolete lead byte': "FE 80 80 80 80 80",
'overlong U+0000 - 2 bytes': "C0 80",
'overlong U+0000 - 3 bytes': "E0 80 80",
'overlong U+0000 - 4 bytes': "F0 80 80 80",
'overlong U+0000 - 5 bytes': "F8 80 80 80 80",
'overlong U+0000 - 6 bytes': "FC 80 80 80 80 80",
'overlong U+007F - 2 bytes': "C1 BF",
'overlong U+007F - 3 bytes': "E0 81 BF",
'overlong U+007F - 4 bytes': "F0 80 81 BF",
'overlong U+007F - 5 bytes': "F8 80 80 81 BF",
'overlong U+007F - 6 bytes': "FC 80 80 80 81 BF",
'overlong U+07FF - 3 bytes': "E0 9F BF",
'overlong U+07FF - 4 bytes': "F0 80 9F BF",
'overlong U+07FF - 5 bytes': "F8 80 80 9F BF",
'overlong U+07FF - 6 bytes': "FC 80 80 80 9F BF",
'overlong U+FFFF - 4 bytes': "F0 8F BF BF",
'overlong U+FFFF - 5 bytes': "F8 80 8F BF BF",
'overlong U+FFFF - 6 bytes': "FC 80 80 8F BF BF",
'overlong U+10FFFF - 5 bytes': "F8 84 8F BF BF",
'overlong U+10FFFF - 6 bytes': "FC 80 84 8F BF BF",
// UTF-16 surrogates
'lead surrogate': "ED A0 80",
'trail surrogate': "ED B0 80",
'surrogate pair': "ED A0 80 ED B0 80",
// self-sync edge cases
'trailing continuation': "0A 80 80",
'trailing continuation 2': "E5 8F A4 80",
};
var sampleCharacters = {
'U+007A': 0x007A,
'U+00A2': 0x00A2,
'U+6C34': 0x6C34,
'U+1D11E': 0x1D11E,
'U+F8FF': 0xF8FF,
'U+10FFFD': 0x10FFFD,
'U+FFFE': 0xFFFE,
'-1': -1,
'0x110000': 0x110000,
};
var seekCodePoints = [
0x007A,
0x00A2,
0x6C34,
0x1D11E,
0xF8FF,
0x10FFFD,
0xFFFE,
];
</script>
<script src="test.js"></script>

141
tools/test.js

@ -0,0 +1,141 @@
"use strict";
// Create the pre-formatted text element which receives all generated output,
// and expose it globally so that the sections below can append to it.
window.out = document.createElement("pre");
document.documentElement.appendChild(out);
// Determine the encoding label declared by the page's <meta charset> element.
// getElementsByTagName() returns a collection of elements, and the collection
// itself has no "charset" property, so the attribute has to be read from the
// element. Reading it from the collection yields undefined, which makes
// TextDecoder fall back to UTF-8 regardless of what the page declares.
// If no such element exists, the document's effective encoding is used.
var encoding = (function() {
    var meta = document.querySelector("meta[charset]");
    return meta ? meta.getAttribute("charset") : document.characterSet;
})();
/**
 * Encodes a single code point by placing the character in the query part of a
 * URL and reading back the bytes the browser serialized for it.
 *
 * Byte values are always reported as two upper-case hexadecimal digits,
 * whether the browser percent-encoded the byte or passed it through as a
 * literal ASCII character.
 *
 * @param {number} code - The code point to encode
 * @param {boolean} fatal - Selects what is returned for a character the
 *     browser could not encode: PHP source constructing an EncoderException
 *     when true, or the percent-decoded query text when false (expected to be
 *     the "&#...;" reference the browser substituted for the character)
 * @returns {string[]|string} An array of hexadecimal byte values on success;
 *     otherwise a string as described above. Code points below zero or above
 *     0x10FFFF always yield PHP source for an invalid-code-point exception.
 */
function encodeCodePoint(code, fatal) {
    if (code < 0 || code > 0x10FFFF) {
        return 'new EncoderException("", Encoding::E_INVALID_CODE_POINT)';
    }
    var l = document.createElement("a");
    l.href = "http://example.com/?" + String.fromCodePoint(code);
    var bytes = [];
    let url = l.search.substr(1);
    for (let a = 0; a < url.length; a++) {
        if ((url.charAt(a) == "%" && url.substr(a, 6) == "%26%23") || url.charAt(a) == "&") {
            // character cannot be encoded: the query holds "&#" (possibly
            // percent-encoded) instead of encoded bytes
            if (fatal) {
                return 'new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)';
            } else {
                return decodeURIComponent(url);
            }
        } else if (url.charAt(a) == "%") {
            // percent-encoded byte: the two characters after "%" are its hex digits
            bytes.push((url.charAt(a + 1) + url.charAt(a + 2)).toUpperCase());
            a = a + 2;
        } else {
            // literal character: its character code is the byte value; normalize
            // to upper case so that all bytes are reported in the same form
            bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0").toUpperCase());
        }
    }
    return bytes;
}
/**
 * Formats the result of encoding a code point as a PHP expression.
 *
 * @param {number} code - The code point to encode
 * @param {boolean} fatal - Passed through to encodeCodePoint()
 * @returns {string} A double-quoted string of space-separated bytes when the
 *     code point was encoded; a bin2hex("...") call when the result is text
 *     starting with "&"; otherwise the encoder's result unchanged
 */
function wrapCodePoint(code, fatal) {
    const encoded = encodeCodePoint(code, fatal);
    if (!Array.isArray(encoded)) {
        return (encoded.charAt(0) == "&") ? 'bin2hex("' + encoded + '")' : encoded;
    }
    return '"' + encoded.join(" ") + '"';
}
// Decoding section: runs only when the including page defines sampleStrings.
// Each sample is decoded and printed as a line holding its name, its bytes as
// originally written, and the code points the decoder produced.
if (typeof sampleStrings != 'undefined') {
    var decoder = new TextDecoder(encoding);
    for (const label in sampleStrings) {
        const written = sampleStrings[label];
        // drop all whitespace, then read the hexadecimal digits two at a time
        const digits = written.replace(/\s/g, "");
        const octets = [];
        let pos = 0;
        while (pos < digits.length) {
            octets.push(parseInt(digits.slice(pos, pos + 2), 16));
            pos += 2;
        }
        const decoded = decoder.decode(new Uint8Array(octets));
        const points = [];
        for (let index = 0; index < decoded.length; index++) {
            const point = decoded.codePointAt(index);
            // values in the surrogate range are the second halves of non-BMP
            // characters in JavaScript strings and are not reported separately
            if (point < 55296 || point > 57343) {
                points.push(point);
            }
        }
        const line = "'" + label + "' => [\"" + written + '", [' + points.join(", ") + "]],\n";
        out.appendChild(document.createTextNode(line));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
// Encoding section: runs only when the including page defines
// sampleCharacters. Two lines are printed per code point: one for HTML mode
// (fatal = false) and one for fatal mode (fatal = true).
if (typeof sampleCharacters != 'undefined') {
    // The loop variable is declared explicitly: an undeclared "name" at script
    // scope resolves to window.name and would overwrite that property.
    for (let name in sampleCharacters) {
        let code = sampleCharacters[name];
        // Non-negative integers are printed in hexadecimal; anything else is
        // printed as-is. The numeric value itself is what gets encoded.
        let label = (code > -1 && code % 1 == 0) ? "0x" + code.toString(16).toUpperCase() : code;
        let line1 = "'" + name + " (HTML)' => [false, " + label + ", " + wrapCodePoint(code, false) + "],\n";
        let line2 = "'" + name + " (fatal)' => [true, " + label + ", " + wrapCodePoint(code, true) + "],\n";
        out.appendChild(document.createTextNode(line1));
        out.appendChild(document.createTextNode(line2));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
// Seek section: runs only when the including page defines seekCodePoints.
// It prints a summary comment followed by three PHP property declarations:
// the encoded string, the code points, and the byte offset of each character.
if (typeof seekCodePoints != 'undefined') {
    // measure every code point: the offset at which it starts and how many bytes it takes
    var stats = [];
    var offset = 0;
    for (const code of seekCodePoints) {
        const encoded = encodeCodePoint(code, true);
        const encodable = Array.isArray(encoded);
        // a code point which could not be encoded is counted as a single byte
        // and shown with a "()" placeholder
        const size = encodable ? encoded.length : 1;
        stats.push({
            'code': code,
            'offset': offset,
            'length': size,
            'bytes': encodable ? encoded.join("").toUpperCase() : "()",
        });
        offset += size;
    }
    var end = [stats.length, offset];
    // describe each character in a comment, one line per character
    var comment = "/*\n" + stats.map(function(entry, index) {
        const size = (entry.length == 1) ? "(1 byte) " : "(" + entry.length + " bytes)";
        const hex = entry.code.toString(16).padStart(4, "0").padEnd(6, " ").toUpperCase();
        return " Char " + index + " U+" + hex + " " + size + " Offset " + entry.offset + "\n";
    }).join("");
    comment = comment + " End of string at char " + end[0] + ", offset " + end[1] + "\n" + "*/\n";
    // the encoded string, with the bytes of each character grouped together
    var bytes = 'protected $seekString = "' + stats.map(function(entry) {
        return entry.bytes;
    }).join(" ") + '";' + "\n";
    // the code points, in hexadecimal
    var codes = 'protected $seekCodes = [' + stats.map(function(entry) {
        return "0x" + entry.code.toString(16).toUpperCase();
    }).join(", ") + "];\n";
    // the byte offset at which each character starts
    var offs = 'protected $seekOffsets = [' + stats.map(function(entry) {
        return entry.offset;
    }).join(", ") + "];\n";
    // output the results
    for (const text of [comment, bytes, codes, offs]) {
        out.appendChild(document.createTextNode(text));
    }
}
Loading…
Cancel
Save