J. King
6 years ago
11 changed files with 405 additions and 239 deletions
@ -1,62 +0,0 @@ |
|||||
<!DOCTYPE html> |
|
||||
<meta charset=big5> |
|
||||
<!-- Correct results are provided by Firefox --> |
|
||||
<pre style="font-family: 'Consolas', monospace;"></pre> |
|
||||
<script> |
|
||||
"use strict"; |
|
||||
/* |
|
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
[0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE].forEach(function(code) { |
|
||||
var l = document.createElement("a"); |
|
||||
l.href = "http://example.com/?" + String.fromCodePoint(code); |
|
||||
var url = l.search.substr(1); |
|
||||
var bytes = []; |
|
||||
for (let a = 0; a < url.length; a++) { |
|
||||
if (url.charAt(a) == "%") { |
|
||||
bytes.push(url.charAt(a + 1) + url.charAt(a + 2)); |
|
||||
a = a + 2; |
|
||||
} else { |
|
||||
bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0")); |
|
||||
} |
|
||||
} |
|
||||
var line = bytes.join(" ").toUpperCase() + "\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
|
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n")); |
|
||||
|
|
||||
var dec = new TextDecoder("big5"); |
|
||||
for (let lead = 0x87; lead < 0xFF; lead++) { |
|
||||
for (let trail = 0x40; trail < 0xFF; trail++) { |
|
||||
if (trail == 0x7F) trail = 0xA1; |
|
||||
let bytes = []; |
|
||||
bytes.push(lead.toString(16).padStart(2, "0").toUpperCase()); |
|
||||
bytes.push(trail.toString(16).padStart(2, "0").toUpperCase()); |
|
||||
let codes = []; |
|
||||
let text = dec.decode(new Uint8Array([lead, trail])); |
|
||||
for (let a = 0; a < text.length; a++) { |
|
||||
let point = text.codePointAt(a); |
|
||||
if (point >= 55296 && point <= 57343) { |
|
||||
// non-BMP characters have trailing low surrogates in JavaScript strings |
|
||||
continue; |
|
||||
} |
|
||||
codes.push(point); |
|
||||
} |
|
||||
if (codes.length == 1) { |
|
||||
//continue; |
|
||||
} |
|
||||
bytes = bytes.join(" "); |
|
||||
codes = codes.join(", "); |
|
||||
var line = "[" + '"' + bytes + '", [' + codes + "]],\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
} |
|
||||
} |
|
||||
</script> |
|
@ -1,121 +0,0 @@ |
|||||
<!DOCTYPE html> |
|
||||
<meta charset=gb18030> |
|
||||
<!-- Correct results are provided by Firefox --> |
|
||||
<pre style="font-family: 'Consolas', monospace;"></pre> |
|
||||
<script> |
|
||||
"use strict"; |
|
||||
var data = [ |
|
||||
{ encoding: 'gb18030', input: [], name: 'empty string' }, |
|
||||
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' }, |
|
||||
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' }, |
|
||||
{ encoding: 'gb18030', input: [0xA8, 0x4E], name: 'two-byte character' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x31, 0xA2, 0x37], name: 'four-byte character' }, |
|
||||
{ encoding: 'gb18030', input: [0x82], name: 'EOF after first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x30], name: 'EOF after second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x30, 0x81], name: 'EOF after third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37], name: 'bad first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37], name: 'bad second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37], name: 'bad third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF], name: 'bad fourth byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37], name: 'control first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' }, |
|
||||
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' }, |
|
||||
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x30], name: 'seek test 1' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x80], name: 'seek test 2' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x00], name: 'seek test 3' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x00], name: 'seek test 4' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x30, 0x30, 0x30], name: 'seek test 5' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x30, 0x81, 0x81], name: 'seek test 6' }, |
|
||||
{ encoding: 'gb18030', input: [0x30, 0x30, 0x81, 0x81], name: 'seek test 7' }, |
|
||||
{ encoding: 'gb18030', input: [0xF8, 0x83, 0xFE, 0x80], name: 'seek test 8' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 1 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 2 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 3 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 4 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 5 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 6 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 7 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0xF8, 0x83, 0xFE, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 8 (padded)' }, |
|
||||
]; |
|
||||
data.forEach(function(data) { |
|
||||
var bytes = []; |
|
||||
data.input.forEach((p) => { |
|
||||
bytes.push(p.toString(16).padStart(2, "0").toUpperCase()); |
|
||||
}); |
|
||||
var codes = []; |
|
||||
var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input)); |
|
||||
var b = 0; |
|
||||
for (let a = 0; a < text.length; a++) { |
|
||||
let point = text.codePointAt(a); |
|
||||
if (point >= 55296 && point <= 57343) { |
|
||||
// non-BMP characters have trailing low surrogates in JavaScript strings |
|
||||
continue; |
|
||||
} |
|
||||
codes[b++] = point; |
|
||||
} |
|
||||
bytes = bytes.join(" "); |
|
||||
codes = codes.join(", "); |
|
||||
var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
|
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n")); |
|
||||
|
|
||||
/* |
|
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
[0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE].forEach(function(code) { |
|
||||
var l = document.createElement("a"); |
|
||||
l.href = "http://example.com/?" + String.fromCodePoint(code); |
|
||||
var url = l.search.substr(1); |
|
||||
var bytes = []; |
|
||||
for (let a = 0; a < url.length; a++) { |
|
||||
if (url.charAt(a) == "%") { |
|
||||
bytes.push(url.charAt(a + 1) + url.charAt(a + 2)); |
|
||||
a = a + 2; |
|
||||
} else { |
|
||||
bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0")); |
|
||||
} |
|
||||
} |
|
||||
var line = bytes.join(" ").toUpperCase() + "\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
|
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n")); |
|
||||
|
|
||||
[0x64, 0x20AC, 0x2164, 0x3A74, 0xE7C7, 0x1D11E].forEach(function(code) { |
|
||||
var l = document.createElement("a"); |
|
||||
l.href = "http://example.com/?" + String.fromCodePoint(code); |
|
||||
var url = l.search.substr(1); |
|
||||
var bytes = []; |
|
||||
for (let a = 0; a < url.length; a++) { |
|
||||
if (url.charAt(a) == "%") { |
|
||||
bytes.push(url.charAt(a + 1) + url.charAt(a + 2)); |
|
||||
a = a + 2; |
|
||||
} else { |
|
||||
bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0")); |
|
||||
} |
|
||||
} |
|
||||
var line = "0x" + code.toString(16).toUpperCase() + ", " + bytes.join(" ").toUpperCase() + "\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
</script> |
|
@ -0,0 +1,14 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=big5> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// valid single characters |
||||
|
'sanity check': "40", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,76 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=gb18030> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// valid single characters |
||||
|
'sanity check': "40", |
||||
|
'special case for 0x80': "80", |
||||
|
'four-byte special case': "81 35 F4 37", |
||||
|
'two-byte character': "A8 4E", |
||||
|
'four-byte character': "82 31 A2 37", |
||||
|
// cut sequences |
||||
|
'EOF after first byte': "82", |
||||
|
'EOF after second byte': "82 30", |
||||
|
'EOF after third byte': "82 30 81", |
||||
|
// invalid sequences |
||||
|
'bad first byte': "FF 35 F4 37", |
||||
|
'bad second byte': "81 FF F4 37", |
||||
|
'bad third byte': "81 35 FF 37", |
||||
|
'bad fourth byte': "81 35 F4 FF", |
||||
|
'control first byte': "00 35 F4 37", |
||||
|
'control second byte': "81 00 F4 37", |
||||
|
'control third byte': "81 35 00 37", |
||||
|
'control fourth byte': "81 35 F4 00", |
||||
|
// invalid sequences with clean EOF |
||||
|
'bad first byte (padded)': "FF 35 F4 37 00 00 00 00", |
||||
|
'bad second byte (padded)': "81 FF F4 37 00 00 00 00", |
||||
|
'bad third byte (padded)': "81 35 FF 37 00 00 00 00", |
||||
|
'bad fourth byte (padded)': "81 35 F4 FF 00 00 00 00", |
||||
|
'control first byte (padded)': "00 35 F4 37 00 00 00 00", |
||||
|
'control second byte (padded)': "81 00 F4 37 00 00 00 00", |
||||
|
'control third byte (padded)': "81 35 00 37 00 00 00 00", |
||||
|
'control fourth byte (padded)': "81 35 F4 00 00 00 00 00", |
||||
|
// out-of-range sequences |
||||
|
'void sequence': "84 32 A4 39", |
||||
|
'void sequence 2': "FE 39 FE 39", |
||||
|
// backward seeking tests |
||||
|
'seek test 1': "81 81 81 30", |
||||
|
'seek test 2': "81 81 80", |
||||
|
'seek test 3': "81 81 00", |
||||
|
'seek test 4': "81 81 81 00", |
||||
|
'seek test 5': "81 30 30 30", |
||||
|
'seek test 6': "81 30 81 81", |
||||
|
'seek test 7': "30 30 81 81", |
||||
|
'seek test 8': "F8 83 FE 80", |
||||
|
'seek test 1 (padded)': "00 00 00 00 81 81 81 30 00 00 00 00", |
||||
|
'seek test 2 (padded)': "00 00 00 00 81 81 80 00 00 00 00", |
||||
|
'seek test 3 (padded)': "00 00 00 00 81 81 00 00 00 00 00", |
||||
|
'seek test 4 (padded)': "00 00 00 00 81 81 81 00 00 00 00 00", |
||||
|
'seek test 5 (padded)': "00 00 00 00 81 30 30 30 00 00 00 00", |
||||
|
'seek test 6 (padded)': "00 00 00 00 81 30 81 81 00 00 00 00", |
||||
|
'seek test 7 (padded)': "00 00 00 00 30 30 81 81 00 00 00 00", |
||||
|
'seek test 8 (padded)': "00 00 00 00 F8 83 FE 80 00 00 00 00", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+20AC': 0x20AC, |
||||
|
'U+2164': 0x2164, |
||||
|
'U+3A74': 0x3A74, |
||||
|
'U+E7C7': 0xE7C7, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+E5E5': 0xE5E5, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0x00A2, |
||||
|
0x6C34, |
||||
|
0x1D11E, |
||||
|
0xF8FF, |
||||
|
0x10FFFD, |
||||
|
0xFFFE, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,16 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=gbk> |
||||
|
<script> |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+20AC': 0x20AC, |
||||
|
'U+2164': 0x2164, |
||||
|
'U+3A74': 0x3A74, |
||||
|
'U+E7C7': 0xE7C7, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+E5E5': 0xE5E5, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,20 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=utf-16> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
// control samples |
||||
|
'empty string': "", |
||||
|
'sanity check': "6100 6200 6300 3100 3200 3300", |
||||
|
'mixed sample': "7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", |
||||
|
// unexpected EOF |
||||
|
'EOF in BMP character': "0000 FF", |
||||
|
'EOF after lead surrogate': "0000 34D8", |
||||
|
'EOF in trail surrogate': "0000 34D8 1E", |
||||
|
// invalid UTF-16 surrogates |
||||
|
'lead surrogate without trail': "34D8 0000", |
||||
|
'trail surrogate without lead': "1EDD 0000", |
||||
|
'double lead surrogate': "34D8 34D8 1EDD", |
||||
|
'double trail surrogate': "34D8 1EDD 1EDD", |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,70 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=utf-8> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
// control samples |
||||
|
'empty string': "", |
||||
|
'sanity check': "61 62 63 31 32 33", |
||||
|
'multibyte control': "E5 8F A4 E6 B1 A0 E3 82 84 E8 9B 99 E9 A3 9B E3 81 B3 E8 BE BC E3 82 80 E6 B0 B4 E3 81 AE E9 9F B3", |
||||
|
'mixed sample': "7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE", |
||||
|
// various invalid sequences |
||||
|
'invalid code': "FF", |
||||
|
'ends early': "C0", |
||||
|
'ends early 2': "E0", |
||||
|
'invalid trail': "C0 00", |
||||
|
'invalid trail 2': "C0 C0", |
||||
|
'invalid trail 3': "E0 00", |
||||
|
'invalid trail 4': "E0 C0", |
||||
|
'invalid trail 5': "E0 80 00", |
||||
|
'invalid trail 6': "E0 80 C0", |
||||
|
'> 0x10FFFF': "FC 80 80 80 80 80", |
||||
|
'obsolete lead byte': "FE 80 80 80 80 80", |
||||
|
'overlong U+0000 - 2 bytes': "C0 80", |
||||
|
'overlong U+0000 - 3 bytes': "E0 80 80", |
||||
|
'overlong U+0000 - 4 bytes': "F0 80 80 80", |
||||
|
'overlong U+0000 - 5 bytes': "F8 80 80 80 80", |
||||
|
'overlong U+0000 - 6 bytes': "FC 80 80 80 80 80", |
||||
|
'overlong U+007F - 2 bytes': "C1 BF", |
||||
|
'overlong U+007F - 3 bytes': "E0 81 BF", |
||||
|
'overlong U+007F - 4 bytes': "F0 80 81 BF", |
||||
|
'overlong U+007F - 5 bytes': "F8 80 80 81 BF", |
||||
|
'overlong U+007F - 6 bytes': "FC 80 80 80 81 BF", |
||||
|
'overlong U+07FF - 3 bytes': "E0 9F BF", |
||||
|
'overlong U+07FF - 4 bytes': "F0 80 9F BF", |
||||
|
'overlong U+07FF - 5 bytes': "F8 80 80 9F BF", |
||||
|
'overlong U+07FF - 6 bytes': "FC 80 80 80 9F BF", |
||||
|
'overlong U+FFFF - 4 bytes': "F0 8F BF BF", |
||||
|
'overlong U+FFFF - 5 bytes': "F8 80 8F BF BF", |
||||
|
'overlong U+FFFF - 6 bytes': "FC 80 80 8F BF BF", |
||||
|
'overlong U+10FFFF - 5 bytes': "F8 84 8F BF BF", |
||||
|
'overlong U+10FFFF - 6 bytes': "FC 80 84 8F BF BF", |
||||
|
// UTF-16 surrogates |
||||
|
'lead surrogate': "ED A0 80", |
||||
|
'trail surrogate': "ED B0 80", |
||||
|
'surrogate pair': "ED A0 80 ED B0 80", |
||||
|
// self-sync edge cases |
||||
|
'trailing continuation': "0A 80 80", |
||||
|
'trailing continuation 2': "E5 8F A4 80", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+007A': 0x007A, |
||||
|
'U+00A2': 0x00A2, |
||||
|
'U+6C34': 0x6C34, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+F8FF': 0xF8FF, |
||||
|
'U+10FFFD': 0x10FFFD, |
||||
|
'U+FFFE': 0xFFFE, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0x00A2, |
||||
|
0x6C34, |
||||
|
0x1D11E, |
||||
|
0xF8FF, |
||||
|
0x10FFFD, |
||||
|
0xFFFE, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,141 @@ |
|||||
|
"use strict"; |
||||
|
// set out the output pre-formatted text element
|
||||
|
// Create the <pre> element that receives all generated output and attach it
// to the document root; it is exposed globally as `out`.
window.out = document.createElement("pre");
document.documentElement.appendChild(out);

// Determine the encoding label that the TextDecoder below must use.
// The label is read from the charset attribute of the page's <meta charset>
// element. The collection returned by getElementsByTagName() has no `charset`
// property of its own, so reading it from the collection yields undefined and
// TextDecoder would then silently fall back to its default label.
// If no such element exists, fall back to the encoding the document reports.
var encoding = (function () {
    var meta = document.querySelector("meta[charset]");
    return meta ? meta.getAttribute("charset") : document.characterSet;
})();
||||
|
|
||||
|
/**
 * Encodes one Unicode code point by letting the browser do the work: the
 * character is placed in the query string of an anchor's URL and the
 * resulting (percent-encoded) query is read back and split into bytes.
 *
 * @param {number|string} code - Code point to encode. Values below 0 or above
 *     0x10FFFF are rejected before any DOM work is done.
 * @param {boolean} fatal - Selects what is returned when the query comes back
 *     containing a numeric character reference ("%26%23..." or a literal "&"),
 *     i.e. the character could not be encoded.
 * @returns {string[]|string} On success, an array of two-digit upper-case
 *     hexadecimal byte strings. Otherwise a string: a PHP
 *     `new EncoderException(...)` expression for an out-of-range code point or
 *     (when `fatal` is true) an unencodable one, or (when `fatal` is false)
 *     the percent-decoded query, e.g. "&#58853;".
 */
function encodeCodePoint(code, fatal) {
    if (code < 0 || code > 0x10FFFF) {
        return 'new EncoderException("", Encoding::E_INVALID_CODE_POINT)';
    }
    const link = document.createElement("a");
    link.href = "http://example.com/?" + String.fromCodePoint(code);
    // drop the leading "?" of the query component
    const url = link.search.slice(1);
    const bytes = [];
    for (let a = 0; a < url.length; a++) {
        const ch = url.charAt(a);
        // NOTE: a literal "&" in the query is also taken to mean "unencodable",
        // so U+0026 itself cannot be encoded through this function.
        if (ch === "&" || url.startsWith("%26%23", a)) {
            // character cannot be encoded
            if (fatal) {
                return 'new EncoderException("", Encoding::E_UNAVAILABLE_CODE_POINT)';
            }
            return decodeURIComponent(url);
        }
        if (ch === "%") {
            // percent escape: the next two characters are the byte's hex digits
            bytes.push(url.slice(a + 1, a + 3).toUpperCase());
            a += 2;
        } else {
            // character left unescaped: its code unit is the byte value.
            // Upper-cased so every byte has the same case as percent escapes.
            bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0").toUpperCase());
        }
    }
    return bytes;
}
||||
|
|
||||
|
/**
 * Turns the result of encodeCodePoint() into the text printed in the
 * generated test data.
 *
 * @param {number|string} code - Code point, passed through to encodeCodePoint().
 * @param {boolean} fatal - Passed through to encodeCodePoint().
 * @returns {string} A double-quoted, space-separated byte list when encoding
 *     produced an array; a `bin2hex("...")` expression when the result starts
 *     with "&"; otherwise the result string unchanged.
 */
function wrapCodePoint(code, fatal) {
    const encoded = encodeCodePoint(code, fatal);
    if (Array.isArray(encoded)) {
        return '"' + encoded.join(" ") + '"';
    }
    // a string starting with "&" is a character reference and gets wrapped;
    // any other string is emitted verbatim
    return (encoded.charAt(0) == "&") ? 'bin2hex("' + encoded + '")' : encoded;
}
||||
|
|
||||
|
// Decoder samples: when the page defines sampleStrings, decode each
// hex-encoded byte string with the page's encoding and print one line per
// sample of the form 'name' => ["bytes", [code points]],
if (typeof sampleStrings != 'undefined') {
    var decoder = new TextDecoder(encoding);
    for (let name in sampleStrings) {
        const hexText = sampleStrings[name];
        // strip the separating whitespace, then read the digits two at a time
        const digits = hexText.replace(/\s/g, "");
        const octets = [];
        for (let pos = 0; pos < digits.length; pos += 2) {
            octets.push(parseInt(digits.substr(pos, 2), 16));
        }
        const decoded = decoder.decode(new Uint8Array(octets));
        const points = [];
        for (let index = 0; index < decoded.length; index++) {
            const point = decoded.codePointAt(index);
            // non-BMP characters have trailing low surrogates in JavaScript
            // strings; values in the surrogate range are not recorded
            const isSurrogate = point >= 55296 && point <= 57343;
            if (!isSurrogate) {
                points.push(point);
            }
        }
        // the byte string is printed exactly as written in the sample table
        const line = "'" + name + "' => [" + '"' + hexText + '", [' + points.join(", ") + "]],\n";
        out.appendChild(document.createTextNode(line));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
||||
|
|
||||
|
// Encoder samples: when the page defines sampleCharacters, print two lines per
// sample, one for HTML (non-fatal) mode and one for fatal mode, each holding
// the mode flag, the code point and the wrapped encoder result.
if (typeof sampleCharacters != 'undefined') {
    // The loop variable is block-scoped: an undeclared `name` would resolve to
    // (and overwrite) window.name in a browser, and would be a ReferenceError
    // under strict mode anywhere that global does not exist.
    for (let name in sampleCharacters) {
        const value = sampleCharacters[name];
        // non-negative integers are printed as upper-case hexadecimal
        // literals; any other value (e.g. -1) is printed as-is
        let code = value;
        if (value > -1 && value % 1 == 0) {
            code = "0x" + value.toString(16).toUpperCase();
        }
        const line1 = "'" + name + " (HTML)' => [false, " + code + ", " + wrapCodePoint(value, false) + "],\n";
        const line2 = "'" + name + " (fatal)' => [true, " + code + ", " + wrapCodePoint(value, true) + "],\n";
        out.appendChild(document.createTextNode(line1));
        out.appendChild(document.createTextNode(line2));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
||||
|
|
||||
|
// Seek-test data: when the page defines seekCodePoints, encode each code point
// and print (1) a comment summarising every character's byte length and
// offset, (2) the encoded string, (3) the code points and (4) the offsets, the
// last three as PHP property declarations.
if (typeof seekCodePoints != 'undefined') {
    // first gather statistics on the encoding of the specified array of code points
    var stats = [];
    var a = 0;
    var offset = 0;
    for (const code of seekCodePoints) {
        const entry = {
            'code': code,
            'offset': offset,
            'length': 0,
            'bytes': "",
        };
        stats[a] = entry;
        const encoded = encodeCodePoint(code, true);
        if (Array.isArray(encoded)) {
            entry.length = encoded.length;
            entry.bytes = encoded.join("").toUpperCase();
        } else {
            // encodeCodePoint() returned an error string instead of bytes:
            // record a one-byte placeholder
            entry.length = 1;
            entry.bytes = "()";
        }
        offset = offset + entry.length;
        a++;
    }
    var end = [a, offset];

    // summarize the statistics in a comment
    var comment = "/*\n";
    stats.forEach(function (entry, index) {
        const size = (entry.length == 1) ? "(1 byte) " : "(" + entry.length + " bytes)";
        const hex = entry.code.toString(16).padStart(4, "0").padEnd(6, " ").toUpperCase();
        comment = comment + " Char " + index + " U+" + hex + " " + size + " Offset " + entry.offset + "\n";
    });
    comment = comment + " End of string at char " + end[0] + ", offset " + end[1] + "\n";
    comment = comment + "*/\n";

    // build the encoded byte string
    var bytes = 'protected $seekString = "' + stats.map((entry) => entry.bytes).join(" ") + '";' + "\n";

    // build the array of code points
    var codes = 'protected $seekCodes = [' + stats.map((entry) => "0x" + entry.code.toString(16).toUpperCase()).join(", ") + "];\n";

    // build the array of offsets
    var offs = 'protected $seekOffsets = [' + stats.map((entry) => entry.offset).join(", ") + "];\n";

    // output the results
    [comment, bytes, codes, offs].forEach(function (text) {
        out.appendChild(document.createTextNode(text));
    });
}
Loading…
Reference in new issue