Browse Source

Implement gb18030 and GBK encoders

span
J. King 6 years ago
parent
commit
40d0054bd1
  1. 2
      lib/Encoding/GB18030.php
  2. 41
      lib/Encoding/GBCommon.php
  3. 2
      lib/Encoding/GBK.php
  4. 59
      tests/cases/Encoding/TestGB18030.php
  5. 2
      tools/mkgbk.php
  6. 44
      tools/mktestgbk.html

2
lib/Encoding/GB18030.php

@ -7,8 +7,8 @@ declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/**
 * Concrete encoding class for "gb18030".
 *
 * All behaviour is inherited from GBCommon; this class only supplies the
 * identifying constants. GBK is false here and true in the sibling GBK class;
 * presumably GBCommon reads it to select between the two variants (confirm
 * against GBCommon, which is not visible here).
 */
class GB18030 extends GBCommon {
    // declared exactly once: PHP refuses to redeclare a class constant
    const GBK = false;
    const NAME = "gb18030";
    const LABELS = ["gb18030"];
}

41
lib/Encoding/GBCommon.php

File diff suppressed because one or more lines are too long

2
lib/Encoding/GBK.php

@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class GBK extends GBCommon {
const GBK = true;
const NAME = "GBK";
const LABELS = [
"chinese",
@ -20,5 +21,4 @@ class GBK extends GBCommon {
"x-gbk",
];
const GBK = true;
}

59
tests/cases/Encoding/TestGB18030.php

@ -6,12 +6,29 @@
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\GBK;
use MensBeam\Intl\Encoding\GB18030;
use MensBeam\Intl\Encoding\EncoderException;
use MensBeam\Intl\Encoding\DecoderException;
class TestGB18030 extends \PHPUnit\Framework\TestCase {
/**
 * Encodes one code point with the given encoder class and checks the outcome.
 *
 * $class is the name of the encoder class whose static encode() is called,
 * $fatal is passed through as encode()'s second argument, and $input is the
 * code point to encode. $exp is either the expected output as a hexadecimal
 * string (letter case is ignored), or a Throwable whose class and code the
 * call is expected to raise.
 *
 * @dataProvider provideCodePoints
 * @covers MensBeam\Intl\Encoding\GB18030::encode
 * @covers MensBeam\Intl\Encoding\GBK::encode
 * @covers MensBeam\Intl\Encoding\GB18030::err
 * @covers MensBeam\Intl\Encoding\GBK::err
 */
public function testEncodeCodePoints(string $class, bool $fatal, int $input, $exp) {
    if ($exp instanceof \Throwable) {
        $this->expectException(get_class($exp));
        $this->expectExceptionCode($exp->getCode());
        // The call is the entire test in this branch. Returning afterwards
        // keeps the Throwable away from strtolower(), which under strict
        // types would raise a TypeError and hide the fact that the expected
        // exception was never thrown.
        $class::encode($input, $fatal);
        return;
    }
    $out = $class::encode($input, $fatal);
    // expected values are written in upper-case hex; bin2hex() yields lower case
    $this->assertSame(strtolower($exp), bin2hex($out));
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\GB18030::__construct
@ -43,6 +60,48 @@ class TestGB18030 extends \PHPUnit\Framework\TestCase {
$this->assertSame($exp, $out);
}
/**
 * Data sets for testEncodeCodePoints().
 *
 * Each entry is keyed by a descriptive name and holds, in order: the encoder
 * class name, the fatal flag, the code point to encode, and the expectation.
 * The expectation is either the output bytes as a hexadecimal string or an
 * EncoderException carrying the error code the call should raise.
 *
 * @return array
 */
public function provideCodePoints() {
// bytes confirmed using Firefox
return [
// GBK, fatal mode: code points without a GBK mapping are expected to raise E_UNAVAILABLE_CODE_POINT
"GBK ASCII (fatal)" => [GBK::class, true, 0x64, "64"],
// the expectation for U+20AC differs between the two classes: one byte (80) here, two bytes (A2E3) for GB18030
"GBK 0x20AC (fatal)" => [GBK::class, true, 0x20AC, "80"],
"GBK 0x2164 (fatal)" => [GBK::class, true, 0x2164, "A2F5"],
"GBK 0x3A74 (fatal)" => [GBK::class, true, 0x3A74, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0xE7C7 (fatal)" => [GBK::class, true, 0xE7C7, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0x1D11E (fatal)" => [GBK::class, true, 0x1D11E, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
"GBK 0xE5E5 (fatal)" => [GBK::class, true, 0xE5E5, new EncoderException("", GBK::E_UNAVAILABLE_CODE_POINT)],
// values outside 0..0x10FFFF are expected to raise E_INVALID_CODE_POINT
"GBK -1 (fatal)" => [GBK::class, true, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GBK 0x110000 (fatal)" => [GBK::class, true, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
// GB18030, fatal mode: the code points GBK rejects above are expected as four-byte sequences, except U+E5E5
"GB18030 ASCII (fatal)" => [GB18030::class, true, 0x64, "64"],
"GB18030 0x20AC (fatal)" => [GB18030::class, true, 0x20AC, "A2E3"],
"GB18030 0x2164 (fatal)" => [GB18030::class, true, 0x2164, "A2F5"],
"GB18030 0x3A74 (fatal)" => [GB18030::class, true, 0x3A74, "82319730"],
"GB18030 0xE7C7 (fatal)" => [GB18030::class, true, 0xE7C7, "8135F437"],
"GB18030 0x1D11E (fatal)" => [GB18030::class, true, 0x1D11E, "9432BE34"],
"GB18030 0xE5E5 (fatal)" => [GB18030::class, true, 0xE5E5, new EncoderException("", GB18030::E_UNAVAILABLE_CODE_POINT)],
"GB18030 -1 (fatal)" => [GB18030::class, true, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
"GB18030 0x110000 (fatal)" => [GB18030::class, true, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
// GBK, non-fatal ("HTML") mode: unavailable code points are expected as the bytes of a decimal
// numeric character reference ("&#N;") instead of an exception
"GBK ASCII (HTML)" => [GBK::class, false, 0x64, "64"],
"GBK 0x20AC (HTML)" => [GBK::class, false, 0x20AC, "80"],
"GBK 0x2164 (HTML)" => [GBK::class, false, 0x2164, "A2F5"],
"GBK 0x3A74 (HTML)" => [GBK::class, false, 0x3A74, bin2hex("&#".(0x3A74).";")],
"GBK 0xE7C7 (HTML)" => [GBK::class, false, 0xE7C7, bin2hex("&#".(0xE7C7).";")],
"GBK 0x1D11E (HTML)" => [GBK::class, false, 0x1D11E, bin2hex("&#".(0x1D11E).";")],
"GBK 0xE5E5 (HTML)" => [GBK::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
// invalid code points are still expected to raise an exception in non-fatal mode
"GBK -1 (HTML)" => [GBK::class, false, -1, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
"GBK 0x110000 (HTML)" => [GBK::class, false, 0x110000, new EncoderException("", GBK::E_INVALID_CODE_POINT)],
// GB18030, non-fatal ("HTML") mode: only U+E5E5 falls back to a character reference
"GB18030 ASCII (HTML)" => [GB18030::class, false, 0x64, "64"],
"GB18030 0x20AC (HTML)" => [GB18030::class, false, 0x20AC, "A2E3"],
"GB18030 0x2164 (HTML)" => [GB18030::class, false, 0x2164, "A2F5"],
"GB18030 0x3A74 (HTML)" => [GB18030::class, false, 0x3A74, "82319730"],
"GB18030 0xE7C7 (HTML)" => [GB18030::class, false, 0xE7C7, "8135F437"],
"GB18030 0x1D11E (HTML)" => [GB18030::class, false, 0x1D11E, "9432BE34"],
"GB18030 0xE5E5 (HTML)" => [GB18030::class, false, 0xE5E5, bin2hex("&#".(0xE5E5).";")],
"GB18030 -1 (HTML)" => [GB18030::class, false, -1, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
"GB18030 0x110000 (HTML)" => [GB18030::class, false, 0x110000, new EncoderException("", GB18030::E_INVALID_CODE_POINT)],
];
}
public function provideStrings() {
return [
// valid single characters

2
tools/mkgbk.php

@ -29,9 +29,11 @@ foreach ($matches as $match) {
}
// Fudge the top of the ranges so the generated tables carry extra boundary entries.
// See https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
// NOTE(review): 39420 and 1237576 look like the pointer bounds from that step; confirm against the spec.
// We also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding.
// Take the current last maximum off the list so that it can be re-inserted between the two new bounds.
$penult = array_pop($dec_max);
// The list of maxima now ends with: 39420, the former last entry, 1237576.
$dec_max = array_merge($dec_max, [39420, $penult, 1237576]);
// Insert the literal text "null" just before the last offset (offset -1, nothing removed).
array_splice($dec_off, -1, 0, "null");
// Append 0x110000 as the final offset.
$dec_off[] = 0x110000;
// output
$dec_gbk = implode(",", $dec_gbk);

44
tools/mktestgbk.html

@ -31,28 +31,44 @@ var data = [
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte 2' },
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' },
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' },
]
];
// For every entry of `data`, decode its input bytes with the browser's
// TextDecoder and append a line of the form
//   'name' => ["HEXBYTES", [codepoint, codepoint, ...]],
// to the first <pre> element of the page.
data.forEach(function(data) {
    // hex dump of the input: two upper-case digits per byte
    var bytes = "";
    data.input.forEach((p) => {
        bytes = bytes + p.toString(16).padStart(2, "0").toUpperCase();
    });
    var codes = [];
    var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input));
    var b = 0;
    for (let a = 0; a < text.length; a++) {
        let point = text.codePointAt(a);
        if (point >= 55296 && point <= 57343) {
            // non-BMP characters have trailing low surrogates in JavaScript strings
            continue;
        }
        codes[b++] = point;
    }
    codes = codes.join(", ");
    var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n";
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
});
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n"));
// For each sample code point, let the browser serialise it into the query of
// a URL, turn that query back into a hex byte string, and append a line of
// the form "0xCODE, BYTES" to the first <pre> element of the page.
// NOTE(review): the resulting bytes presumably depend on the encoding the
// page is served in; confirm before reusing the output.
[0x64, 0x20AC, 0x2164, 0x3A74, 0xE7C7, 0x1D11E].forEach(function(code) {
    var anchor = document.createElement("a");
    anchor.href = "http://example.com/?" + String.fromCodePoint(code);
    // drop the leading "?" of the serialised query
    var query = anchor.search.substr(1);
    var hex = "";
    var pos = 0;
    while (pos < query.length) {
        if (query.charAt(pos) == "%") {
            // percent-escape: the next two characters already are the hex digits of the byte
            hex = hex.concat(query.charAt(pos + 1), query.charAt(pos + 2));
            pos = pos + 3;
        } else {
            // character left unescaped: use its character code as the byte value
            hex = hex.concat(query.charCodeAt(pos).toString(16).padStart(2, "0"));
            pos = pos + 1;
        }
    }
    var output = "0x" + code.toString(16).toUpperCase() + ", " + hex.toUpperCase() + "\n";
    var target = document.getElementsByTagName("pre")[0];
    target.appendChild(document.createTextNode(output));
})
var l = document.createElement("a")
l.href = "http://example.com/?" + String.fromCodePoint(0xFFFF)
//document.write(l.search.substr(1))
</script>

Loading…
Cancel
Save