Browse Source

Initial round of decoding tests, with one fix

labels
J. King 6 years ago
parent
commit
434e41cc2c
  1. 1
      lib/UTF8.php
  2. 87
      tests/cases/TestCodec.php
  3. 67
      tests/data/decode-multiple-characters.html
  4. 3
      tests/phpunit.xml

1
lib/UTF8.php

@ -232,6 +232,7 @@ abstract class UTF8 {
break;
}
} elseif ($b < $lower || $b > $upper) {
$next--;
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;

87
tests/cases/TestCodec.php

@ -0,0 +1,87 @@
<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\UTF8\TestCase\Codec;
use MensBeam\UTF8\UTF8;
/** @covers \MensBeam\UTF8\UTF8 */
class TestConf extends \PHPUnit\Framework\TestCase {
    /**
     * Decodes the UTF-8 encoding of every code point from U+0000 to U+10FFFF in isolation.
     *
     * For each code point both the decoded values and the "next offset" reported
     * after each call to UTF8::ord() are checked. The test performs over a
     * million iterations, hence its placement in the "optional" group.
     *
     * @group optional
     */
    public function testDecodeSingleCharacter() {
        for ($a = 0; $a <= 0x10FFFF; $a++) {
            // the UTF-8 encoding of the code point
            $bytes = \IntlChar::chr($a);
            $length = strlen($bytes);
            // U+D800 through U+DFFF are UTF-16 surrogates, which are not valid in UTF-8
            $surrogate = ($a >= 0xD800 && $a <= 0xDFFF);
            // the expected result of decoding the bytes: surrogates are supposed to result in failures on every byte
            $exp1 = $surrogate ? array_fill(0, $length, false) : [$a];
            // the expected next-character positions: surrogates are supposed to return multiple positions; others always return only the end of the string
            $exp2 = $surrogate ? range(1, $length) : [$length];
            $act1 = [];
            $act2 = [];
            $pos = 0;
            do {
                // $pos is passed both as the offset to read from and as the variable which receives the next offset
                $act1[] = UTF8::ord($bytes, $pos, $pos);
                $act2[] = $pos;
            } while ($pos < $length);
            // hexadecimal form of the encoded bytes, used to identify the character in failure messages
            $hex = strtoupper(bin2hex($bytes));
            $this->assertSame($exp1, $act1, 'Character '.$hex.' was not decoded correctly.');
            $this->assertSame($exp2, $act2, 'Next offset for character '.$hex.' is incorrect.');
        }
    }

    /**
     * Decodes a whole byte string one character at a time and compares the
     * resulting code points against the expected list.
     *
     * Any non-integer result from UTF8::ord() is recorded as U+FFFD
     * (the replacement character) so that it can be compared against
     * expectations expressed purely as code points.
     *
     * @dataProvider provideStrings
     *
     * @param string $input The raw bytes to decode
     * @param array $exp The code points expected from decoding the input
     */
    public function testDecodeMultipleCharacters(string $input, array $exp) {
        $pos = 0;
        $out = [];
        $eof = strlen($input);
        while ($pos < $eof) {
            $p = UTF8::ord($input, $pos, $pos);
            $out[] = is_int($p) ? $p : 0xFFFD;
        }
        // strict comparison: every entry must be an integer code point in the expected order
        $this->assertSame($exp, $out);
    }

    /**
     * Supplies named data sets for testDecodeMultipleCharacters().
     *
     * Each data set is a pair of the input byte string and the list of
     * code points expected from decoding it; 65533 is U+FFFD.
     *
     * NOTE(review): the rows have the same names and layout as the output of
     * tests/data/decode-multiple-characters.html and were presumably generated by it.
     *
     * @return array
     */
    public function provideStrings() {
        return [
            'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
            'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
            'invalid code' => ["\xFF", [65533]],
            'ends early' => ["\xC0", [65533]],
            'ends early 2' => ["\xE0", [65533]],
            'invalid trail' => ["\xC0\x00", [65533, 0]],
            'invalid trail 2' => ["\xC0\xC0", [65533, 65533]],
            'invalid trail 3' => ["\xE0\x00", [65533, 0]],
            'invalid trail 4' => ["\xE0\xC0", [65533, 65533]],
            'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]],
            'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]],
            '> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
            'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
            'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]],
            'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]],
            'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]],
            'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
            'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]],
            'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]],
            'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]],
            'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
            'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]],
            'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]],
            'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
            'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]],
            'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
            'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
            'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
            'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
            'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
            'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
        ];
    }
}

67
tests/data/decode-multiple-characters.html

@ -0,0 +1,67 @@
<!DOCTYPE html>
<pre></pre>
<script>
// Test vectors from which the PHP data-provider rows are generated.
// Each entry has three properties:
//   encoding - label of the encoding the vector is meant for (always 'utf-8' here);
//              the generator loop below does not read it and always decodes as UTF-8
//   input    - the raw bytes to decode, one array element per byte
//   name     - label of the vector; the generator loop uses it as the key of the emitted row
var data = [
// basics: well-formed ASCII and multi-byte input
{ encoding: 'utf-8', input: [0x61, 0x62, 0x63, 0x31, 0x32, 0x33], name: 'sanity check' },
{ encoding: 'utf-8', input: [0xE5, 0x8F, 0xA4, 0xE6, 0xB1, 0xA0, 0xE3, 0x82, 0x84, 0xE8, 0x9B, 0x99, 0xE9, 0xA3, 0x9B, 0xE3, 0x81, 0xB3, 0xE8, 0xBE, 0xBC, 0xE3, 0x82, 0x80, 0xE6, 0xB0, 0xB4, 0xE3, 0x81, 0xAE, 0xE9, 0x9F, 0xB3], name: 'multibyte control' },
// bad input: invalid lead bytes, truncated sequences, and invalid trailing bytes
{ encoding: 'utf-8', input: [0xFF], name: 'invalid code' },
{ encoding: 'utf-8', input: [0xC0], name: 'ends early' },
{ encoding: 'utf-8', input: [0xE0], name: 'ends early 2' },
{ encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' },
{ encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' },
{ encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' },
{ encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' },
{ encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' },
{ encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' },
{ encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
// Overlong encodings: code points encoded with more bytes than necessary
{ encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
{ encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' },
{ encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' },
{ encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' },
{ encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' },
{ encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' },
{ encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' },
{ encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' },
{ encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' },
{ encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' },
{ encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' },
{ encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' },
// UTF-16 surrogates encoded as code points in UTF-8
{ encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' },
{ encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' },
{ encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' },
// mixed input: valid sequences of one to four bytes in a single string
{ encoding: 'utf-8', input: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xEF, 0xA3, 0xBF, 0xF4, 0x8F, 0xBF, 0xBD, 0xEF, 0xBF, 0xBE], name: 'mixed sample' }
]
// For every test vector, append one line of PHP array source to the page's
// first <pre> element, in the form:  'name' => ["\xNN...", [code, code, ...]],
data.forEach(function(vector) {
    // the input rendered as PHP double-quoted-string escapes, one "\xNN" per byte
    var escaped = vector.input.map(function(octet) {
        return "\\x" + octet.toString(16).padStart(2, "0").toUpperCase()
    }).join("")
    // decode the bytes with the browser's own UTF-8 decoder to obtain the reference result
    var decoded = new TextDecoder("utf-8").decode(new Uint8Array(vector.input))
    var points = []
    for (let index = 0; index < decoded.length; index++) {
        const point = decoded.codePointAt(index)
        // non-BMP characters have trailing low surrogates in JavaScript strings;
        // only values outside the surrogate range are recorded
        if (point < 55296 || point > 57343) {
            points.push(point)
        }
    }
    var row = ["'", vector.name, "' => [", '"', escaped, '", [', points.join(", "), "]],\n"].join("")
    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(row));
})
</script>

3
tests/phpunit.xml

@ -17,5 +17,8 @@
</filter>
<testsuites>
<testsuite name="Codec">
<file>cases/TestCodec.php</file>
</testsuite>
</testsuites>
</phpunit>

Loading…
Cancel
Save