Initial round of decoding tests, with one fix

6 years ago · 434e41cc2c
4 changed files with 158 additions and 0 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@@ -232,6 +232,7 @@ abstract class UTF8 {
                        break;
                }
            } elseif ($b < $lower || $b > $upper) {
+                $next--;
                switch ($errMode ?? self::$errMode) {
                    case self::M_SKIP:
                        goto start;
--- a/tests/cases/TestCodec.php
+++ b/tests/cases/TestCodec.php
@@ -0,0 +1,87 @@
+<?php
+/** @license MIT
+ * Copyright 2017 J. King, Dustin Wilson et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\UTF8\TestCase\Codec;
+
+use MensBeam\UTF8\UTF8;
+
+/** @covers \MensBeam\UTF8\UTF8 */
+class TestConf extends \PHPUnit\Framework\TestCase {
+
+    /** @group optional */
+    public function testDecodeSingleCharacter() {
+        for ($a = 0; $a <= 0x10FFFF; $a++) {
+            // the UTF-8 encoding of the code point
+            $bytes = \IntlChar::chr($a);
+            // the expected result of decoding the bytes: surrogates are supposed to result in failures on every byte
+            $exp1 = ($a >= 55296 && $a <= 57343) ? array_fill(0, strlen($bytes), false) : [$a];
+            // the expected next-character positions: surrogates are supposed to return multiple positions; others always return only the end of the string
+            $exp2 = ($a >= 55296 && $a <= 57343) ? range(1, strlen($bytes)) : [strlen($bytes)];
+            $act1 = [];
+            $act2 = [];
+            $pos = 0;
+            do {
+                $act1[] = UTF8::ord($bytes, $pos, $pos);
+                $act2[] = $pos;
+            } while ($pos < strlen($bytes));
+            $this->assertSame($exp1, $act1, 'Character '.strtoupper(bin2hex(\IntlChar::chr($a))).' was not decoded correctly.');
+            $this->assertSame($exp2, $act2, 'Next offset for character '.strtoupper(bin2hex(\IntlChar::chr($a))).' is incorrect.');
+        }
+    }
+    
+    /** @dataProvider provideStrings */
+    public function testDecodeMultipleCharacters(string $input, array $exp) {
+        $pos = 0;
+        $out = [];
+        $eof = strlen($input);
+        while ($pos < $eof) {
+            $p = UTF8::ord($input, $pos, $pos);
+            $out[] = is_int($p) ? $p : 0xFFFD;
+        }
+        $this->assertEquals($exp, $out);
+    }
+
+    public function provideStrings() {
+        return [
+            'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
+            'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
+            'invalid code' => ["\xFF", [65533]],
+            'ends early' => ["\xC0", [65533]],
+            'ends early 2' => ["\xE0", [65533]],
+            'invalid trail' => ["\xC0\x00", [65533, 0]],
+            'invalid trail 2' => ["\xC0\xC0", [65533, 65533]],
+            'invalid trail 3' => ["\xE0\x00", [65533, 0]],
+            'invalid trail 4' => ["\xE0\xC0", [65533, 65533]],
+            'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]],
+            'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]],
+            '> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]],
+            'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]],
+            'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]],
+            'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]],
+            'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]],
+            'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
+            'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
+            'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
+        ];
+    }
+}
--- a/tests/data/decode-multiple-characters.html
+++ b/tests/data/decode-multiple-characters.html
@@ -0,0 +1,67 @@
+<!DOCTYPE html>
+<pre></pre>
+<script>
+var data = [
+    // basics
+    { encoding: 'utf-8', input: [0x61, 0x62, 0x63, 0x31, 0x32, 0x33], name: 'sanity check' },
+    { encoding: 'utf-8', input: [0xE5, 0x8F, 0xA4, 0xE6, 0xB1, 0xA0, 0xE3, 0x82, 0x84, 0xE8, 0x9B, 0x99, 0xE9, 0xA3, 0x9B, 0xE3, 0x81, 0xB3, 0xE8, 0xBE, 0xBC, 0xE3, 0x82, 0x80, 0xE6, 0xB0, 0xB4, 0xE3, 0x81, 0xAE, 0xE9, 0x9F, 0xB3], name: 'multibyte control' },
+    // bad input
+    { encoding: 'utf-8', input: [0xFF], name: 'invalid code' },
+    { encoding: 'utf-8', input: [0xC0], name: 'ends early' },
+    { encoding: 'utf-8', input: [0xE0], name: 'ends early 2' },
+    { encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' },
+    { encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' },
+    { encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' },
+    { encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' },
+    { encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
+    // Overlong encodings
+    { encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
+    { encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' },
+    { encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' },
+    { encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' },
+    // UTF-16 surrogates encoded as code points in UTF-8
+    { encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' },
+    { encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' },
+    { encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' },
+    // mixed input
+    { encoding: 'utf-8', input: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xEF, 0xA3, 0xBF, 0xF4, 0x8F, 0xBF, 0xBD, 0xEF, 0xBF, 0xBE], name: 'mixed sample' }
+]
+data.forEach(function(data) {
+    var bytes = ""
+    data.input.forEach((p) => {
+        bytes = bytes + "\\x" + p.toString(16).padStart(2, "0").toUpperCase()
+    })
+    var codes = []
+    var text = new TextDecoder("utf-8").decode(new Uint8Array(data.input))
+    var b = 0
+    for (let a = 0; a < text.length; a++) {
+        let point = text.codePointAt(a)
+        if (point >= 55296 && point <= 57343) {
+            // codePointAt() at a lead surrogate's index already yields the full non-BMP code point; skip the trailing low surrogate that follows it in the JavaScript string
+            continue
+        }
+        codes[b++] = point
+    }
+    codes = codes.join(", ")
+    var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n"
+    document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line));
+})
+</script>
--- a/tests/phpunit.xml
+++ b/tests/phpunit.xml
@@ -17,5 +17,8 @@
 </filter>

 <testsuites>
+    <testsuite name="Codec">
+        <file>cases/TestCodec.php</file>
+    </testsuite>
 </testsuites>
 </phpunit>