Clean up static-method interface and test it

6 years ago · ca91a86744
4 changed files with 200 additions and 112 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@ -7,12 +7,6 @@ declare(strict_types=1);
 namespace MensBeam\UTF8;

 abstract class UTF8 {
-    public static $replacementChar = "\u{FFFD}";
-    public static $errMode = self::M_REPLACE;
-
-    const M_REPLACE = 0;
-    const M_SKIP = 1;
-    const M_HALT = 2;

    /** Retrieve a character from $string starting at byte offset $pos
     *
@ -20,35 +14,22 @@ abstract class UTF8 {
     *
     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
     */
-    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
+    public static function get(string $string, int $pos, &$next = null): string {
        start:
        // get the byte at the specified position
        $b = @$string[$pos];
        if (ord($b) < 0x80) {
            // if the byte is an ASCII character or end of input, simply return it
-            $next = $pos + 1;
+            if ($b !== "") {
+                $next = $pos + 1;
+            } else {
+                $next = $pos;
+            }
            return $b;
        } else {
            // otherwise determine the numeric code point of the character, as well as the position of the next character
-            $p = self::ord($string, $pos, $next, self::M_REPLACE);
-            if (is_int($p)) {
-                // if the character is valid, return its serialization
-                // we do a round trip (bytes > code point > bytes) to normalize overlong sequences
-                return self::chr($p);
-            } else {
-                $errMode = $errMode ?? self::$errMode;
-                if ($errMode==self::M_REPLACE) {
-                    // if the byte is invalid and we're supposed to replace, return a replacement character
-                    return self::$replacementChar;
-                } elseif ($errMode==self::M_SKIP) {
-                    // if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
-                    $pos = $next;
-                    goto start;
-                } else {
-                    // if the byte is invalid and we're supposed to halt, halt
-                    throw new \Exception;
-                }
-            }
+            $p = self::ord($string, $pos, $next);
+            return is_int($p) ? self::chr($p) : "\u{FFFD}";
        }
    }

@ -58,12 +39,11 @@ abstract class UTF8 {
     *
     * If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek
     */
-    public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int {
-        $errMode = $errMode ?? self::$errMode;
+    public static function seek(string $string, int $num, int $pos = null): int {
        if ($num > 0) {
            $pos = $pos ?? 0;
            do {
-                $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
+                $c = self::get($string, $pos, $pos); // the current position is getting overwritten with the next position, by reference
            } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF
            return $pos;
        } elseif ($num < 0) {
@ -74,71 +54,32 @@ abstract class UTF8 {
            }
            $num = abs($num);
            do {
-                $pos = self::sync($string, $pos -1, $errMode);
+                $pos = self::sync($string, $pos -1);
                $num--;
            } while ($num && $pos);
            return $pos;
        } else {
            // seeking zero characters is equivalent to a sync
-            return self::sync($string, $pos, $errMode);
+            return self::sync($string, $pos);
        }
    }

    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
-    public static function sync(string $string, int $pos, int $errMode = null): int {
-        $errMode = $errMode ?? self::$errMode;
-        start:
-        if (!$pos || $pos >= strlen($string)) {
-            // if we're at the start of the string or past its end, then this is the character start
-            return $pos;
-        }
-        // save the start position for later, and increment before the coming decrement loop
-        $s = $pos++;
-        // examine the current byte and skip up to three continuation bytes, going backward and counting the number of examined bytes (between 1 and 4)
-        $t = 0;
-        do {
-            $pos--;
-            $t++;
-            $b = @$string[$pos];
-        } while (
-            $b >= "\x80" && $b <= "\xBF" && // continuation bytes
-            ($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
-            $pos > 0 // stop once the start of the string has been reached
-        );
-        // attempt to extract a code point at the current position
-        $p = self::ord($string, $pos, $n, self::M_REPLACE);
-        // if the position of the character after the one we just consumed is earlier than our start position,
-        // then there was at least one invalid sequence between the consumed character and the start position
-        if ($n < $s) {
-            if ($errMode==self::M_SKIP) {
-                // if we're supposed to skip invalid sequences, there is no need to do anything
-            } elseif ($errMode==self::M_REPLACE) {
-                // if we're supposed to replace invalid sequences, return the starting offset: it is itself a character
-                return $s;
-            } else {
-                // otherwise if the character is invalid and we're expected to halt, halt
-                throw new \Exception;
+    public static function sync(string $string, int $pos): int {
+        $b = ord(@$string[$pos]);
+        if ($b < 0x80) {
+            // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
+            return min(max($pos,0), strlen($string));
+        } else {
+            $s = $pos;
+            while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+                $b = ord(@$string[--$pos]);
            }
-        }
-        // if the consumed character is valid, return the current position
-        if (is_int($p)) {
-            return $pos;
-        } elseif ($errMode==self::M_SKIP) {
-            // if we're supposed to skip invalid sequences:
-            if ($pos < 1) {
-                // if we're already at the start of the string, give up
-                return $pos;
+            if (is_null(self::ord($string, $pos, $next))) {
+                return $s;
            } else {
-                // otherwise skip over the last examined byte and start over
-                $pos--;
-                goto start;
+                return ($next > $s) ? $pos : $s;
            }
-        } elseif ($errMode==self::M_REPLACE) {
-            // if we're supposed to replace invalid sequences, return the current offset: we've synchronized
-            return $pos;
-        } else {
-            // otherwise if the character is invalid and we're expected to halt, halt
-            throw new \Exception;
        }
    }

@ -177,20 +118,19 @@ abstract class UTF8 {
     *
     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
     *
-     * Upon error, returns false; if $char is the empty string or $pos is beyond the end of the string, null is returned
+     * Upon error, returns null; if $string is the empty string or $pos is beyond the end of the string, false is returned
     *
     * $next is a variable in which to store the next byte offset at which a character starts
     */
-    public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
+    public static function ord(string $string, int $pos = 0, &$next = null) {
        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
        // though it differs from a slavish implementation because it operates on only a single
        // character rather than a whole stream
-        start:
        // optimization for ASCII characters
        $b = @$string[$pos];
        if ($b=="") {
-            $next = $pos + 1;
-            return null;
+            $next = $pos;
+            return false;
        } elseif (($b = ord($b)) < 0x80) {
            $next = $pos + 1;
            return $b;
@ -224,25 +164,11 @@ abstract class UTF8 {
                    $point = $b & 0x7;
                } else { // invalid byte
                    $next = $pos;
-                    switch ($errMode ?? self::$errMode) {
-                        case self::M_SKIP:
-                            goto start;
-                        case self::M_REPLACE:
-                            return false;
-                        default:
-                            throw new \Exception;
-                    }
+                    return null;
                }
            } elseif ($b < $lower || $b > $upper) {
                $next = $pos - 1;
-                switch ($errMode ?? self::$errMode) {
-                    case self::M_SKIP:
-                        goto start;
-                    case self::M_REPLACE:
-                        return false;
-                    default:
-                        throw new \Exception;
-                }
+                return null;
            } else {
                $lower = 0x80;
                $upper = 0xBF;
--- a/perf/perf.php
+++ b/perf/perf.php
@ -16,6 +16,13 @@ $files = [
 ];

 $tests = [
+    'Intl characters' => ["intl", function(string $text) {
+        $i = \IntlBreakIterator::createCodePointInstance();
+        $i->setText($text);
+        foreach ($i as $b) {
+            \IntlChar::chr($i->getLastCodePoint());
+        }
+    }],
    'Native characters (obj)' => ["", function(string $text) {
        $c = null;
        $i = new \MensBeam\UTF8\UTF8String($text);
@ -25,16 +32,14 @@ $tests = [
    }],
    'Native characters (func)' => ["", function(string $text) {
        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::get($text, $pos, $pos);
+        while (($p = UTF8::get($text, $pos, $pos)) !== "") {
        }
    }],
-    'Intl characters' => ["intl", function(string $text) {
+    'Intl code points' => ["intl", function(string $text) {
        $i = \IntlBreakIterator::createCodePointInstance();
        $i->setText($text);
        foreach ($i as $b) {
-            \IntlChar::chr($i->getLastCodePoint());
+            $i->getLastCodePoint();
        }
    }],
    'Native code points (obj)' => ["", function(string $text) {
@ -46,9 +51,7 @@ $tests = [
    }],
    'Native code points (func)' => ["", function(string $text) {
        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::ord($text, $pos, $pos);
+        while (($p = UTF8::ord($text, $pos, $pos)) !== false) {
        }
    }],
 ];
--- a/tests/cases/TestFunctions.php
+++ b/tests/cases/TestFunctions.php
@ -0,0 +1,156 @@
+<?php
+/** @license MIT
+ * Copyright 2017 J. King, Dustin Wilson et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\UTF8\TestCase\Codec;
+
+use MensBeam\UTF8\UTF8;
+
+class TestFunctions extends \PHPUnit\Framework\TestCase {
+    
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8::ord
+    */
+    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
+        $off = 0;
+        while (($p = UTF8::ord($input, $off, $off)) !== false) {
+            $out[] = $p ?? 0xFFFD;
+        }
+        $this->assertEquals($exp, $out);
+    }
+    
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8::get
+    */
+    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
+        $exp = array_map(function ($v) {
+            return \IntlChar::chr($v);
+        }, $exp);
+        $off = 0;
+        while (($p = UTF8::get($input, $off, $off)) !== "") {
+            $out[] = $p ?? 0xFFFD;
+        }
+        $this->assertEquals($exp, $out);
+    }
+    
+    /**
+     * @covers \MensBeam\UTF8\UTF8::get
+     * @covers \MensBeam\UTF8\UTF8::ord
+    */
+    public function testTraversePastTheEndOfAString() {
+        $input = "\u{10FFFD}";
+
+        $off = 0;
+        $this->assertSame(0, $off);
+        $this->assertSame("\u{10FFFD}", UTF8::get($input, $off, $off));
+        $this->assertSame(4, $off);
+        $this->assertSame("", UTF8::get($input, $off, $off));
+        $this->assertSame(4, $off);
+        $off = 0;
+        $this->assertSame(0, $off);
+        $this->assertSame(0x10FFFD, UTF8::ord($input, $off, $off));
+        $this->assertSame(4, $off);
+        $this->assertSame(false, UTF8::ord($input, $off, $off));
+        $this->assertSame(4, $off);
+    }
+    
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8::sync
+    */
+    public function testSTepBackThroughAString(string $input, array $points) {
+        $off = strlen($input);
+        $p = [];
+        while ($off > 0) {
+            $off = UTF8::sync($input, $off - 1);
+            $p[] = UTF8::ord($input, $off) ?? 0xFFFD;
+        }
+        $p = array_reverse($p);
+        $this->assertSame($points, $p);
+    }
+    
+    /**
+     * @covers \MensBeam\UTF8\UTF8::seek
+    */
+    public function testSeekThroughAString() {
+        /*
+            Char 0  U+007A   (1 byte)  Offset 0
+            Char 1  U+00A2   (2 bytes) Offset 1
+            Char 2  U+6C34   (3 bytes) Offset 3
+            Char 3  U+1D11E  (4 bytes) Offset 6
+            Char 4  U+F8FF   (3 bytes) Offset 10
+            Char 5  U+10FFFD (4 bytes) Offset 13
+            Char 6  U+FFFE   (3 bytes) Offset 17
+            End of string at char 7, offset 20
+        */
+        $input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE";
+        $off = 0;
+        $off = UTF8::seek($input, 0, $off);
+        $this->assertSame(0, $off);
+        $off = UTF8::seek($input, -1, $off);
+        $this->assertSame(0, $off);
+        $off = UTF8::seek($input, 1, $off);
+        $this->assertSame(1, $off);
+        $off = UTF8::seek($input, 2, $off);
+        $this->assertSame(6, $off);
+        $off = UTF8::seek($input, 4, $off);
+        $this->assertSame(20, $off);
+        $off = UTF8::seek($input, 1, $off);
+        $this->assertSame(20, $off);
+        $off = UTF8::seek($input, -3, $off);
+        $this->assertSame(10, $off);
+        $off = UTF8::seek($input, -10, $off);
+        $this->assertSame(0, $off);
+    }
+
+    public function provideStrings() {
+        return [
+            // control samples
+            'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
+            'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
+            'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
+            // various invalid sequences
+            'invalid code' => ["\xFF", [65533]],
+            'ends early' => ["\xC0", [65533]],
+            'ends early 2' => ["\xE0", [65533]],
+            'invalid trail' => ["\xC0\x00", [65533, 0]],
+            'invalid trail 2' => ["\xC0\xC0", [65533, 65533]],
+            'invalid trail 3' => ["\xE0\x00", [65533, 0]],
+            'invalid trail 4' => ["\xE0\xC0", [65533, 65533]],
+            'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]],
+            'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]],
+            '> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]],
+            'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]],
+            'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]],
+            'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]],
+            'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]],
+            'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
+            'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
+            // UTF-16 surrogates
+            'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
+            'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
+            'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            // self-sync edge cases
+            'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]],
+            'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]],
+        ];
+    }
+}
--- a/tests/phpunit.xml
+++ b/tests/phpunit.xml
@ -20,5 +20,8 @@
    <testsuite name="Class instance">
        <file>cases/TestInstance.php</file>
    </testsuite>
+    <testsuite name="Static methods">
+        <file>cases/TestFunctions.php</file>
+    </testsuite>
 </testsuites>
 </phpunit>