Remove functional interface

The maintenance burden is not worth the advantages the functional interface provides in limited situations. Moreover, if other decoders are to be implemented, most multi-byte schemes would not be able to support a functional interface of similar simplicity, and single-byte schemes wouldn't benefit much.
2018-08-02 14:19:03 -04:00 · 2018-08-02 14:19:03 -04:00 · 88497ddc41
commit 88497ddc41
parent 7409520477
4 changed files with 3 additions and 382 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@@ -1,210 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2018 J. King et al.
- * See LICENSE and AUTHORS files for details */
-
-declare(strict_types=1);
-namespace MensBeam\UTF8;
-
-abstract class UTF8 {
-
-    /** Retrieve a character from $string starting at byte offset $pos
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     *
-     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
-     */
-    public static function get(string $string, int $pos, &$next = null): string {
-        start:
-        // get the byte at the specified position
-        $b = @$string[$pos];
-        if (ord($b) < 0x80) {
-            // if the byte is an ASCII character or end of input, simply return it
-            if ($b !== "") {
-                $next = $pos + 1;
-            } else {
-                $next = $pos;
-            }
-            return $b;
-        } else {
-            // otherwise determine the numeric code point of the character, as well as the position of the next character
-            $p = self::ord($string, $pos, $next);
-            return is_int($p) ? self::chr($p) : "\u{FFFD}";
-        }
-    }
-
-    /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
-     *
-     * If $num is negative, the operation will be performed in reverse
-     *
-     * If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek
-     */
-    public static function seek(string $string, int $num, int $pos = null): int {
-        if ($num > 0) {
-            $pos = $pos ?? 0;
-            do {
-                $c = self::get($string, $pos, $pos); // the current position is getting overwritten with the next position, by reference
-            } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF
-            return $pos;
-        } elseif ($num < 0) {
-            $pos = $pos ?? strlen($string);
-            if (!$pos) {
-                // if we're already at the start of the string, we can't go further back
-                return $pos;
-            }
-            $num = abs($num);
-            do {
-                $pos = self::sync($string, $pos -1);
-                $num--;
-            } while ($num && $pos);
-            return $pos;
-        } else {
-            // seeking zero characters is equivalent to a sync
-            return self::sync($string, $pos);
-        }
-    }
-
-    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
-    public static function sync(string $string, int $pos): int {
-        $b = ord(@$string[$pos]);
-        if ($b < 0x80) {
-            // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
-            return min(max($pos,0), strlen($string));
-        } else {
-            $s = $pos;
-            while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
-                $b = ord(@$string[--$pos]);
-            }
-            if (is_null(self::ord($string, $pos, $next))) {
-                return $s;
-            } else {
-                return ($next > $s) ? $pos : $s;
-            }
-        }
-    }
-
-    public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int {
-        $errMode = $errMode ?? self::$errMode;
-        $end = $end ?? strlen($string);
-        if (substr($string, $start, ($end - $start)) =="") {
-            return 0;
-        }
-        $count = 0;
-        $pos = $start;
-        do {
-            $c = self::get($string, $pos, $pos, $errMode);
-        } while ($c != "" && ++$count && $pos < $end);
-        return $count;
-    }
-
-    public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string {
-        $errMode = $errMode ?? self::$errMode;
-        if ($length > 0) {
-            $pos = $start;
-            $buffer = "";
-            do {
-                $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
-                $buffer .= $c;
-            } while (--$length && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF
-            $next = $pos;
-            return $buffer;
-        } else {
-            $next = self::sync($string, $start, $errMode);
-            return "";
-        }
-    }
-
-    /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
-     *
-     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
-     *
-     * Upon error, returns null; if $char is the empty string or $pos is beyond the end of the string, false is returned
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     */
-    public static function ord(string $string, int $pos = 0, &$next = null) {
-        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
-        // though it differs from a slavish implementation because it operates on only a single
-        // character rather than a whole stream
-        // optimization for ASCII characters
-        $b = @$string[$pos];
-        if ($b=="") {
-            $next = $pos;
-            return false;
-        } elseif (($b = ord($b)) < 0x80) {
-            $next = $pos + 1;
-            return $b;
-        }
-        $point = 0;
-        $seen = 0;
-        $needed = 1;
-        $lower = 0x80;
-        $upper = 0xBF;
-        while ($seen < $needed) {
-            $b = ord(@$string[$pos++]);
-            if (!$seen) {
-                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
-                    $needed = 2;
-                    $point = $b & 0x1F;
-                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
-                    $needed = 3;
-                    if ($b==0xE0) {
-                        $lower = 0xA0;
-                    } elseif ($b==0xED) {
-                        $upper = 0x9F;
-                    }
-                    $point = $b & 0xF;
-                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
-                    $needed = 4;
-                    if ($b==0xF0) {
-                        $lower = 0x90;
-                    } elseif ($b==0xF4) {
-                        $upper = 0x8F;
-                    }
-                    $point = $b & 0x7;
-                } else { // invalid byte
-                    $next = $pos;
-                    return null;
-                }
-            } elseif ($b < $lower || $b > $upper) {
-                $next = $pos - 1;
-                return null;
-            } else {
-                $lower = 0x80;
-                $upper = 0xBF;
-                $point = ($point << 6) | ($b & 0x3F);
-            }
-            $seen++;
-        }
-        $next = $pos;
-        return $point;
-    }
-
-    /** Returns the UTF-8 encoding of $codePoint
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an empty string is returned
-     */
-    public static function chr(int $codePoint): string {
-        // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
-        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
-            return "";
-        } elseif ($codePoint < 128) {
-            return chr($codePoint);
-        } elseif ($codePoint < 0x800) {
-            $count = 1;
-            $offset = 0xC0;
-        } elseif ($codePoint < 0x10000) {
-            $count = 2;
-            $offset = 0xE0;
-        } else {
-            $count = 3;
-            $offset = 0xF0;
-        }
-        $bytes = chr(($codePoint >> (6 * $count)) + $offset);
-        while ($count > 0) {
-            $bytes .= chr(0x80 | (($codePoint >> (6 * ($count - 1))) & 0x3F));
-            $count--;
-        }
-        return $bytes;
-    }
-}
--- a/perf/perf.php
+++ b/perf/perf.php
@@ -28,18 +28,13 @@ $tests = [
            $b = $c;
        }
    }],
-    'Native characters (obj)' => ["", function(string $text) {
+    'Native characters' => ["", function(string $text) {
        $c = null;
        $i = new \MensBeam\UTF8\UTF8String($text);
        while ($c !== "") {
            $c = $i->nextChr();
        }
    }],
-    'Native characters (func)' => ["", function(string $text) {
-        $pos = 0;
-        while (($p = UTF8::get($text, $pos, $pos)) !== "") {
-        }
-    }],
    'Intl code points' => ["intl", function(string $text) {
        $i = (function($text) {
            $i = \IntlBreakIterator::createCodePointInstance();
@@ -52,18 +47,13 @@ $tests = [
            $b = $c;
        }
    }],
-    'Native code points (obj)' => ["", function(string $text) {
+    'Native code points' => ["", function(string $text) {
        $p = null;
        $i = new \MensBeam\UTF8\UTF8String($text);
        while ($p !== false) {
            $p = $i->nextOrd();
        }
    }],
-    'Native code points (func)' => ["", function(string $text) {
-        $pos = 0;
-        while (($p = UTF8::ord($text, $pos, $pos)) !== false) {
-        }
-    }],
 ];

 if (!file_exists(__DIR__."/docs/")) {
--- a/tests/cases/TestFunctions.php
+++ b/tests/cases/TestFunctions.php
@@ -1,156 +0,0 @@
-<?php
-/** @license MIT
- * Copyright 2017 J. King, Dustin Wilson et al.
- * See LICENSE and AUTHORS files for details */
-
-declare(strict_types=1);
-namespace MensBeam\UTF8\TestCase\Codec;
-
-use MensBeam\UTF8\UTF8;
-
-class TestFunctions extends \PHPUnit\Framework\TestCase {
-    
-    /**
-     * @dataProvider provideStrings
-     * @covers \MensBeam\UTF8\UTF8::ord
-    */
-    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
-        $off = 0;
-        while (($p = UTF8::ord($input, $off, $off)) !== false) {
-            $out[] = $p ?? 0xFFFD;
-        }
-        $this->assertEquals($exp, $out);
-    }
-    
-    /**
-     * @dataProvider provideStrings
-     * @covers \MensBeam\UTF8\UTF8::get
-    */
-    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
-        $exp = array_map(function ($v) {
-            return \IntlChar::chr($v);
-        }, $exp);
-        $off = 0;
-        while (($p = UTF8::get($input, $off, $off)) !== "") {
-            $out[] = $p ?? 0xFFFD;
-        }
-        $this->assertEquals($exp, $out);
-    }
-    
-    /**
-     * @covers \MensBeam\UTF8\UTF8::get
-     * @covers \MensBeam\UTF8\UTF8::ord
-    */
-    public function testTraversePastTheEndOfAString() {
-        $input = "\u{10FFFD}";
-
-        $off = 0;
-        $this->assertSame(0, $off);
-        $this->assertSame("\u{10FFFD}", UTF8::get($input, $off, $off));
-        $this->assertSame(4, $off);
-        $this->assertSame("", UTF8::get($input, $off, $off));
-        $this->assertSame(4, $off);
-        $off = 0;
-        $this->assertSame(0, $off);
-        $this->assertSame(0x10FFFD, UTF8::ord($input, $off, $off));
-        $this->assertSame(4, $off);
-        $this->assertSame(false, UTF8::ord($input, $off, $off));
-        $this->assertSame(4, $off);
-    }
-    
-    /**
-     * @dataProvider provideStrings
-     * @covers \MensBeam\UTF8\UTF8::sync
-    */
-    public function testSTepBackThroughAString(string $input, array $points) {
-        $off = strlen($input);
-        $p = [];
-        while ($off > 0) {
-            $off = UTF8::sync($input, $off - 1);
-            $p[] = UTF8::ord($input, $off) ?? 0xFFFD;
-        }
-        $p = array_reverse($p);
-        $this->assertSame($points, $p);
-    }
-    
-    /**
-     * @covers \MensBeam\UTF8\UTF8::seek
-    */
-    public function testSeekThroughAString() {
-        /*
-            Char 0  U+007A   (1 byte)  Offset 0
-            Char 1  U+00A2   (2 bytes) Offset 1
-            Char 2  U+6C34   (3 bytes) Offset 3
-            Char 3  U+1D11E  (4 bytes) Offset 6
-            Char 4  U+F8FF   (3 bytes) Offset 10
-            Char 5  U+10FFFD (4 bytes) Offset 13
-            Char 6  U+FFFE   (3 bytes) Offset 17
-            End of string at char 7, offset 20
-        */
-        $input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE";
-        $off = 0;
-        $off = UTF8::seek($input, 0, $off);
-        $this->assertSame(0, $off);
-        $off = UTF8::seek($input, -1, $off);
-        $this->assertSame(0, $off);
-        $off = UTF8::seek($input, 1, $off);
-        $this->assertSame(1, $off);
-        $off = UTF8::seek($input, 2, $off);
-        $this->assertSame(6, $off);
-        $off = UTF8::seek($input, 4, $off);
-        $this->assertSame(20, $off);
-        $off = UTF8::seek($input, 1, $off);
-        $this->assertSame(20, $off);
-        $off = UTF8::seek($input, -3, $off);
-        $this->assertSame(10, $off);
-        $off = UTF8::seek($input, -10, $off);
-        $this->assertSame(0, $off);
-    }
-
-    public function provideStrings() {
-        return [
-            // control samples
-            'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
-            'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
-            'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
-            // various invalid sequences
-            'invalid code' => ["\xFF", [65533]],
-            'ends early' => ["\xC0", [65533]],
-            'ends early 2' => ["\xE0", [65533]],
-            'invalid trail' => ["\xC0\x00", [65533, 0]],
-            'invalid trail 2' => ["\xC0\xC0", [65533, 65533]],
-            'invalid trail 3' => ["\xE0\x00", [65533, 0]],
-            'invalid trail 4' => ["\xE0\xC0", [65533, 65533]],
-            'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]],
-            'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]],
-            '> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]],
-            'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]],
-            'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]],
-            'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]],
-            'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]],
-            'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]],
-            'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]],
-            'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]],
-            'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]],
-            'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]],
-            'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]],
-            'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]],
-            'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
-            'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
-            'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
-            'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
-            // UTF-16 surrogates
-            'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
-            'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
-            'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
-            // self-sync edge cases
-            'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]],
-            'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]],
-        ];
-    }
-}
--- a/tests/phpunit.xml
+++ b/tests/phpunit.xml
@@ -17,11 +17,8 @@
 </filter>

 <testsuites>
-    <testsuite name="Class instance">
+    <testsuite name="UTF-8">
        <file>cases/TestInstance.php</file>
    </testsuite>
-    <testsuite name="Static methods">
-        <file>cases/TestFunctions.php</file>
-    </testsuite>
 </testsuites>
 </phpunit>