Start on alternate object-based interface

This is both simpler and slightly faster, yielding a performance improvement of between 2% and 5%
6 years ago · 1ed3c36a65
4 changed files with 121 additions and 156 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@ -14,44 +14,6 @@ abstract class UTF8 {
    const M_SKIP = 1;
    const M_HALT = 2;

-    /** Retrieve a character from $string starting at byte offset $pos
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     *
-     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
-     */
-    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
-        start:
-        // get the byte at the specified position
-        $b = @$string[$pos];
-        if (ord($b) < 0x80) {
-            // if the byte is an ASCII character or end of input, simply return it
-            $next = $pos + 1;
-            return $b;
-        } else {
-            // otherwise determine the numeric code point of the character, as well as the position of the next character
-            $p = self::ord($string, $pos, $next, self::M_REPLACE);
-            if (is_int($p)) {
-                // if the character is valid, return its serialization
-                // we do a round trip (bytes > code point > bytes) to normalize overlong sequences
-                return self::chr($p);
-            } else {
-                $errMode = $errMode ?? self::$errMode;
-                if ($errMode==self::M_REPLACE) {
-                    // if the byte is invalid and we're supposed to replace, return a replacement character
-                    return self::$replacementChar;
-                } elseif ($errMode==self::M_SKIP) {
-                    // if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
-                    $pos = $next;
-                    goto start;
-                } else {
-                    // if the byte is invalid and we're supposed to halt, halt
-                    throw new \Exception;
-                }
-            }
-        }
-    }
-
    /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
     *
     * If $num is negative, the operation will be performed in reverse
@ -173,87 +135,6 @@ abstract class UTF8 {
        }
    }

-    /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
-     *
-     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
-     *
-     * Upon error, returns false; if $char is the empty string or $pos is beyond the end of the string, null is returned
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     */
-    public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
-        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
-        // though it differs from a slavish implementation because it operates on only a single
-        // character rather than a whole stream
-        start:
-        // optimization for ASCII characters
-        $b = @$string[$pos];
-        if ($b=="") {
-            $next = $pos + 1;
-            return null;
-        } elseif (($b = ord($b)) < 0x80) {
-            $next = $pos + 1;
-            return $b;
-        }
-        $point = 0;
-        $seen = 0;
-        $needed = 1;
-        $lower = 0x80;
-        $upper = 0xBF;
-        while ($seen < $needed) {
-            $b = ord(@$string[$pos++]);
-            if (!$seen) {
-                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
-                    $needed = 2;
-                    $point = $b & 0x1F;
-                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
-                    $needed = 3;
-                    if ($b==0xE0) {
-                        $lower = 0xA0;
-                    } elseif ($b==0xED) {
-                        $upper = 0x9F;
-                    }
-                    $point = $b & 0xF;
-                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
-                    $needed = 4;
-                    if ($b==0xF0) {
-                        $lower = 0x90;
-                    } elseif ($b==0xF4) {
-                        $upper = 0x8F;
-                    }
-                    $point = $b & 0x7;
-                } else { // invalid byte
-                    $next = $pos;
-                    switch ($errMode ?? self::$errMode) {
-                        case self::M_SKIP:
-                            goto start;
-                        case self::M_REPLACE:
-                            return false;
-                        default:
-                            throw new \Exception;
-                    }
-                }
-            } elseif ($b < $lower || $b > $upper) {
-                $next = $pos - 1;
-                switch ($errMode ?? self::$errMode) {
-                    case self::M_SKIP:
-                        goto start;
-                    case self::M_REPLACE:
-                        return false;
-                    default:
-                        throw new \Exception;
-                }
-            } else {
-                $lower = 0x80;
-                $upper = 0xBF;
-                $point = ($point << 6) | ($b & 0x3F);
-            }
-            $seen++;
-        }
-        $next = $pos;
-        return $point;
-    }
-
    /** Returns the UTF-8 encoding of $codePoint
     *
     * If $codePoint is less than 0 or greater than 1114111, an empty string is returned
--- a/lib/UTF8String.php
+++ b/lib/UTF8String.php
@ -0,0 +1,89 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\UTF8;
+
+/** Forward-only reader which decodes a UTF-8 byte string one character at a time
+ *
+ * The reader keeps its own position, so callers simply call nextChr() or nextOrd()
+ * repeatedly until the end-of-input value is returned
+ */
+class UTF8String {
+    /** The byte string being decoded */
+    protected $string;
+    /** The byte offset at which the next character will be read */
+    protected $posByte = 0;
+    /** The number of characters consumed so far; each invalid sequence counts as one character */
+    protected $posChar = 0;
+
+    /** Creates a reader positioned at the start of $string */
+    public function __construct(string $string) {
+        $this->string = $string;
+    }
+
+    /** Returns the next character of the string and advances past it
+     *
+     * ASCII bytes are returned as-is; any other character is decoded with nextOrd() and
+     * re-encoded with UTF8::chr(), with U+FFFD substituted for an invalid sequence
+     *
+     * At end of input the empty string is returned and the position is left untouched
+     */
+    public function nextChr(): string {
+        // get the byte at the current position; an out-of-range offset yields the empty string
+        $b = $this->string[$this->posByte] ?? "";
+        if ($b === "") {
+            // end of input: report it without moving, so that repeated calls cannot walk the position past the end
+            return "";
+        } elseif (ord($b) < 0x80) {
+            // if the byte is an ASCII character, simply return it
+            $this->posChar++;
+            $this->posByte++;
+            return $b;
+        } else {
+            // otherwise return the serialization of the code point at the current position
+            return UTF8::chr($this->nextOrd() ?? 0xFFFD);
+        }
+    }
+
+    /** Decodes the next character of the string into a numeric code point and advances past it
+     *
+     * Returns the code point as an integer upon success, null if the bytes at the current
+     * position do not form a valid character, or false at end of input
+     *
+     * After an invalid sequence the position rests on the first byte which was not consumed,
+     * so decoding can resume with the next call; at end of input the position is left untouched
+     */
+    public function nextOrd() {
+        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
+        // though it differs from a slavish implementation because it operates on only a single
+        // character rather than a whole stream
+        $b = $this->string[$this->posByte] ?? "";
+        if ($b === "") {
+            // end of input: nothing is consumed, so neither counter moves
+            return false;
+        }
+        // from here on exactly one character is consumed, whether it is valid or not
+        $this->posChar++;
+        // optimization for ASCII characters
+        if (($b = ord($b)) < 0x80) {
+            $this->posByte++;
+            return $b;
+        }
+        $point = 0;
+        $seen = 0;
+        $needed = 1;
+        $lower = 0x80;
+        $upper = 0xBF;
+        while ($seen < $needed) {
+            // a truncated sequence reads past the end of the string; the missing byte is treated
+            // as 0x00, which is below every permitted continuation range and so fails validation
+            $b = ord($this->string[$this->posByte] ?? "\x00");
+            if (!$seen) {
+                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
+                    $needed = 2;
+                    $point = $b & 0x1F;
+                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
+                    $needed = 3;
+                    if ($b === 0xE0) {
+                        // reject overlong three-byte encodings
+                        $lower = 0xA0;
+                    } elseif ($b === 0xED) {
+                        // reject UTF-16 surrogates
+                        $upper = 0x9F;
+                    }
+                    $point = $b & 0xF;
+                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
+                    $needed = 4;
+                    if ($b === 0xF0) {
+                        // reject overlong four-byte encodings
+                        $lower = 0x90;
+                    } elseif ($b === 0xF4) {
+                        // reject code points above U+10FFFF
+                        $upper = 0x8F;
+                    }
+                    $point = $b & 0x7;
+                } else { // invalid byte
+                    // the offending lead byte is consumed on its own
+                    $this->posByte++;
+                    return null;
+                }
+            } elseif ($b < $lower || $b > $upper) {
+                // the offending byte is not consumed: it may itself start a valid character
+                return null;
+            } else {
+                // only the first continuation byte has a restricted range
+                $lower = 0x80;
+                $upper = 0xBF;
+                $point = ($point << 6) | ($b & 0x3F);
+            }
+            $this->posByte++;
+            $seen++;
+        }
+        return $point;
+    }
+}
--- a/perf/perf.php
+++ b/perf/perf.php
@ -17,10 +17,10 @@ $files = [

 $tests = [
    'Native characters' => ["", function(string $text) {
-        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::get($text, $pos, $pos);
+        $c = null;
+        $i = new \MensBeam\UTF8\UTF8String($text);
+        while ($c !== "") {
+            $c = $i->nextChr();
        }
    }],
    'Intl characters' => ["intl", function(string $text) {
@ -31,10 +31,10 @@ $tests = [
        }
    }],
    'Native code points' => ["", function(string $text) {
-        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::ord($text, $pos, $pos);
+        $p = null;
+        $i = new \MensBeam\UTF8\UTF8String($text);
+        while ($p !== false) {
+            $p = $i->nextOrd();
        }
    }],
 ];
--- a/tests/cases/TestCodec.php
+++ b/tests/cases/TestCodec.php
@ -6,40 +6,35 @@
 declare(strict_types=1);
 namespace MensBeam\UTF8\TestCase\Codec;

-use MensBeam\UTF8\UTF8;
+use MensBeam\UTF8\UTF8String;

-/** @covers \MensBeam\UTF8\UTF8 */
 class TestConf extends \PHPUnit\Framework\TestCase {
-
-    /** @group optional */
-    public function testDecodeSingleCharacter() {
-        for ($a = 0; $a <= 0x10FFFF; $a++) {
-            // the UTF-8 encoding of the code point
-            $bytes = \IntlChar::chr($a);
-            // the expected result of decoding the bytes: surrogates are supposed to result in failures on every byte
-            $exp1 = ($a >= 55296 && $a <= 57343) ? array_fill(0, strlen($bytes), false) : [$a];
-            // the expected next-character poisitions: surrogates are supposed to return multiple positions; others always return only the end of the string
-            $exp2 = ($a >= 55296 && $a <= 57343) ? range(1, strlen($bytes)) : [strlen($bytes)];
-            $act1 = [];
-            $act2 = [];
-            $pos = 0;
-            do {
-                $act1[] = UTF8::ord($bytes, $pos, $pos);
-                $act2[] = $pos;
-            } while ($pos < strlen($bytes));
-            $this->assertSame($exp1, $act1, 'Character '.strtoupper(bin2hex(\IntlChar::chr($a))).' was not decoded correctly.');
-            $this->assertSame($exp2, $act2, 'Next offset for character '.strtoupper(bin2hex(\IntlChar::chr($a))).' is incorrect.');
+    
+    /** Decodes each input string with nextOrd() and compares the resulting code points with the expectation
+     *
+     * Invalid sequences (reported as null) are recorded as U+FFFD
+     *
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::__construct
+     * @covers \MensBeam\UTF8\UTF8String::nextOrd
+     */
+    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
+        // start from an empty list so that an empty input compares against [] rather than an undefined variable
+        $out = [];
+        $s = new UTF8String($input);
+        while (($p = $s->nextOrd()) !== false) {
+            $out[] = $p ?? 0xFFFD;
        }
+        $this->assertEquals($exp, $out);
    }
    
-    /** @dataProvider provideStrings */
-    public function testDecodeMultipleCharacters(string $input, array $exp) {
-        $pos = 0;
-        $out = [];
-        $eof = strlen($input);
-        while ($pos < $eof) {
-            $p = UTF8::ord($input, $pos, $pos);
-            $out[] = is_int($p) ? $p : 0xFFFD;
+    /** Decodes each input string with nextChr() and compares the resulting characters with the expectation
+     *
+     * The expected code points are converted to UTF-8 strings with IntlChar::chr() before comparison
+     *
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::__construct
+     * @covers \MensBeam\UTF8\UTF8String::nextChr
+     */
+    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
+        $exp = array_map(function($v) {
+            return \IntlChar::chr($v);
+        }, $exp);
+        // start from an empty list so that an empty input compares against [] rather than an undefined variable
+        $out = [];
+        $s = new UTF8String($input);
+        while (($c = $s->nextChr()) !== "") {
+            $out[] = $c;
        }
        $this->assertEquals($exp, $out);
    }