Restore deleted portion of functional interface

Also added comparative performance measurement
6 years ago · ac5e91f843
2 changed files with 138 additions and 2 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@ -14,6 +14,44 @@ abstract class UTF8 {
    const M_SKIP = 1;
    const M_HALT = 2;

+    /** Fetch the single character of $string that begins at byte offset $pos
+     *
+     * The byte offset at which the following character begins is written to $next
+     *
+     * An ASCII byte is returned unchanged, and the empty string is returned when $pos lies beyond the end of $string
+     *
+     * Multi-byte characters are decoded and re-encoded so that the result is always a normalized serialization;
+     * invalid input is handled according to $errMode (or the class default when $errMode is null): it is either
+     * replaced by the replacement character, skipped, or reported by throwing an exception
+     */
+    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
+        while (true) {
+            // look at the byte found at the requested offset
+            $byte = @$string[$pos];
+            if (ord($byte) < 0x80) {
+                // ASCII characters and end of input need no decoding at all
+                $next = $pos + 1;
+                return $byte;
+            }
+            // decode in replace mode so that failure is reported as a non-integer and $next is still filled in
+            $codePoint = self::ord($string, $pos, $next, self::M_REPLACE);
+            if (is_int($codePoint)) {
+                // valid character: the bytes > code point > bytes round trip yields its normalized form
+                return self::chr($codePoint);
+            }
+            // invalid input: decide what to do based on the effective error mode
+            $errMode = $errMode ?? self::$errMode;
+            if ($errMode==self::M_REPLACE) {
+                // substitute a replacement character for the invalid sequence
+                return self::$replacementChar;
+            }
+            if ($errMode!=self::M_SKIP) {
+                // neither replacing nor skipping, so halt
+                throw new \Exception;
+            }
+            // skipping: move to the next candidate offset and try again
+            $pos = $next;
+        }
+    }
+
    /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
     *
     * If $num is negative, the operation will be performed in reverse
@ -135,6 +173,87 @@ abstract class UTF8 {
        }
    }

+    /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
+     *
+     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
+     *
+     * If $string is the empty string or $pos is beyond the end of the string, null is returned
+     *
+     * Upon error the outcome depends on $errMode (or the class default when $errMode is null):
+     * in replace mode false is returned; in skip mode decoding resumes at the next candidate
+     * offset and the result of that attempt is returned instead; in any other mode an exception is thrown
+     *
+     * $next is a variable in which to store the next byte offset at which a character starts
+     */
+    public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
+        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
+        // though it differs from a slavish implementation because it operates on only a single
+        // character rather than a whole stream
+        start:
+        // optimization for ASCII characters
+        $b = @$string[$pos];
+        if ($b=="") {
+            // end of input
+            $next = $pos + 1;
+            return null;
+        } elseif (($b = ord($b)) < 0x80) {
+            $next = $pos + 1;
+            return $b;
+        }
+        // decoder state; this is (re)initialized every time decoding starts over
+        $point = 0;
+        $seen = 0;
+        $needed = 1;
+        // permitted range for the next continuation byte
+        $lower = 0x80;
+        $upper = 0xBF;
+        while ($seen < $needed) {
+            // reading past the end of the string yields byte value 0, which is never a valid continuation byte
+            $b = ord(@$string[$pos++]);
+            if (!$seen) {
+                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
+                    $needed = 2;
+                    $point = $b & 0x1F;
+                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
+                    $needed = 3;
+                    // narrow the range of the second byte to exclude overlong forms and surrogates
+                    if ($b==0xE0) {
+                        $lower = 0xA0;
+                    } elseif ($b==0xED) {
+                        $upper = 0x9F;
+                    }
+                    $point = $b & 0xF;
+                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
+                    $needed = 4;
+                    // narrow the range of the second byte to exclude overlong forms and values above 0x10FFFF
+                    if ($b==0xF0) {
+                        $lower = 0x90;
+                    } elseif ($b==0xF4) {
+                        $upper = 0x8F;
+                    }
+                    $point = $b & 0x7;
+                } else { // invalid byte
+                    // the invalid lead byte is consumed; the next character starts right after it
+                    $next = $pos;
+                    switch ($errMode ?? self::$errMode) {
+                        case self::M_SKIP:
+                            goto start;
+                        case self::M_REPLACE:
+                            return false;
+                        default:
+                            throw new \Exception;
+                    }
+                }
+            } elseif ($b < $lower || $b > $upper) {
+                // the offending byte does not belong to this character and may itself start a
+                // valid one, so step back onto it instead of consuming it; without this, skip
+                // mode would restart past the byte and silently drop it
+                $next = --$pos;
+                switch ($errMode ?? self::$errMode) {
+                    case self::M_SKIP:
+                        goto start;
+                    case self::M_REPLACE:
+                        return false;
+                    default:
+                        throw new \Exception;
+                }
+            } else {
+                // valid continuation byte: restore the default range and accumulate its six payload bits
+                $lower = 0x80;
+                $upper = 0xBF;
+                $point = ($point << 6) | ($b & 0x3F);
+            }
+            $seen++;
+        }
+        $next = $pos;
+        return $point;
+    }
+
    /** Returns the UTF-8 encoding of $codePoint
     *
     * If $codePoint is less than 0 or greater than 1114111, an empty string is returned
--- a/perf/perf.php
+++ b/perf/perf.php
@ -16,13 +16,20 @@ $files = [
 ];

 $tests = [
-    'Native characters' => ["", function(string $text) {
+    'Native characters (obj)' => ["", function(string $text) {
        $c = null;
        $i = new \MensBeam\UTF8\UTF8String($text);
        while ($c !== "") {
            $c = $i->nextChr();
        }
    }],
+    'Native characters (func)' => ["", function(string $text) {
+        $pos = 0;
+        $eof = strlen($text);
+        while ($pos <= $eof) {
+            UTF8::get($text, $pos, $pos);
+        }
+    }],
    'Intl characters' => ["intl", function(string $text) {
        $i = \IntlBreakIterator::createCodePointInstance();
        $i->setText($text);
@ -30,13 +37,20 @@ $tests = [
            \IntlChar::chr($i->getLastCodePoint());
        }
    }],
-    'Native code points' => ["", function(string $text) {
+    'Native code points (obj)' => ["", function(string $text) {
        $p = null;
        $i = new \MensBeam\UTF8\UTF8String($text);
        while ($p !== false) {
            $p = $i->nextOrd();
        }
    }],
+    'Native code points (func)' => ["", function(string $text) {
+        $pos = 0;
+        $eof = strlen($text);
+        while ($pos <= $eof) {
+            UTF8::ord($text, $pos, $pos);
+        }
+    }],
 ];

 if (!file_exists(__DIR__."/docs/")) {
@ -69,6 +83,9 @@ foreach($files as $fName => $file) {
                $test($text);
                $t[$a] = microtime(true) - $s;
            }
+            sort($t);
+            array_pop($t);
+            array_pop($t);
            $t = array_sum($t) / sizeof($t);
            echo number_format($t, 3)."\n";
        }