More performance improvements, and a regression fix

6 years ago · 3aaaae0c74
2 changed files with 10 additions and 25 deletions
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@@ -24,7 +24,7 @@ abstract class UTF8 {
        start:
        // get the byte at the specified position
        $b = @$string[$pos];
-        if ($b < "\x80" || $b=="") {
+        if ($b < "\x80") {
            // if the byte is an ASCII character or end of input, simply return it
            $next = $pos + 1;
            return $b;
@@ -97,7 +97,7 @@ abstract class UTF8 {
        do {
            $pos--;
            $t++;
-            $b = ($pos < strlen($string)) ? $string[$pos] : "";
+            $b = @$string[$pos];
        } while (
            $b >= "\x80" && $b <= "\xBF" && // continuation bytes
            ($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
@@ -186,7 +186,10 @@ abstract class UTF8 {
        start:
        // optimization for ASCII characters
        $b = @$string[$pos];
-        if ($b < "\x80") {
+        if ($b=="") {
+            $next = $pos + 1;
+            return null;
+        } elseif ($b < "\x80") {
            $next = $pos + 1;
            return ord($b);
        }
--- a/perf/perf.php
+++ b/perf/perf.php
@@ -14,22 +14,12 @@ $files = [
 ];

 $tests = [
-    'Intl code points' => ["intl", function(string $text): int {
-        $t = 0;
-        $i = \IntlBreakIterator::createCodePointInstance();
-        $i->setText($text);
-        foreach ($i as $o) {
-            $p = $i->getLastCodePoint();
-            $t++;
-        }
-        return $t;
-    }],
-    'Native code points' => ["", function(string $text): int {
+    'Native characters' => ["", function(string $text): int {
        $t = 0;
        $pos = 0;
        $eof = strlen($text);
        while ($pos <= $eof) {
-            $p = UTF8::ord($text, $pos, $pos);
+            $p = UTF8::get($text, $pos, $pos);
            $t++;
        }
        return $t;
@@ -44,20 +34,12 @@ $tests = [
        }
        return $t;
    }],
-    'Native characters' => ["", function(string $text): int {
+    'Native code points' => ["", function(string $text): int {
        $t = 0;
        $pos = 0;
        $eof = strlen($text);
        while ($pos <= $eof) {
-            $p = UTF8::get($text, $pos, $pos);
-            $t++;
-        }
-        return $t;
-    }],
-    'PCRE split characters' => ["pcre", function(string $text): int {
-        $t = 0;
-        foreach (preg_split('//u',  $text) as $c) {
-            $p = $c;
+            $p = UTF8::ord($text, $pos, $pos);
            $t++;
        }
        return $t;