Browse Source

More performance improvements, and a regression fix

labels
J. King 6 years ago
parent
commit
3aaaae0c74
  1. 9
      lib/UTF8.php
  2. 26
      perf/perf.php

9
lib/UTF8.php

@@ -24,7 +24,7 @@ abstract class UTF8 {
start:
// get the byte at the specified position
$b = @$string[$pos];
if ($b < "\x80" || $b=="") {
if ($b < "\x80") {
// if the byte is an ASCII character or end of input, simply return it
$next = $pos + 1;
return $b;
@@ -97,7 +97,7 @@ abstract class UTF8 {
do {
$pos--;
$t++;
$b = ($pos < strlen($string)) ? $string[$pos] : "";
$b = @$string[$pos];
} while (
$b >= "\x80" && $b <= "\xBF" && // continuation bytes
($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
@@ -186,7 +186,10 @@ abstract class UTF8 {
start:
// optimization for ASCII characters
$b = @$string[$pos];
if ($b < "\x80") {
if ($b=="") {
$next = $pos + 1;
return null;
} elseif ($b < "\x80") {
$next = $pos + 1;
return ord($b);
}

26
perf/perf.php

@@ -14,22 +14,12 @@ $files = [
];
$tests = [
'Intl code points' => ["intl", function(string $text): int {
$t = 0;
$i = \IntlBreakIterator::createCodePointInstance();
$i->setText($text);
foreach ($i as $o) {
$p = $i->getLastCodePoint();
$t++;
}
return $t;
}],
'Native code points' => ["", function(string $text): int {
'Native characters' => ["", function(string $text): int {
$t = 0;
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
$p = UTF8::ord($text, $pos, $pos);
$p = UTF8::get($text, $pos, $pos);
$t++;
}
return $t;
@@ -44,20 +34,12 @@ $tests = [
}
return $t;
}],
'Native characters' => ["", function(string $text): int {
'Native code points' => ["", function(string $text): int {
$t = 0;
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
$p = UTF8::get($text, $pos, $pos);
$t++;
}
return $t;
}],
'PCRE split characters' => ["pcre", function(string $text): int {
$t = 0;
foreach (preg_split('//u', $text) as $c) {
$p = $c;
$p = UTF8::ord($text, $pos, $pos);
$t++;
}
return $t;

Loading…
Cancel
Save