Browse Source

30% improvement in performance for multibyte characters

labels
J. King 6 years ago
parent
commit
84d103269f
  1. 96
      lib/UTF8.php

96
lib/UTF8.php

@ -193,54 +193,46 @@ abstract class UTF8 {
$next = $pos + 1;
return ord($b);
}
$eof = strlen($string);
$point = null;
$point = 0;
$seen = 0;
$needed = 0;
$needed = 1;
$lower = 0x80;
$upper = 0xBF;
while ($pos < $eof && !($needed && $seen >= $needed)) {
$b = ord($string[$pos++]);
$next = $pos;
$seen++;
if(!$needed) {
$needed = self::l($b);
switch($needed) {
case 1:
$point = $b;
break;
case 2:
$point = $b & 0x1F;
break;
case 3:
if ($b==0xE0) {
$lower = 0xA0;
} elseif ($b==0xED) {
$upper = 0x9F;
}
$point = $b & 0xF;
break;
case 4:
if ($b==0xF0) {
$lower = 0x90;
} elseif ($b==0xF4) {
$upper = 0x8F;
}
$point = $b & 0x7;
break;
case 0:
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
case self::M_REPLACE:
return false;
default:
throw new \Exception;
}
break;
while ($seen < $needed) {
$b = ord(@$string[$pos++]);
if(!$seen) {
if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
$needed = 2;
$point = $b & 0x1F;
} elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
$needed = 3;
if ($b==0xE0) {
$lower = 0xA0;
} elseif ($b==0xED) {
$upper = 0x9F;
}
$point = $b & 0xF;
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
$needed = 4;
if ($b==0xF0) {
$lower = 0x90;
} elseif ($b==0xF4) {
$upper = 0x8F;
}
$point = $b & 0x7;
} else { // invalid byte
$next = $pos;
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
case self::M_REPLACE:
return false;
default:
throw new \Exception;
}
}
} elseif ($b < $lower || $b > $upper) {
$next--;
$next = $pos - 1;
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
@ -254,19 +246,10 @@ abstract class UTF8 {
$upper = 0xBF;
$point = ($point << 6) | ($b & 0x3F);
}
$seen++;
}
if ($seen < $needed) {
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
case self::M_REPLACE:
return false;
default:
throw new \Exception;
}
} else {
return $point;
}
$next = $pos;
return $point;
}
/** Returns the UTF-8 encoding of $codePoint
@ -291,8 +274,7 @@ abstract class UTF8 {
}
$bytes = chr(($codePoint >> (6 * $count)) + $offset);
while ($count > 0) {
$temp = $codePoint >> (6 * ($count - 1));
$bytes .= chr(0x80 | ($temp & 0x3F));
$bytes .= chr(0x80 | (($codePoint >> (6 * ($count - 1))) & 0x3F));
$count--;
}
return $bytes;

Loading…
Cancel
Save