diff --git a/lib/UTF8.php b/lib/UTF8.php
index c1765c9..02a84e3 100644
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@@ -14,6 +14,44 @@ abstract class UTF8 {
     const M_SKIP = 1;
     const M_HALT = 2;
 
+    /** Retrieve a character from $string starting at byte offset $pos
+     *
+     * $next is a variable in which to store the next byte offset at which a character starts
+     *
+     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
+     */
+    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
+        start:
+        // get the byte at the specified position
+        $b = @$string[$pos];
+        if (ord($b) < 0x80) {
+            // if the byte is an ASCII character or end of input, simply return it
+            $next = $pos + 1;
+            return $b;
+        } else {
+            // otherwise determine the numeric code point of the character, as well as the position of the next character
+            $p = self::ord($string, $pos, $next, self::M_REPLACE);
+            if (is_int($p)) {
+                // if the character is valid, return its serialization
+                // we do a round trip (bytes > code point > bytes) to normalize overlong sequences
+                return self::chr($p);
+            } else {
+                $errMode = $errMode ?? self::$errMode;
+                if ($errMode==self::M_REPLACE) {
+                    // if the byte is invalid and we're supposed to replace, return a replacement character
+                    return self::$replacementChar;
+                } elseif ($errMode==self::M_SKIP) {
+                    // if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
+                    $pos = $next;
+                    goto start;
+                } else {
+                    // if the byte is invalid and we're supposed to halt, halt
+                    throw new \Exception;
+                }
+            }
+        }
+    }
+
     /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
      *
      * If $num is negative, the operation will be performed in reverse
@@ -135,6 +173,95 @@ abstract class UTF8 {
         }
     }
 
+    /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
+     *
+     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
+     *
+     * Upon error, the result depends on the error mode: false is returned when replacing, the invalid sequence is passed over and decoding restarts when skipping, and an exception is thrown otherwise
+     *
+     * If $string is the empty string or $pos is beyond the end of the string, null is returned
+     *
+     * $next is a variable in which to store the next byte offset at which a character starts
+     *
+     * $errMode is the error mode to use for this call; when null the class-wide error mode is used
+     */
+    public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
+        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
+        // though it differs from a slavish implementation because it operates on only a single
+        // character rather than a whole stream
+        start:
+        // optimization for ASCII characters
+        $b = @$string[$pos];
+        if ($b=="") {
+            $next = $pos + 1;
+            return null;
+        } elseif (($b = ord($b)) < 0x80) {
+            $next = $pos + 1;
+            return $b;
+        }
+        $point = 0;
+        $seen = 0;
+        $needed = 1;
+        $lower = 0x80;
+        $upper = 0xBF;
+        while ($seen < $needed) {
+            $b = ord(@$string[$pos++]);
+            if (!$seen) {
+                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
+                    $needed = 2;
+                    $point = $b & 0x1F;
+                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
+                    $needed = 3;
+                    if ($b==0xE0) {
+                        $lower = 0xA0;
+                    } elseif ($b==0xED) {
+                        $upper = 0x9F;
+                    }
+                    $point = $b & 0xF;
+                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
+                    $needed = 4;
+                    if ($b==0xF0) {
+                        $lower = 0x90;
+                    } elseif ($b==0xF4) {
+                        $upper = 0x8F;
+                    }
+                    $point = $b & 0x7;
+                } else { // invalid byte
+                    // $pos has already been advanced past the invalid byte, so skipping resumes right after it
+                    $next = $pos;
+                    switch ($errMode ?? self::$errMode) {
+                        case self::M_SKIP:
+                            goto start;
+                        case self::M_REPLACE:
+                            return false;
+                        default:
+                            throw new \Exception;
+                    }
+                }
+            } elseif ($b < $lower || $b > $upper) {
+                // the offending byte is not consumed: it may itself begin a character, so decoding resumes at it
+                $next = $pos - 1;
+                switch ($errMode ?? self::$errMode) {
+                    case self::M_SKIP:
+                        // rewind to the offending byte; restarting from the advanced $pos would drop it
+                        $pos = $next;
+                        goto start;
+                    case self::M_REPLACE:
+                        return false;
+                    default:
+                        throw new \Exception;
+                }
+            } else {
+                $lower = 0x80;
+                $upper = 0xBF;
+                $point = ($point << 6) | ($b & 0x3F);
+            }
+            $seen++;
+        }
+        $next = $pos;
+        return $point;
+    }
+
     /** Returns the UTF-8 encoding of $codePoint
      *
      * If $codePoint is less than 0 or greater than 1114111, an empty string is returned
diff --git a/perf/perf.php b/perf/perf.php
index 51dd324..794460b 100644
--- a/perf/perf.php
+++ b/perf/perf.php
@@ -16,13 +16,20 @@ $files = [
 ];
 
 $tests = [
-    'Native characters' => ["", function(string $text) {
+    'Native characters (obj)' => ["", function(string $text) {
         $c = null;
         $i = new \MensBeam\UTF8\UTF8String($text);
         while ($c !== "") {
             $c = $i->nextChr();
         }
     }],
+    'Native characters (func)' => ["", function(string $text) {
+        $pos = 0;
+        $eof = strlen($text);
+        while ($pos <= $eof) {
+            UTF8::get($text, $pos, $pos);
+        }
+    }],
     'Intl characters' => ["intl", function(string $text) {
         $i = \IntlBreakIterator::createCodePointInstance();
         $i->setText($text);
@@ -30,13 +37,20 @@ $tests = [
             \IntlChar::chr($i->getLastCodePoint());
         }
     }],
-    'Native code points' => ["", function(string $text) {
+    'Native code points (obj)' => ["", function(string $text) {
         $p = null;
         $i = new \MensBeam\UTF8\UTF8String($text);
         while ($p !== false) {
             $p = $i->nextOrd();
         }
     }],
+    'Native code points (func)' => ["", function(string $text) {
+        $pos = 0;
+        $eof = strlen($text);
+        while ($pos <= $eof) {
+            UTF8::ord($text, $pos, $pos);
+        }
+    }],
 ];
 
 if (!file_exists(__DIR__."/docs/")) {
@@ -69,6 +83,10 @@ foreach($files as $fName => $file) {
             $test($text);
             $t[$a] = microtime(true) - $s;
         }
+        // discard the two slowest runs before averaging
+        sort($t);
+        array_pop($t);
+        array_pop($t);
         $t = array_sum($t) / sizeof($t);
         echo number_format($t, 3)."\n";
     }