diff --git a/lib/UTF8.php b/lib/UTF8.php
index 02a84e3..c1765c9 100644
--- a/lib/UTF8.php
+++ b/lib/UTF8.php
@@ -14,44 +14,6 @@ abstract class UTF8 {
     const M_SKIP = 1;
     const M_HALT = 2;
 
-    /** Retrieve a character from $string starting at byte offset $pos
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     *
-     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
-     */
-    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
-        start:
-        // get the byte at the specified position
-        $b = @$string[$pos];
-        if (ord($b) < 0x80) {
-            // if the byte is an ASCII character or end of input, simply return it
-            $next = $pos + 1;
-            return $b;
-        } else {
-            // otherwise determine the numeric code point of the character, as well as the position of the next character
-            $p = self::ord($string, $pos, $next, self::M_REPLACE);
-            if (is_int($p)) {
-                // if the character is valid, return its serialization
-                // we do a round trip (bytes > code point > bytes) to normalize overlong sequences
-                return self::chr($p);
-            } else {
-                $errMode = $errMode ?? self::$errMode;
-                if ($errMode==self::M_REPLACE) {
-                    // if the byte is invalid and we're supposed to replace, return a replacement character
-                    return self::$replacementChar;
-                } elseif ($errMode==self::M_SKIP) {
-                    // if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
-                    $pos = $next;
-                    goto start;
-                } else {
-                    // if the byte is invalid and we're supposed to halt, halt
-                    throw new \Exception;
-                }
-            }
-        }
-    }
-
     /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
      *
      * If $num is negative, the operation will be performed in reverse
@@ -173,87 +135,6 @@ abstract class UTF8 {
         }
     }
 
-    /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
-     *
-     * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
-     *
-     * Upon error, returns false; if $char is the empty string or $pos is beyond the end of the string, null is returned
-     *
-     * $next is a variable in which to store the next byte offset at which a character starts
-     */
-    public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
-        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
-        // though it differs from a slavish implementation because it operates on only a single
-        // character rather than a whole stream
-        start:
-        // optimization for ASCII characters
-        $b = @$string[$pos];
-        if ($b=="") {
-            $next = $pos + 1;
-            return null;
-        } elseif (($b = ord($b)) < 0x80) {
-            $next = $pos + 1;
-            return $b;
-        }
-        $point = 0;
-        $seen = 0;
-        $needed = 1;
-        $lower = 0x80;
-        $upper = 0xBF;
-        while ($seen < $needed) {
-            $b = ord(@$string[$pos++]);
-            if (!$seen) {
-                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
-                    $needed = 2;
-                    $point = $b & 0x1F;
-                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
-                    $needed = 3;
-                    if ($b==0xE0) {
-                        $lower = 0xA0;
-                    } elseif ($b==0xED) {
-                        $upper = 0x9F;
-                    }
-                    $point = $b & 0xF;
-                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
-                    $needed = 4;
-                    if ($b==0xF0) {
-                        $lower = 0x90;
-                    } elseif ($b==0xF4) {
-                        $upper = 0x8F;
-                    }
-                    $point = $b & 0x7;
-                } else { // invalid byte
-                    $next = $pos;
-                    switch ($errMode ?? self::$errMode) {
-                        case self::M_SKIP:
-                            goto start;
-                        case self::M_REPLACE:
-                            return false;
-                        default:
-                            throw new \Exception;
-                    }
-                }
-            } elseif ($b < $lower || $b > $upper) {
-                $next = $pos - 1;
-                switch ($errMode ?? self::$errMode) {
-                    case self::M_SKIP:
-                        goto start;
-                    case self::M_REPLACE:
-                        return false;
-                    default:
-                        throw new \Exception;
-                }
-            } else {
-                $lower = 0x80;
-                $upper = 0xBF;
-                $point = ($point << 6) | ($b & 0x3F);
-            }
-            $seen++;
-        }
-        $next = $pos;
-        return $point;
-    }
-
     /** Returns the UTF-8 encoding of $codePoint
      *
      * If $codePoint is less than 0 or greater than 1114111, an empty string is returned
diff --git a/lib/UTF8String.php b/lib/UTF8String.php
new file mode 100644
index 0000000..27c6b92
--- /dev/null
+++ b/lib/UTF8String.php
@@ -0,0 +1,97 @@
+string = $string;
+    }
+
+    /** Returns the next character of the string as a UTF-8 byte sequence, advancing the character and byte positions
+     *
+     * An invalid sequence yields the encoding of U+FFFD; the empty string is returned once the end of the string has been reached
+     */
+    public function nextChr(): string {
+        // get the byte at the current position
+        $b = @$this->string[$this->posByte];
+        if (ord($b) < 0x80) {
+            // if the byte is an ASCII character or end of input, simply return it
+            $this->posChar++;
+            $this->posByte++;
+            return $b;
+        } else {
+            // otherwise return the serialization of the code point at the current position
+            return UTF8::chr($this->nextOrd() ?? 0xFFFD);
+        }
+    }
+
+    /** Decodes the next character of the string and returns its code point, advancing the character and byte positions
+     *
+     * Returns an integer code point on success, null if the bytes at the current position are not valid UTF-8, or false if the end of the string has been reached
+     */
+    public function nextOrd() {
+        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
+        // though it differs from a slavish implementation because it operates on only a single
+        // character rather than a whole stream
+        $this->posChar++;
+        // optimization for ASCII characters
+        $b = @$this->string[$this->posByte];
+        if ($b=="") {
+            $this->posByte++;
+            return false;
+        } elseif (($b = ord($b)) < 0x80) {
+            $this->posByte++;
+            return $b;
+        }
+        $point = 0;
+        $seen = 0;
+        $needed = 1;
+        $lower = 0x80;
+        $upper = 0xBF;
+        while ($seen < $needed) {
+            $b = ord(@$this->string[$this->posByte++]);
+            if (!$seen) {
+                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
+                    $needed = 2;
+                    $point = $b & 0x1F;
+                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
+                    $needed = 3;
+                    if ($b==0xE0) {
+                        $lower = 0xA0;
+                    } elseif ($b==0xED) {
+                        $upper = 0x9F;
+                    }
+                    $point = $b & 0xF;
+                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
+                    $needed = 4;
+                    if ($b==0xF0) {
+                        $lower = 0x90;
+                    } elseif ($b==0xF4) {
+                        $upper = 0x8F;
+                    }
+                    $point = $b & 0x7;
+                } else { // invalid byte
+                    return null;
+                }
+            } elseif ($b < $lower || $b > $upper) {
+                $this->posByte--;
+                return null;
+            } else {
+                $lower = 0x80;
+                $upper = 0xBF;
+                $point = ($point << 6) | ($b & 0x3F);
+            }
+            $seen++;
+        }
+        return $point;
+    }
+}
diff --git a/perf/perf.php b/perf/perf.php
index e2e4d0d..51dd324 100644
--- a/perf/perf.php
+++ b/perf/perf.php
@@ -17,10 +17,10 @@ $files = [
 
 $tests = [
     'Native characters' => ["", function(string $text) {
-        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::get($text, $pos, $pos);
+        $c = null;
+        $i = new \MensBeam\UTF8\UTF8String($text);
+        while ($c !== "") {
+            $c = $i->nextChr();
         }
     }],
     'Intl characters' => ["intl", function(string $text) {
@@ -31,10 +31,10 @@ $tests = [
         }
     }],
     'Native code points' => ["", function(string $text) {
-        $pos = 0;
-        $eof = strlen($text);
-        while ($pos <= $eof) {
-            UTF8::ord($text, $pos, $pos);
+        $p = null;
+        $i = new \MensBeam\UTF8\UTF8String($text);
+        while ($p !== false) {
+            $p = $i->nextOrd();
         }
     }],
 ];
diff --git a/tests/cases/TestCodec.php b/tests/cases/TestCodec.php
index 8786bf2..a1bd38a 100644
--- a/tests/cases/TestCodec.php
+++ b/tests/cases/TestCodec.php
@@ -6,40 +6,39 @@
 declare(strict_types=1);
 namespace MensBeam\UTF8\TestCase\Codec;
 
-use MensBeam\UTF8\UTF8;
+use MensBeam\UTF8\UTF8String;
 
-/** @covers \MensBeam\UTF8\UTF8 */
 class TestConf extends \PHPUnit\Framework\TestCase {
-
-    /** @group optional */
-    public function testDecodeSingleCharacter() {
-        for ($a = 0; $a <= 0x10FFFF; $a++) {
-            // the UTF-8 encoding of the code point
-            $bytes = \IntlChar::chr($a);
-            // the expected result of decoding the bytes: surrogates are supposed to result in failures on every byte
-            $exp1 = ($a >= 55296 && $a <= 57343) ? array_fill(0, strlen($bytes), false) : [$a];
-            // the expected next-character poisitions: surrogates are supposed to return multiple positions; others always return only the end of the string
-            $exp2 = ($a >= 55296 && $a <= 57343) ? range(1, strlen($bytes)) : [strlen($bytes)];
-            $act1 = [];
-            $act2 = [];
-            $pos = 0;
-            do {
-                $act1[] = UTF8::ord($bytes, $pos, $pos);
-                $act2[] = $pos;
-            } while ($pos < strlen($bytes));
-            $this->assertSame($exp1, $act1, 'Character '.strtoupper(bin2hex(\IntlChar::chr($a))).' was not decoded correctly.');
-            $this->assertSame($exp2, $act2, 'Next offset for character '.strtoupper(bin2hex(\IntlChar::chr($a))).' is incorrect.');
+
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::__construct
+     * @covers \MensBeam\UTF8\UTF8String::nextOrd
+     */
+    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
+        // start from an empty list so that an input yielding no characters still compares as an array
+        $out = [];
+        $s = new UTF8String($input);
+        while (($p = $s->nextOrd()) !== false) {
+            $out[] = $p ?? 0xFFFD;
         }
+        $this->assertEquals($exp, $out);
     }
 
-    /** @dataProvider provideStrings */
-    public function testDecodeMultipleCharacters(string $input, array $exp) {
-        $pos = 0;
-        $out = [];
-        $eof = strlen($input);
-        while ($pos < $eof) {
-            $p = UTF8::ord($input, $pos, $pos);
-            $out[] = is_int($p) ? $p : 0xFFFD;
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::__construct
+     * @covers \MensBeam\UTF8\UTF8String::nextChr
+     */
+    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
+        $exp = array_map(function($v) {
+            return \IntlChar::chr($v);
+        }, $exp);
+        // start from an empty list so that an input yielding no characters still compares as an array
+        $out = [];
+        $s = new UTF8String($input);
+        while (($c = $s->nextChr()) !== "") {
+            $out[] = $c;
         }
         $this->assertEquals($exp, $out);
     }