diff --git a/lib/UTF8.php b/lib/UTF8.php deleted file mode 100644 index e3ee5e3..0000000 --- a/lib/UTF8.php +++ /dev/null @@ -1,210 +0,0 @@ - 0) { - $pos = $pos ?? 0; - do { - $c = self::get($string, $pos, $pos); // the current position is getting overwritten with the next position, by reference - } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF - return $pos; - } elseif ($num < 0) { - $pos = $pos ?? strlen($string); - if (!$pos) { - // if we're already at the start of the string, we can't go further back - return $pos; - } - $num = abs($num); - do { - $pos = self::sync($string, $pos -1); - $num--; - } while ($num && $pos); - return $pos; - } else { - // seeking zero characters is equivalent to a sync - return self::sync($string, $pos); - } - } - - /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */ - public static function sync(string $string, int $pos): int { - $b = ord(@$string[$pos]); - if ($b < 0x80) { - // if the byte is an ASCII byte or the end of input, then this is already a synchronized position - return min(max($pos,0), strlen($string)); - } else { - $s = $pos; - while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte - $b = ord(@$string[--$pos]); - } - if (is_null(self::ord($string, $pos, $next))) { - return $s; - } else { - return ($next > $s) ? $pos : $s; - } - } - } - - public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int { - $errMode = $errMode ?? self::$errMode; - $end = $end ?? strlen($string); - if (substr($string, $start, ($end - $start)) =="") { - return 0; - } - $count = 0; - $pos = $start; - do { - $c = self::get($string, $pos, $pos, $errMode); - } while ($c != "" && ++$count && $pos < $end); - return $count; - } - - public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string { - $errMode = $errMode ?? self::$errMode; - if ($length > 0) { - $pos = $start; - $buffer = ""; - do { - $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference - $buffer .= $c; - } while (--$length && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF - $next = $pos; - return $buffer; - } else { - $next = self::sync($string, $start, $errMode); - return ""; - } - } - - /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos - * - * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111 - * - * Upon error, returns null; if $char is the empty string or $pos is beyond the end of the string, false is returned - * - * $next is a variable in which to store the next byte offset at which a character starts - */ - public static function ord(string $string, int $pos = 0, &$next = null) { - // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder - // though it differs from a slavish implementation because it operates on only a single - // character rather than a whole stream - // optimization for ASCII characters - $b = @$string[$pos]; - if ($b=="") { - $next = $pos; - return false; - } elseif (($b = ord($b)) < 0x80) { - $next = $pos + 1; - return $b; - } - $point = 0; - $seen = 0; - $needed = 1; - $lower = 0x80; - $upper = 0xBF; - while ($seen < $needed) { - $b = ord(@$string[$pos++]); - if (!$seen) { - if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character - $needed = 2; - $point = $b & 0x1F; - } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character - $needed = 3; - if ($b==0xE0) { - $lower = 0xA0; - } elseif ($b==0xED) { - $upper = 0x9F; - } - $point = $b & 0xF; - } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character - $needed = 4; - if ($b==0xF0) { - $lower = 0x90; - } elseif ($b==0xF4) { - $upper = 0x8F; - } - $point = $b & 0x7; - } else { // invalid byte - $next = $pos; - return null; - } - } elseif ($b < $lower || $b > $upper) { - $next = $pos - 1; - return null; - } else { - $lower = 0x80; - $upper = 0xBF; - $point = ($point << 6) | ($b & 0x3F); - } - $seen++; - } - $next = $pos; - return $point; - } - - /** Returns the UTF-8 encoding of $codePoint - * - * If $codePoint is less than 0 or greater than 1114111, an empty string is returned - */ - public static function chr(int $codePoint): string { - // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder - if ($codePoint < 0 || $codePoint > 0x10FFFF) { - return ""; - } elseif ($codePoint < 128) { - return chr($codePoint); - } elseif ($codePoint < 0x800) { - $count = 1; - $offset = 0xC0; - } elseif ($codePoint < 0x10000) { - $count = 2; - $offset = 0xE0; - } else { - $count = 3; - $offset = 0xF0; - } - $bytes = chr(($codePoint >> (6 * $count)) + $offset); - while ($count > 0) { - $bytes .= chr(0x80 | (($codePoint >> (6 * ($count - 1))) & 0x3F)); - $count--; - } - return $bytes; - } -} diff --git a/perf/perf.php b/perf/perf.php index 978a2e0..fe29fec 100644 --- a/perf/perf.php +++ b/perf/perf.php @@ -28,18 +28,13 @@ $tests = [ $b = $c; } }], - 'Native characters (obj)' => ["", function(string $text) { + 'Native characters' => ["", function(string $text) { $c = null; $i = new \MensBeam\UTF8\UTF8String($text); while ($c !== "") { $c = $i->nextChr(); } }], - 'Native characters (func)' => ["", function(string $text) { - $pos = 0; - while (($p = UTF8::get($text, $pos, $pos)) !== "") { - } - }], 'Intl code points' => ["intl", function(string $text) { $i = (function($text) { $i = \IntlBreakIterator::createCodePointInstance(); @@ -52,18 +47,13 @@ $tests = [ $b = $c; } }], - 'Native code points (obj)' => ["", function(string $text) { + 'Native code points' => ["", function(string $text) { $p = null; $i = new \MensBeam\UTF8\UTF8String($text); while ($p !== false) { $p = $i->nextOrd(); } }], - 'Native code points (func)' => ["", function(string $text) { - $pos = 0; - while (($p = UTF8::ord($text, $pos, $pos)) !== false) { - } - }], ]; if (!file_exists(__DIR__."/docs/")) { diff --git a/tests/cases/TestFunctions.php b/tests/cases/TestFunctions.php deleted file mode 100644 index 4b96c52..0000000 --- a/tests/cases/TestFunctions.php +++ /dev/null @@ -1,156 +0,0 @@ -assertEquals($exp, $out); - } - - /** - * @dataProvider provideStrings - * @covers \MensBeam\UTF8\UTF8::get - */ - public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { - $exp = array_map(function ($v) { - return \IntlChar::chr($v); - }, $exp); - $off = 0; - while (($p = UTF8::get($input, $off, $off)) !== "") { - $out[] = $p ?? 0xFFFD; - } - $this->assertEquals($exp, $out); - } - - /** - * @covers \MensBeam\UTF8\UTF8::get - * @covers \MensBeam\UTF8\UTF8::ord - */ - public function testTraversePastTheEndOfAString() { - $input = "\u{10FFFD}"; - - $off = 0; - $this->assertSame(0, $off); - $this->assertSame("\u{10FFFD}", UTF8::get($input, $off, $off)); - $this->assertSame(4, $off); - $this->assertSame("", UTF8::get($input, $off, $off)); - $this->assertSame(4, $off); - $off = 0; - $this->assertSame(0, $off); - $this->assertSame(0x10FFFD, UTF8::ord($input, $off, $off)); - $this->assertSame(4, $off); - $this->assertSame(false, UTF8::ord($input, $off, $off)); - $this->assertSame(4, $off); - } - - /** - * @dataProvider provideStrings - * @covers \MensBeam\UTF8\UTF8::sync - */ - public function testSTepBackThroughAString(string $input, array $points) { - $off = strlen($input); - $p = []; - while ($off > 0) { - $off = UTF8::sync($input, $off - 1); - $p[] = UTF8::ord($input, $off) ?? 0xFFFD; - } - $p = array_reverse($p); - $this->assertSame($points, $p); - } - - /** - * @covers \MensBeam\UTF8\UTF8::seek - */ - public function testSeekThroughAString() { - /* - Char 0 U+007A (1 byte) Offset 0 - Char 1 U+00A2 (2 bytes) Offset 1 - Char 2 U+6C34 (3 bytes) Offset 3 - Char 3 U+1D11E (4 bytes) Offset 6 - Char 4 U+F8FF (3 bytes) Offset 10 - Char 5 U+10FFFD (4 bytes) Offset 13 - Char 6 U+FFFE (3 bytes) Offset 17 - End of string at char 7, offset 20 - */ - $input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE"; - $off = 0; - $off = UTF8::seek($input, 0, $off); - $this->assertSame(0, $off); - $off = UTF8::seek($input, -1, $off); - $this->assertSame(0, $off); - $off = UTF8::seek($input, 1, $off); - $this->assertSame(1, $off); - $off = UTF8::seek($input, 2, $off); - $this->assertSame(6, $off); - $off = UTF8::seek($input, 4, $off); - $this->assertSame(20, $off); - $off = UTF8::seek($input, 1, $off); - $this->assertSame(20, $off); - $off = UTF8::seek($input, -3, $off); - $this->assertSame(10, $off); - $off = UTF8::seek($input, -10, $off); - $this->assertSame(0, $off); - } - - public function provideStrings() { - return [ - // control samples - 'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]], - 'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]], - 'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]], - // various invalid sequences - 'invalid code' => ["\xFF", [65533]], - 'ends early' => ["\xC0", [65533]], - 'ends early 2' => ["\xE0", [65533]], - 'invalid trail' => ["\xC0\x00", [65533, 0]], - 'invalid trail 2' => ["\xC0\xC0", [65533, 65533]], - 'invalid trail 3' => ["\xE0\x00", [65533, 0]], - 'invalid trail 4' => ["\xE0\xC0", [65533, 65533]], - 'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]], - 'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]], - '> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], - 'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], - 'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]], - 'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]], - 'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]], - 'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]], - 'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], - 'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]], - 'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]], - 'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]], - 'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]], - 'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], - 'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]], - 'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]], - 'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]], - 'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], - 'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]], - 'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]], - 'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], - 'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]], - 'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], - // UTF-16 surrogates - 'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]], - 'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]], - 'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]], - // self-sync edge cases - 'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]], - 'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]], - ]; - } -} diff --git a/tests/phpunit.xml b/tests/phpunit.xml index 374bc20..85ebbfe 100644 --- a/tests/phpunit.xml +++ b/tests/phpunit.xml @@ -17,11 +17,8 @@ - + cases/TestInstance.php - - cases/TestFunctions.php -