Browse Source

Start on alternate object-based interface

This is both simpler and slightly faster, yielding a performance improvement of between 2% and 5%
labels
J. King 6 years ago
parent
commit
1ed3c36a65
  1. 119
      lib/UTF8.php
  2. 89
      lib/UTF8String.php
  3. 16
      perf/perf.php
  4. 53
      tests/cases/TestCodec.php

119
lib/UTF8.php

@ -14,44 +14,6 @@ abstract class UTF8 {
const M_SKIP = 1;
const M_HALT = 2;
/** Reads a single character out of $string, beginning at byte offset $pos
 *
 * $next receives the byte offset at which the following character begins
 *
 * ASCII bytes are returned as-is, and the empty string is returned when $pos lies beyond the end of $string
 *
 * For an invalid sequence the outcome depends on the error mode ($errMode if supplied, otherwise self::$errMode):
 * the replacement character is returned (M_REPLACE), the sequence is passed over and reading
 * continues with what follows (M_SKIP), or an exception is thrown (any other mode)
 */
public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
    while (true) {
        // inspect the byte found at the current offset
        $byte = @$string[$pos];
        if (ord($byte) < 0x80) {
            // ASCII and end-of-input need no decoding at all
            $next = $pos + 1;
            return $byte;
        }
        // anything else must be decoded to a code point; this also yields the offset of the next character
        $codePoint = self::ord($string, $pos, $next, self::M_REPLACE);
        if (is_int($codePoint)) {
            // valid: re-serialize the code point (bytes > code point > bytes) so overlong forms are normalized
            return self::chr($codePoint);
        }
        // invalid: decide what to do based on the effective error mode
        $errMode = $errMode ?? self::$errMode;
        if ($errMode==self::M_REPLACE) {
            // stand in a replacement character for the bad sequence
            return self::$replacementChar;
        }
        if ($errMode!=self::M_SKIP) {
            // neither replacing nor skipping, so halt
            throw new \Exception;
        }
        // skipping: move past the bad sequence and try again
        $pos = $next;
    }
}
/** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
*
* If $num is negative, the operation will be performed in reverse
@ -173,87 +135,6 @@ abstract class UTF8 {
}
}
/** Decodes the UTF-8 character starting at byte offset $pos of $string into a numeric code point
 *
 * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
 *
 * If $string is the empty string or $pos is beyond the end of the string, null is returned
 *
 * Upon encountering an invalid byte sequence the result depends on the error mode
 * ($errMode if supplied, otherwise self::$errMode):
 *  - self::M_REPLACE: false is returned
 *  - self::M_SKIP: the invalid sequence is passed over and decoding resumes with the bytes which follow it
 *  - any other mode: an \Exception is thrown
 *
 * $next is a variable in which to store the next byte offset at which a character starts
 */
public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
    // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
    // though it differs from a slavish implementation because it operates on only a single
    // character rather than a whole stream
    start:
    // optimization for ASCII characters
    // (null coalescing yields the empty string for an out-of-range offset without raising an error)
    $b = $string[$pos] ?? "";
    if ($b === "") {
        $next = $pos + 1;
        return null;
    } elseif (($b = ord($b)) < 0x80) {
        $next = $pos + 1;
        return $b;
    }
    $point = 0;
    $seen = 0;
    $needed = 1;
    // the permitted range of the next continuation byte; lead bytes E0, ED, F0, and F4 narrow
    // it for the first continuation byte only
    $lower = 0x80;
    $upper = 0xBF;
    while ($seen < $needed) {
        // reading past the end of input yields byte value 0, which never passes the continuation range check
        $b = ord($string[$pos++] ?? "");
        if (!$seen) {
            if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
                $needed = 2;
                $point = $b & 0x1F;
            } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
                $needed = 3;
                if ($b==0xE0) {
                    $lower = 0xA0;
                } elseif ($b==0xED) {
                    $upper = 0x9F;
                }
                $point = $b & 0xF;
            } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
                $needed = 4;
                if ($b==0xF0) {
                    $lower = 0x90;
                } elseif ($b==0xF4) {
                    $upper = 0x8F;
                }
                $point = $b & 0x7;
            } else { // invalid byte
                // the invalid lead byte is consumed; $pos already points past it
                $next = $pos;
                switch ($errMode ?? self::$errMode) {
                    case self::M_SKIP:
                        goto start;
                    case self::M_REPLACE:
                        return false;
                    default:
                        throw new \Exception;
                }
            }
        } elseif ($b < $lower || $b > $upper) {
            // the offending byte is not part of this character: step back so that it is
            // examined again as the start of the next character, both by the caller (via $next)
            // and by this function itself when skipping
            $pos--;
            $next = $pos;
            switch ($errMode ?? self::$errMode) {
                case self::M_SKIP:
                    goto start;
                case self::M_REPLACE:
                    return false;
                default:
                    throw new \Exception;
            }
        } else {
            // a valid continuation byte: restore the default range and accumulate its six payload bits
            $lower = 0x80;
            $upper = 0xBF;
            $point = ($point << 6) | ($b & 0x3F);
        }
        $seen++;
    }
    $next = $pos;
    return $point;
}
/** Returns the UTF-8 encoding of $codePoint
*
* If $codePoint is less than 0 or greater than 1114111, an empty string is returned

89
lib/UTF8String.php

@ -0,0 +1,89 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\UTF8;
/** Forward-only reader which decodes a UTF-8 byte string one character at a time
 *
 * The reader keeps track of both the byte offset and the character offset of its
 * current position; neither advances once the end of the string has been reached
 */
class UTF8String {
    /** @var string The byte string being decoded */
    protected $string;
    /** @var int The byte offset at which the next character starts */
    protected $posByte = 0;
    /** @var int The number of characters (valid or invalid) consumed so far */
    protected $posChar = 0;

    /** Creates a reader positioned at the start of $string */
    public function __construct(string $string) {
        $this->string = $string;
    }

    /** Consumes the next character and returns it as a UTF-8 string
     *
     * An invalid byte sequence yields the serialization of U+FFFD instead
     *
     * The empty string is returned once the end of the string has been reached
     */
    public function nextChr(): string {
        // get the byte at the current position
        // (null coalescing yields the empty string for an out-of-range offset without raising an error)
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input: report it without moving the position
            return "";
        } elseif (ord($b) < 0x80) {
            // if the byte is an ASCII character, simply return it
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::chr($this->nextOrd() ?? 0xFFFD);
        }
    }

    /** Consumes the next character and returns its numeric code point
     *
     * Returns an integer code point upon success, null if the bytes at the current
     * position do not form a valid character, or false once the end of the string has been reached
     *
     * @return int|null|false
     */
    public function nextOrd() {
        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
        // though it differs from a slavish implementation because it operates on only a single
        // character rather than a whole stream
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input: report it without moving the position
            return false;
        }
        // from here on exactly one character, valid or not, is consumed
        $this->posChar++;
        // optimization for ASCII characters
        if (($b = ord($b)) < 0x80) {
            $this->posByte++;
            return $b;
        }
        $point = 0;
        $seen = 0;
        $needed = 1;
        // the permitted range of the next continuation byte; lead bytes E0, ED, F0, and F4 narrow
        // it for the first continuation byte only
        $lower = 0x80;
        $upper = 0xBF;
        while ($seen < $needed) {
            // reading past the end of input yields byte value 0, which never passes the continuation range check
            $b = ord($this->string[$this->posByte++] ?? "");
            if (!$seen) {
                if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
                    $needed = 2;
                    $point = $b & 0x1F;
                } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
                    $needed = 3;
                    if ($b==0xE0) {
                        $lower = 0xA0;
                    } elseif ($b==0xED) {
                        $upper = 0x9F;
                    }
                    $point = $b & 0xF;
                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
                    $needed = 4;
                    if ($b==0xF0) {
                        $lower = 0x90;
                    } elseif ($b==0xF4) {
                        $upper = 0x8F;
                    }
                    $point = $b & 0x7;
                } else { // invalid byte
                    // the invalid lead byte has been consumed
                    return null;
                }
            } elseif ($b < $lower || $b > $upper) {
                // the offending byte is not part of this character: un-read it so that the
                // next call examines it as the start of a new character
                $this->posByte--;
                return null;
            } else {
                // a valid continuation byte: restore the default range and accumulate its six payload bits
                $lower = 0x80;
                $upper = 0xBF;
                $point = ($point << 6) | ($b & 0x3F);
            }
            $seen++;
        }
        return $point;
    }
}

16
perf/perf.php

@ -17,10 +17,10 @@ $files = [
$tests = [
'Native characters' => ["", function(string $text) {
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
UTF8::get($text, $pos, $pos);
$c = null;
$i = new \MensBeam\UTF8\UTF8String($text);
while ($c !== "") {
$c = $i->nextChr();
}
}],
'Intl characters' => ["intl", function(string $text) {
@ -31,10 +31,10 @@ $tests = [
}
}],
'Native code points' => ["", function(string $text) {
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
UTF8::ord($text, $pos, $pos);
$p = null;
$i = new \MensBeam\UTF8\UTF8String($text);
while ($p !== false) {
$p = $i->nextOrd();
}
}],
];

53
tests/cases/TestCodec.php

@ -6,40 +6,35 @@
declare(strict_types=1);
namespace MensBeam\UTF8\TestCase\Codec;
use MensBeam\UTF8\UTF8;
use MensBeam\UTF8\UTF8String;
/** @covers \MensBeam\UTF8\UTF8 */
class TestConf extends \PHPUnit\Framework\TestCase {
/** @group optional */
public function testDecodeSingleCharacter() {
for ($a = 0; $a <= 0x10FFFF; $a++) {
// the UTF-8 encoding of the code point
$bytes = \IntlChar::chr($a);
// the expected result of decoding the bytes: surrogates are supposed to result in failures on every byte
$exp1 = ($a >= 55296 && $a <= 57343) ? array_fill(0, strlen($bytes), false) : [$a];
// the expected next-character positions: surrogates are supposed to return multiple positions; others always return only the end of the string
$exp2 = ($a >= 55296 && $a <= 57343) ? range(1, strlen($bytes)) : [strlen($bytes)];
$act1 = [];
$act2 = [];
$pos = 0;
do {
$act1[] = UTF8::ord($bytes, $pos, $pos);
$act2[] = $pos;
} while ($pos < strlen($bytes));
$this->assertSame($exp1, $act1, 'Character '.strtoupper(bin2hex(\IntlChar::chr($a))).' was not decoded correctly.');
$this->assertSame($exp2, $act2, 'Next offset for character '.strtoupper(bin2hex(\IntlChar::chr($a))).' is incorrect.');
/**
* @dataProvider provideStrings
* @covers \MensBeam\UTF8\UTF8String::__construct
* @covers \MensBeam\UTF8\UTF8String::nextOrd
*/
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
$s = new UTF8String($input);
while (($p = $s->nextOrd()) !== false) {
$out[] = $p ?? 0xFFFD;
}
$this->assertEquals($exp, $out);
}
/** @dataProvider provideStrings */
public function testDecodeMultipleCharacters(string $input, array $exp) {
$pos = 0;
$out = [];
$eof = strlen($input);
while ($pos < $eof) {
$p = UTF8::ord($input, $pos, $pos);
$out[] = is_int($p) ? $p : 0xFFFD;
/**
* @dataProvider provideStrings
* @covers \MensBeam\UTF8\UTF8String::__construct
* @covers \MensBeam\UTF8\UTF8String::nextChr
*/
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
$exp = array_map(function($v) {
return \IntlChar::chr($v);
}, $exp);
$s = new UTF8String($input);
while (($c = $s->nextChr()) !== "") {
$out[] = $c;
}
$this->assertEquals($exp, $out);
}

Loading…
Cancel
Save