Browse Source

Clean up static-method interface and test it

labels
J. King 6 years ago
parent
commit
ca91a86744
  1. 134
      lib/UTF8.php
  2. 19
      perf/perf.php
  3. 156
      tests/cases/TestFunctions.php
  4. 3
      tests/phpunit.xml

134
lib/UTF8.php

@ -7,12 +7,6 @@ declare(strict_types=1);
namespace MensBeam\UTF8;
abstract class UTF8 {
public static $replacementChar = "\u{FFFD}";
public static $errMode = self::M_REPLACE;
const M_REPLACE = 0;
const M_SKIP = 1;
const M_HALT = 2;
/** Retrieve a character from $string starting at byte offset $pos
*
@ -20,35 +14,22 @@ abstract class UTF8 {
*
* The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
*/
public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
public static function get(string $string, int $pos, &$next = null): string {
start:
// get the byte at the specified position
$b = @$string[$pos];
if (ord($b) < 0x80) {
// if the byte is an ASCII character or end of input, simply return it
$next = $pos + 1;
if ($b !== "") {
$next = $pos + 1;
} else {
$next = $pos;
}
return $b;
} else {
// otherwise determine the numeric code point of the character, as well as the position of the next character
$p = self::ord($string, $pos, $next, self::M_REPLACE);
if (is_int($p)) {
// if the character is valid, return its serialization
// we do a round trip (bytes > code point > bytes) to normalize overlong sequences
return self::chr($p);
} else {
$errMode = $errMode ?? self::$errMode;
if ($errMode==self::M_REPLACE) {
// if the byte is invalid and we're supposed to replace, return a replacement character
return self::$replacementChar;
} elseif ($errMode==self::M_SKIP) {
// if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
$pos = $next;
goto start;
} else {
// if the byte is invalid and we're supposed to halt, halt
throw new \Exception;
}
}
$p = self::ord($string, $pos, $next);
return is_int($p) ? self::chr($p) : "\u{FFFD}";
}
}
@ -58,12 +39,11 @@ abstract class UTF8 {
*
* If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek
*/
public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int {
$errMode = $errMode ?? self::$errMode;
public static function seek(string $string, int $num, int $pos = null): int {
if ($num > 0) {
$pos = $pos ?? 0;
do {
$c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
$c = self::get($string, $pos, $pos); // the current position is getting overwritten with the next position, by reference
} while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF
return $pos;
} elseif ($num < 0) {
@ -74,71 +54,32 @@ abstract class UTF8 {
}
$num = abs($num);
do {
$pos = self::sync($string, $pos -1, $errMode);
$pos = self::sync($string, $pos -1);
$num--;
} while ($num && $pos);
return $pos;
} else {
// seeking zero characters is equivalent to a sync
return self::sync($string, $pos, $errMode);
return self::sync($string, $pos);
}
}
/** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
public static function sync(string $string, int $pos, int $errMode = null): int {
$errMode = $errMode ?? self::$errMode;
start:
if (!$pos || $pos >= strlen($string)) {
// if we're at the start of the string or past its end, then this is the character start
return $pos;
}
// save the start position for later, and increment before the coming decrement loop
$s = $pos++;
// examine the current byte and skip up to three continuation bytes, going backward and counting the number of examined bytes (between 1 and 4)
$t = 0;
do {
$pos--;
$t++;
$b = @$string[$pos];
} while (
$b >= "\x80" && $b <= "\xBF" && // continuation bytes
($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
$pos > 0 // stop once the start of the string has been reached
);
// attempt to extract a code point at the current position
$p = self::ord($string, $pos, $n, self::M_REPLACE);
// if the position of the character after the one we just consumed is earlier than our start position,
// then there was at least one invalid sequence between the consumed character and the start position
if ($n < $s) {
if ($errMode==self::M_SKIP) {
// if we're supposed to skip invalid sequences, there is no need to do anything
} elseif ($errMode==self::M_REPLACE) {
// if we're supposed to replace invalid sequences, return the starting offset: it is itself a character
return $s;
} else {
// otherwise if the character is invalid and we're expected to halt, halt
throw new \Exception;
public static function sync(string $string, int $pos): int {
$b = ord(@$string[$pos]);
if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
return min(max($pos,0), strlen($string));
} else {
$s = $pos;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$string[--$pos]);
}
}
// if the consumed character is valid, return the current position
if (is_int($p)) {
return $pos;
} elseif ($errMode==self::M_SKIP) {
// if we're supposed to skip invalid sequences:
if ($pos < 1) {
// if we're already at the start of the string, give up
return $pos;
if (is_null(self::ord($string, $pos, $next))) {
return $s;
} else {
// otherwise skip over the last examined byte and start over
$pos--;
goto start;
return ($next > $s) ? $pos : $s;
}
} elseif ($errMode==self::M_REPLACE) {
// if we're supposed to replace invalid sequences, return the current offset: we've synchronized
return $pos;
} else {
// otherwise if the character is invalid and we're expected to halt, halt
throw new \Exception;
}
}
@ -177,20 +118,19 @@ abstract class UTF8 {
*
* Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
*
* Upon error, returns false; if $char is the empty string or $pos is beyond the end of the string, null is returned
* Upon error, returns null; if $char is the empty string or $pos is beyond the end of the string, false is returned
*
* $next is a variable in which to store the next byte offset at which a character starts
*/
public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
public static function ord(string $string, int $pos = 0, &$next = null) {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// though it differs from a slavish implementation because it operates on only a single
// character rather than a whole stream
start:
// optimization for ASCII characters
$b = @$string[$pos];
if ($b=="") {
$next = $pos + 1;
return null;
$next = $pos;
return false;
} elseif (($b = ord($b)) < 0x80) {
$next = $pos + 1;
return $b;
@ -224,25 +164,11 @@ abstract class UTF8 {
$point = $b & 0x7;
} else { // invalid byte
$next = $pos;
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
case self::M_REPLACE:
return false;
default:
throw new \Exception;
}
return null;
}
} elseif ($b < $lower || $b > $upper) {
$next = $pos - 1;
switch ($errMode ?? self::$errMode) {
case self::M_SKIP:
goto start;
case self::M_REPLACE:
return false;
default:
throw new \Exception;
}
return null;
} else {
$lower = 0x80;
$upper = 0xBF;

19
perf/perf.php

@ -16,6 +16,13 @@ $files = [
];
$tests = [
'Intl characters' => ["intl", function(string $text) {
$i = \IntlBreakIterator::createCodePointInstance();
$i->setText($text);
foreach ($i as $b) {
\IntlChar::chr($i->getLastCodePoint());
}
}],
'Native characters (obj)' => ["", function(string $text) {
$c = null;
$i = new \MensBeam\UTF8\UTF8String($text);
@ -25,16 +32,14 @@ $tests = [
}],
'Native characters (func)' => ["", function(string $text) {
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
UTF8::get($text, $pos, $pos);
while (($p = UTF8::get($text, $pos, $pos)) !== "") {
}
}],
'Intl characters' => ["intl", function(string $text) {
'Intl code points' => ["intl", function(string $text) {
$i = \IntlBreakIterator::createCodePointInstance();
$i->setText($text);
foreach ($i as $b) {
\IntlChar::chr($i->getLastCodePoint());
$i->getLastCodePoint();
}
}],
'Native code points (obj)' => ["", function(string $text) {
@ -46,9 +51,7 @@ $tests = [
}],
'Native code points (func)' => ["", function(string $text) {
$pos = 0;
$eof = strlen($text);
while ($pos <= $eof) {
UTF8::ord($text, $pos, $pos);
while (($p = UTF8::ord($text, $pos, $pos)) !== false) {
}
}],
];

156
tests/cases/TestFunctions.php

@ -0,0 +1,156 @@
<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\UTF8\TestCase\Codec;
use MensBeam\UTF8\UTF8;
class TestFunctions extends \PHPUnit\Framework\TestCase {
/**
 * Decodes the input one character at a time with UTF8::ord() and compares
 * the collected code points against the expected list.
 *
 * Invalid sequences, which ord() reports as null, are recorded as U+FFFD.
 *
 * @dataProvider provideStrings
 * @covers \MensBeam\UTF8\UTF8::ord
 */
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
    // start from an empty list so the assertion always compares a defined value, even if the loop body never runs
    $out = [];
    $off = 0;
    // ord() writes the offset of the next character back into $off and returns false at the end of input
    while (($p = UTF8::ord($input, $off, $off)) !== false) {
        $out[] = $p ?? 0xFFFD;
    }
    // strict comparison, consistent with the step-back test which checks the same data
    $this->assertSame($exp, $out);
}
/**
 * Decodes the input one character at a time with UTF8::get() and compares
 * the collected strings against the serializations of the expected code points.
 *
 * The expectations are converted with IntlChar, so the intl extension is required.
 *
 * @dataProvider provideStrings
 * @covers \MensBeam\UTF8\UTF8::get
 * @requires extension intl
 */
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
    // the data provider supplies code points; turn them into the strings get() is expected to return
    $exp = array_map(function ($v) {
        return \IntlChar::chr($v);
    }, $exp);
    // start from an empty list so the assertion always compares a defined value, even if the loop body never runs
    $out = [];
    $off = 0;
    // get() writes the offset of the next character back into $off and returns the empty string at the end of input
    while (($p = UTF8::get($input, $off, $off)) !== "") {
        // get() is declared to return a string, so no null substitution is needed here
        $out[] = $p;
    }
    $this->assertSame($exp, $out);
}
/**
 * Reading at the end of a string must signal the end of input and leave the offset where it is.
 *
 * @covers \MensBeam\UTF8\UTF8::get
 * @covers \MensBeam\UTF8\UTF8::ord
 */
public function testTraversePastTheEndOfAString() {
    // a single four-byte character
    $text = "\u{10FFFD}";

    // string interface: the first read consumes the character, the second hits the end of input
    $cursor = 0;
    $this->assertSame(0, $cursor);
    $char = UTF8::get($text, $cursor, $cursor);
    $this->assertSame("\u{10FFFD}", $char);
    $this->assertSame(4, $cursor);
    $char = UTF8::get($text, $cursor, $cursor);
    $this->assertSame("", $char);
    $this->assertSame(4, $cursor);

    // code point interface: same sequence of reads, with false as the end-of-input marker
    $cursor = 0;
    $this->assertSame(0, $cursor);
    $point = UTF8::ord($text, $cursor, $cursor);
    $this->assertSame(0x10FFFD, $point);
    $this->assertSame(4, $cursor);
    $point = UTF8::ord($text, $cursor, $cursor);
    $this->assertSame(false, $point);
    $this->assertSame(4, $cursor);
}
/**
 * Walks from the end of the input to its start, synchronizing to the character
 * before the current offset at each step, and checks the code points found.
 *
 * @dataProvider provideStrings
 * @covers \MensBeam\UTF8\UTF8::sync
 */
public function testSTepBackThroughAString(string $input, array $points) {
    $found = [];
    for ($cursor = strlen($input); $cursor > 0; ) {
        // move to the start of the character containing the preceding byte
        $cursor = UTF8::sync($input, $cursor - 1);
        $point = UTF8::ord($input, $cursor);
        // characters are visited last to first, so prepend to end up in reading order;
        // null from ord() is recorded as U+FFFD
        array_unshift($found, $point ?? 0xFFFD);
    }
    $this->assertSame($points, $found);
}
/**
 * Seeks forward and backward through a string made of characters of every byte length.
 *
 * @covers \MensBeam\UTF8\UTF8::seek
 */
public function testSeekThroughAString() {
    /*
        Char 0  U+007A   (1 byte)  Offset 0
        Char 1  U+00A2   (2 bytes) Offset 1
        Char 2  U+6C34   (3 bytes) Offset 3
        Char 3  U+1D11E  (4 bytes) Offset 6
        Char 4  U+F8FF   (3 bytes) Offset 10
        Char 5  U+10FFFD (4 bytes) Offset 13
        Char 6  U+FFFE   (3 bytes) Offset 17
        End of string at char 7, offset 20
    */
    $input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE";
    // each step is [characters to seek, byte offset expected afterwards];
    // every seek starts from the offset the previous one returned
    $steps = [
        [0, 0],    // seeking zero characters from the start stays at the start
        [-1, 0],   // cannot move back past the start
        [1, 1],    // over the one-byte character
        [2, 6],    // over the two- and three-byte characters
        [4, 20],   // over the remaining four characters to the end of the string
        [1, 20],   // cannot move forward past the end
        [-3, 10],  // back three characters to char 4
        [-10, 0],  // back further than the string is long stops at the start
    ];
    $off = 0;
    foreach ($steps as list($num, $want)) {
        $off = UTF8::seek($input, $num, $off);
        $this->assertSame($want, $off);
    }
}
/**
 * Data provider shared by the decoding and step-back tests.
 *
 * Each named entry is a pair of [input byte string, list of expected code points].
 * The value 65533 (U+FFFD) marks a position where the tests substitute a
 * replacement character for an invalid sequence.
 */
public function provideStrings() {
return [
// control samples
'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
// various invalid sequences
'invalid code' => ["\xFF", [65533]],
'ends early' => ["\xC0", [65533]],
'ends early 2' => ["\xE0", [65533]],
'invalid trail' => ["\xC0\x00", [65533, 0]],
'invalid trail 2' => ["\xC0\xC0", [65533, 65533]],
'invalid trail 3' => ["\xE0\x00", [65533, 0]],
'invalid trail 4' => ["\xE0\xC0", [65533, 65533]],
'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]],
'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]],
// NOTE(review): the next entry uses exactly the same bytes as 'overlong U+0000 - 6 bytes' further down; confirm whether a distinct out-of-range sequence was intended
'> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
// overlong encodings: every byte of the sequence is expected to yield its own replacement character
'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]],
'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]],
'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]],
'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]],
'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]],
'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]],
'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]],
'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]],
'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]],
'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]],
'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates
'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
// self-sync edge cases: stray continuation bytes following a complete character
'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]],
'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]],
];
}
}

3
tests/phpunit.xml

@ -20,5 +20,8 @@
<testsuite name="Class instance">
<file>cases/TestInstance.php</file>
</testsuite>
<testsuite name="Static methods">
<file>cases/TestFunctions.php</file>
</testsuite>
</testsuites>
</phpunit>

Loading…
Cancel
Save