diff --git a/lib/Encoding/DecoderException.php b/lib/Encoding/DecoderException.php new file mode 100644 index 0000000..57d1740 --- /dev/null +++ b/lib/Encoding/DecoderException.php @@ -0,0 +1,10 @@ +current = null; } - public function __construct(string $string) { + public function __construct(string $string, bool $fatal = false) { $this->string = $string; $this->lenByte = strlen($string); + $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; } public function posByte(): int { @@ -65,7 +77,7 @@ class UTF8 implements \Iterator { return $b; } else { // otherwise return the serialization of the code point at the current position - return UTF8::encode($this->nextCode() ?? 0xFFFD); + return UTF8::encode($this->nextCode()); } } @@ -75,8 +87,6 @@ class UTF8 implements \Iterator { */ public function nextCode() { // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder - // though it differs from a slavish implementation because it operates on only a single - // character rather than a whole stream // optimization for ASCII characters $b = @$this->string[$this->posByte]; if ($b === "") { @@ -115,11 +125,10 @@ class UTF8 implements \Iterator { } $point = $b & 0x7; } else { // invalid byte - return null; + return self::err($this->errMode, [$this->posChar, $this->posByte]); } } elseif ($b < $lower || $b > $upper) { - $this->posByte--; - return null; + return self::err($this->errMode, [$this->posChar, $this->posByte--]); } else { $lower = 0x80; $upper = 0xBF; @@ -153,11 +162,14 @@ class UTF8 implements \Iterator { // if we're already at the start of the string, we can't go further back return $distance; } + $mode = $this->errMode; + $this->errMode = self::MODE_NULL; do { $this->sync($this->posByte - 1); // manually decrement the character position $this->posChar--; } while (--$distance && $this->posByte); + $this->errMode = $mode; return $distance; } else { return 0; @@ -168,10 +180,13 @@ class UTF8 implements \Iterator { public 
function peekChar(int $num = 1): string { $out = ""; $state = $this->stateSave(); - while ($num-- > 0 && ($b = $this->nextChar()) !== "") { - $out .= $b; + try { + while ($num-- > 0 && ($b = $this->nextChar()) !== "") { + $out .= $b; + } + } finally { + $this->stateApply($state); } - $this->stateApply($state); return $out; } @@ -179,10 +194,13 @@ class UTF8 implements \Iterator { public function peekCode(int $num = 1): array { $out = []; $state = $this->stateSave(); - while ($num-- > 0 && ($b = $this->nextCode()) !== false) { - $out[] = $b; + try { + while ($num-- > 0 && ($b = $this->nextCode()) !== false) { + $out[] = $b; + } + } finally { + $this->stateApply($state); } - $this->stateApply($state); return $out; } @@ -235,14 +253,37 @@ class UTF8 implements \Iterator { } } + protected static function err(int $mode, $data = null) { + switch($mode) { + case self::MODE_NULL: + // used internally during backward seeking + return null; + case self::MODE_REPLACE: + // standard "replace" mode + return 0xFFFD; + case self::MODE_HTML: // @codeCoverageIgnore + // the "html" replacement mode; not applicable to Unicode transformation formats + return "&#".(string) $data.";"; // @codeCoverageIgnore + case self::MODE_FATAL_DEC: + // fatal replacement mode for decoders + throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); + case self::MODE_FATAL_ENC: // @codeCoverageIgnore + // fatal replacement mode for encoders; not applicable to Unicode transformation formats + throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore + default: + // indicative of internal bug; should never be triggered + throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore + } + } + /** Returns the UTF-8 encoding of $codePoint * * If $codePoint is less than 0 or greater than 1114111, an empty string is
returned */ - public static function encode(int $codePoint): string { + public static function encode(int $codePoint, bool $fatal = true): string { // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder if ($codePoint < 0 || $codePoint > 0x10FFFF) { - return ""; + throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); } elseif ($codePoint < 128) { return chr($codePoint); } elseif ($codePoint < 0x800) { diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index c6beb62..3d18c54 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -7,21 +7,22 @@ declare(strict_types=1); namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\UTF8; +use MensBeam\Intl\Encoding\EncoderException; +use MensBeam\Intl\Encoding\DecoderException; class TestUTF8 extends \PHPUnit\Framework\TestCase { /** + * @dataProvider provideCodePoints * @covers MensBeam\Intl\Encoding\UTF8::encode */ - public function testEncodeCodePoints() { - $input = [122, 162, 27700, 119070, 63743, 1114109, 65534]; - $exp = ["\x7A", "\xC2\xA2", "\xE6\xB0\xB4", "\xF0\x9D\x84\x9E", "\xEF\xA3\xBF", "\xF4\x8F\xBF\xBD", "\xEF\xBF\xBE"]; - for ($a = 0; $a < sizeof($input); $a++) { - $out = UTF8::encode($input[$a]); - $this->assertSame(bin2hex($exp[$a]), bin2hex($out), "Character $a was not encoded correctly"); + public function testEncodeCodePoints(int $input, $exp) { + if ($exp instanceof \Throwable) { + $this->expectException(get_class($exp)); + $this->expectExceptionCode($exp->getCode()); } - $this->assertSame("", UTF8::encode(\PHP_INT_MAX)); - $this->assertSame("", UTF8::encode(\PHP_INT_MIN)); + $out = UTF8::encode($input); + $this->assertSame(bin2hex($exp), bin2hex($out)); } /** @@ -33,7 +34,7 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase { $s = new UTF8($input); $out = []; while (($p = $s->nextCode()) !== false) { - $out[] = $p ?? 
0xFFFD; + $out[] = $p; } $this->assertEquals($exp, $out); } @@ -86,9 +87,9 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase { $s = new UTF8($input); $a = 0; $this->assertTrue(true); // prevent risky test of empty string - while (($p1 = $s->nextCode() ?? 0xFFFD) !== false) { + while (($p1 = $s->nextCode()) !== false) { $this->assertSame(0, $s->seek(-1)); - $p2 = $s->nextCode() ?? 0xFFFD; + $p2 = $s->nextCode(); $this->assertSame($p1, $p2, "Mismatch at character position $a"); $this->assertSame(++$a, $s->posChar(), "Character position should be $a"); } @@ -277,6 +278,62 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase { $this->assertSame($posByte, $s->posByte()); } + /** + * @covers MensBeam\Intl\Encoding\UTF8::err + */ + public function testReplacementModes() { + $input = "\x30\xFF\x30"; + // officially test replacement characters and null replacement (already effectively tested by other tests) + $s = new UTF8($input, false); + $s->seek(1); + $this->assertSame(0xFFFD, $s->nextCode()); + $s->seek(-2); + // test fatal mode + $s = new UTF8($input, true); + $s->seek(1); + try { + $p = $s->nextCode(); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(2, $s->posChar()); + $this->assertSame(0x30, $s->nextCode()); + $s->seek(-2); + $this->assertSame(1, $s->posChar()); + try { + $p = $s->peekCode(); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(1, $s->posChar()); + try { + $p = $s->peekChar(); + } catch (DecoderException $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(1, $s->posChar()); + } + + public function provideCodePoints() { + return [ + "122" => [122, "\x7A"], + "162" => [162, "\xC2\xA2"], + "27700" => [27700, "\xE6\xB0\xB4"], + "119070" => [119070, "\xF0\x9D\x84\x9E"], + "63743" => [63743, "\xEF\xA3\xBF"], + 
"1114109" => [1114109, "\xF4\x8F\xBF\xBD"], + "65534" => [65534, "\xEF\xBF\xBE"], + "-1" => [-1, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], + "1114112" => [1114112, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], + ]; + } + public function provideStrings() { return [ // control samples