Browse Source

Implement fatal replacement mode

labels
J. King 6 years ago
parent
commit
cb1cab9d84
  1. 10
      lib/Encoding/DecoderException.php
  2. 10
      lib/Encoding/EncoderException.php
  3. 10
      lib/Encoding/EncodingException.php
  4. 71
      lib/Encoding/UTF8.php
  5. 79
      tests/cases/Encoding/TestUTF8.php

10
lib/Encoding/DecoderException.php

@ -0,0 +1,10 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Exception signalling a decoding failure
 *
 * UTF8::err() throws this in fatal decoder mode (MODE_FATAL_DEC) when an
 * invalid byte sequence is encountered, and also when it is handed an
 * unrecognized replacement mode.
 */
class DecoderException extends EncodingException {
}

10
lib/Encoding/EncoderException.php

@ -0,0 +1,10 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Exception signalling an encoding failure
 *
 * UTF8::encode() throws this when asked to serialize a code point outside
 * the Unicode range.
 */
class EncoderException extends EncodingException {
}

10
lib/Encoding/EncodingException.php

@ -0,0 +1,10 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Common base class of DecoderException and EncoderException
 *
 * Catching this type catches failures from both decoding and encoding.
 */
class EncodingException extends \Exception {
}

71
lib/Encoding/UTF8.php

@ -7,11 +7,22 @@ declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class UTF8 implements \Iterator {
const MODE_NULL = 0;
const MODE_REPLACE = 1;
const MODE_HTML = 2;
const MODE_FATAL_DEC = 3;
const MODE_FATAL_ENC = 4;
const E_INVALID_CODE_POINT = 1;
const E_INVALID_BYTE = 2;
const E_INVALID_MODE = 3;
protected $string;
protected $posByte = 0;
protected $posChar = 0;
protected $lenByte = null;
protected $lenChar = null;
protected $errMode = self::MODE_REPLACE;
protected $current;
public function rewind() {
@ -36,9 +47,10 @@ class UTF8 implements \Iterator {
$this->current = null;
}
public function __construct(string $string) {
public function __construct(string $string, bool $fatal = false) {
$this->string = $string;
$this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
}
public function posByte(): int {
@ -65,7 +77,7 @@ class UTF8 implements \Iterator {
return $b;
} else {
// otherwise return the serialization of the code point at the current position
return UTF8::encode($this->nextCode() ?? 0xFFFD);
return UTF8::encode($this->nextCode());
}
}
@ -75,8 +87,6 @@ class UTF8 implements \Iterator {
*/
public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// though it differs from a slavish implementation because it operates on only a single
// character rather than a whole stream
// optimization for ASCII characters
$b = @$this->string[$this->posByte];
if ($b === "") {
@ -115,11 +125,10 @@ class UTF8 implements \Iterator {
}
$point = $b & 0x7;
} else { // invalid byte
return null;
return self::err($this->errMode, [$this->posChar, $this->posByte]);
}
} elseif ($b < $lower || $b > $upper) {
$this->posByte--;
return null;
return self::err($this->errMode, [$this->posChar, $this->posByte--]);
} else {
$lower = 0x80;
$upper = 0xBF;
@ -153,11 +162,14 @@ class UTF8 implements \Iterator {
// if we're already at the start of the string, we can't go further back
return $distance;
}
$mode = $this->errMode;
$this->errMode = self::MODE_NULL;
do {
$this->sync($this->posByte - 1);
// manually decrement the character position
$this->posChar--;
} while (--$distance && $this->posByte);
$this->errMode = $mode;
return $distance;
} else {
return 0;
@ -168,10 +180,13 @@ class UTF8 implements \Iterator {
public function peekChar(int $num = 1): string {
$out = "";
$state = $this->stateSave();
while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
$out .= $b;
try {
while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
$out .= $b;
}
} finally {
$this->stateApply($state);
}
$this->stateApply($state);
return $out;
}
@ -179,10 +194,13 @@ class UTF8 implements \Iterator {
public function peekCode(int $num = 1): array {
$out = [];
$state = $this->stateSave();
while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
$out[] = $b;
try {
while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
$out[] = $b;
}
} finally {
$this->stateApply($state);
}
$this->stateApply($state);
return $out;
}
@ -235,14 +253,37 @@ class UTF8 implements \Iterator {
}
}
/** Handles an encoding or decoding error according to a replacement mode
 *
 * @param int $mode One of the MODE_* class constants
 * @param mixed $data Context for the error. The decoder passes a two-member
 *  array of [character offset, byte offset]; the encoder-oriented modes
 *  (MODE_HTML, MODE_FATAL_ENC) interpolate it directly as the offending code point
 * @return int|string|null Null for MODE_NULL, U+FFFD for MODE_REPLACE, or a
 *  numeric character reference for MODE_HTML
 * @throws DecoderException in MODE_FATAL_DEC, or if $mode is not recognized
 * @throws EncoderException in MODE_FATAL_ENC
 */
protected static function err(int $mode, $data = null) {
    switch ($mode) {
        case self::MODE_NULL:
            // used internally during backward seeking
            return null;
        case self::MODE_REPLACE:
            // standard "replace" mode
            return 0xFFFD;
        case self::MODE_HTML: // @codeCoverageIgnore
            // the "html" replacement mode; not applicable to Unicode transformation formats
            return "&#".(string) $data.";"; // @codeCoverageIgnore
        case self::MODE_FATAL_DEC:
            // fatal replacement mode for decoders
            throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
        case self::MODE_FATAL_ENC: // @codeCoverageIgnore
            // fatal replacement mode for encoders; not applicable to Unicode transformation formats
            // the failure concerns a code point rather than a byte, so report it with the
            // same error code encode() uses for its own EncoderException
            throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_CODE_POINT); // @codeCoverageIgnore
        default:
            // indicative of internal bug; should never be triggered
            throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
    }
}
/** Returns the UTF-8 encoding of $codePoint
*
* If $codePoint is less than 0 or greater than 1114111, an EncoderException is thrown
*/
public static function encode(int $codePoint): string {
public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
return "";
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
} elseif ($codePoint < 128) {
return chr($codePoint);
} elseif ($codePoint < 0x800) {

79
tests/cases/Encoding/TestUTF8.php

@ -7,21 +7,22 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF8;
use MensBeam\Intl\Encoding\EncoderException;
use MensBeam\Intl\Encoding\DecoderException;
class TestUTF8 extends \PHPUnit\Framework\TestCase {
/**
* @dataProvider provideCodePoints
* @covers MensBeam\Intl\Encoding\UTF8::encode
*/
public function testEncodeCodePoints() {
$input = [122, 162, 27700, 119070, 63743, 1114109, 65534];
$exp = ["\x7A", "\xC2\xA2", "\xE6\xB0\xB4", "\xF0\x9D\x84\x9E", "\xEF\xA3\xBF", "\xF4\x8F\xBF\xBD", "\xEF\xBF\xBE"];
for ($a = 0; $a < sizeof($input); $a++) {
$out = UTF8::encode($input[$a]);
$this->assertSame(bin2hex($exp[$a]), bin2hex($out), "Character $a was not encoded correctly");
public function testEncodeCodePoints(int $input, $exp) {
if ($exp instanceof \Throwable) {
$this->expectException(get_class($exp));
$this->expectExceptionCode($exp->getCode());
}
$this->assertSame("", UTF8::encode(\PHP_INT_MAX));
$this->assertSame("", UTF8::encode(\PHP_INT_MIN));
$out = UTF8::encode($input);
$this->assertSame(bin2hex($exp), bin2hex($out));
}
/**
@ -33,7 +34,7 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase {
$s = new UTF8($input);
$out = [];
while (($p = $s->nextCode()) !== false) {
$out[] = $p ?? 0xFFFD;
$out[] = $p;
}
$this->assertEquals($exp, $out);
}
@ -86,9 +87,9 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase {
$s = new UTF8($input);
$a = 0;
$this->assertTrue(true); // prevent risky test of empty string
while (($p1 = $s->nextCode() ?? 0xFFFD) !== false) {
while (($p1 = $s->nextCode()) !== false) {
$this->assertSame(0, $s->seek(-1));
$p2 = $s->nextCode() ?? 0xFFFD;
$p2 = $s->nextCode();
$this->assertSame($p1, $p2, "Mismatch at character position $a");
$this->assertSame(++$a, $s->posChar(), "Character position should be $a");
}
@ -277,6 +278,62 @@ class TestUTF8 extends \PHPUnit\Framework\TestCase {
$this->assertSame($posByte, $s->posByte());
}
/**
 * Exercises the decoder's error handling: U+FFFD substitution by default, and exceptions in fatal mode
 *
 * @covers MensBeam\Intl\Encoding\UTF8::err
 */
public function testReplacementModes() {
    // an invalid byte (0xFF) sandwiched between two ASCII zeroes
    $bytes = "\x30\xFF\x30";
    // officially test replacement characters and null replacement (already effectively tested by other tests)
    $decoder = new UTF8($bytes, false);
    $decoder->seek(1);
    $this->assertSame(0xFFFD, $decoder->nextCode());
    $decoder->seek(-2);
    // test fatal mode
    $decoder = new UTF8($bytes, true);
    $decoder->seek(1);
    try {
        $result = $decoder->nextCode();
    } catch (DecoderException $thrown) {
        $result = $thrown;
    } finally {
        $this->assertInstanceOf(DecoderException::class, $result);
    }
    // the character position is expected to have moved past the invalid byte despite the exception
    $this->assertSame(2, $decoder->posChar());
    $this->assertSame(0x30, $decoder->nextCode());
    $decoder->seek(-2);
    $this->assertSame(1, $decoder->posChar());
    // both peeking methods should throw as well, and leave the position where it was
    foreach (["peekCode", "peekChar"] as $peek) {
        try {
            $result = $decoder->$peek();
        } catch (DecoderException $thrown) {
            $result = $thrown;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $result);
        }
        $this->assertSame(1, $decoder->posChar());
    }
}
/** Supplies code points paired with either their expected UTF-8 bytes or the exception expected when encoding them
 *
 * Each data set is keyed by its code point and has the shape [code point, expectation].
 */
public function provideCodePoints() {
    // code points which can be encoded, mapped to their expected serialization
    $encodable = [
        122     => "\x7A",
        162     => "\xC2\xA2",
        27700   => "\xE6\xB0\xB4",
        119070  => "\xF0\x9D\x84\x9E",
        63743   => "\xEF\xA3\xBF",
        1114109 => "\xF4\x8F\xBF\xBD",
        65534   => "\xEF\xBF\xBE",
    ];
    $series = [];
    foreach ($encodable as $codePoint => $bytes) {
        $series[$codePoint] = [$codePoint, $bytes];
    }
    // values immediately outside either end of the Unicode range are expected to fail
    foreach ([-1, 1114112] as $codePoint) {
        $series[$codePoint] = [$codePoint, new EncoderException("", UTF8::E_INVALID_CODE_POINT)];
    }
    return $series;
}
public function provideStrings() {
return [
// control samples

Loading…
Cancel
Save