From d5327a3b83d847ddd80ec59d75115b60b6b75eec Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 30 Aug 2018 12:26:50 -0400 Subject: [PATCH] Implement x-user-defined decoder Also further refactored tests to better account for one-way encodings --- lib/Encoding/XUserDefined.php | 91 ++++++++++++ tests/cases/Encoding/TestGB18030.php | 2 +- tests/cases/Encoding/TestSingleByte.php | 8 +- tests/cases/Encoding/TestUTF16LE.php | 16 +-- tests/cases/Encoding/TestUTF8.php | 2 +- tests/cases/Encoding/TestXUserDefined.php | 132 ++++++++++++++++++ tests/lib/CoderDecoderTest.php | 23 +++ .../lib/{EncodingTest.php => DecoderTest.php} | 14 +- tests/phpunit.xml | 1 + 9 files changed, 256 insertions(+), 33 deletions(-) create mode 100644 lib/Encoding/XUserDefined.php create mode 100644 tests/cases/Encoding/TestXUserDefined.php create mode 100644 tests/lib/CoderDecoderTest.php rename tests/lib/{EncodingTest.php => DecoderTest.php} (94%) diff --git a/lib/Encoding/XUserDefined.php b/lib/Encoding/XUserDefined.php new file mode 100644 index 0000000..d90d236 --- /dev/null +++ b/lib/Encoding/XUserDefined.php @@ -0,0 +1,91 @@ +string[$this->posChar]; + if ($b === "") { + return ""; + } + $this->posChar++; + $p = ord($b); + if ($p < 0x80) { + // if the byte is an ASCII character or end of input, simply return it + return $b; + } else { + return UTF8::encode(0xF700 + $p); + } + } + + /** Decodes the next character from the string and returns its code point number + * + * If the end of the string has been reached, false is returned + * + * @return int|bool + */ + public function nextCode() { + // get the byte at the current position + $b = @$this->string[$this->posChar]; + if ($b === "") { + return false; + } + $this->posChar++; + $p = ord($b); + if ($p < 0x80) { + // if the byte is an ASCII character or end of input, simply return it + return $p; + } else { + return 0xF700 + $p; + } + } + + /** Advance $distance characters through the string + * + * If $distance is negative, the operation will be performed in reverse + * + * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned + */ + public function seek(int $distance): int { + if ($distance > 0) { + while ($this->posChar < $this->lenByte && $distance > 0) { + $this->nextCode(); + $distance--; + } + return $distance; + } elseif ($distance < 0) { + $distance = abs($distance); + while ($this->posChar > 0 && $distance > 0) { + $this->posChar--; + $distance--; + } + return $distance; + } else { + return 0; + } + } + + /** Returns the current byte position of the decoder */ + public function posByte(): int { + return $this->posChar; + } + + /** Calculates the length of the string in code points + * + * Note that this may involve processing to the end of the string + */ + public function len(): int { + return $this->lenByte; + } +} diff --git a/tests/cases/Encoding/TestGB18030.php b/tests/cases/Encoding/TestGB18030.php index 6831c2e..cdc0af6 100644 --- a/tests/cases/Encoding/TestGB18030.php +++ b/tests/cases/Encoding/TestGB18030.php @@ -10,7 +10,7 @@ use MensBeam\Intl\Encoding\GBK; use MensBeam\Intl\Encoding\GB18030; use MensBeam\Intl\Encoding\EncoderException; -class TestGB18030 extends \MensBeam\Intl\Test\EncodingTest { +class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest { protected $testedClass = GB18030::class; /* Char 0 U+007A (1 byte) Offset 0 diff --git a/tests/cases/Encoding/TestSingleByte.php b/tests/cases/Encoding/TestSingleByte.php index a9c310d..bb06f42 100644 --- a/tests/cases/Encoding/TestSingleByte.php +++ b/tests/cases/Encoding/TestSingleByte.php @@ -9,7 +9,7 @@ namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\SingleByteEncoding; use MensBeam\Intl\Encoding\EncoderException; -class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest { +class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest { // maps taken from https://github.com/web-platform-tests/wpt/blob/d6c29bef8d4bcdfe4f689defca73360b07647d71/encoding/single-byte-decoder.html // ISO-8859-8 was duplicated for ISO-8859-8-I protected static $maps = [ @@ -115,11 +115,13 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest { } /** + * @dataProvider provideStrings * @coversNothing */ - public function testSTepBackThroughAString(string $input = "", array $exp = []) { + public function testSTepBackThroughAString(string $input, array $exp, string $class = SingleByteEncoding::class) { // this test has no meaning for single-byte encodings - $this->assertTrue(true); + $this->testedClass = $class; + return parent::testSTepBackThroughAString($input, $exp); } /** diff --git a/tests/cases/Encoding/TestUTF16LE.php b/tests/cases/Encoding/TestUTF16LE.php index dabacd4..0916fc9 100644 --- a/tests/cases/Encoding/TestUTF16LE.php +++ b/tests/cases/Encoding/TestUTF16LE.php @@ -9,7 +9,7 @@ namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\UTF16LE; use MensBeam\Intl\Encoding\UTF16BE; -class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest { +class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { protected $testedClass = UTF16LE::class; /* Char 0 U+007A (2 byte) Offset 0 @@ -28,15 +28,6 @@ class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest { protected $brokenChar = "0000 00DC 0000"; protected $lowerA = "a\x00"; - /** - * @dataProvider provideCodePoints - * @coversNothing - */ - public function testEncodeCodePoints(bool $fatal, $input, $exp) { - // UTF-16 has no encoder - $this->assertTrue(true); - } - /** * @dataProvider provideStrings * @covers MensBeam\Intl\Encoding\UTF16::__construct @@ -126,11 +117,6 @@ class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest { return parent::testIterateThroughAString($input, $exp); } - public function provideCodePoints() { - // UTF-16 has no encoder - return [[true, 0, ""]]; - } - public function provideStrings() { return [ // control samples diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index 921fb22..96b2f76 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -9,7 +9,7 @@ namespace MensBeam\Intl\TestCase\Encoding; use MensBeam\Intl\Encoding\UTF8; use MensBeam\Intl\Encoding\EncoderException; -class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest { +class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { protected $testedClass = UTF8::class; /* Char 0 U+007A (1 byte) Offset 0 diff --git a/tests/cases/Encoding/TestXUserDefined.php b/tests/cases/Encoding/TestXUserDefined.php new file mode 100644 index 0000000..1c380fc --- /dev/null +++ b/tests/cases/Encoding/TestXUserDefined.php @@ -0,0 +1,132 @@ +assertTrue(true); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\XUserDefined::rewind + * @covers MensBeam\Intl\Encoding\XUserDefined::chars + * @covers MensBeam\Intl\Encoding\XUserDefined::codes + */ + public function testIterateThroughAString(string $input, array $exp) { + return parent::testIterateThroughAString($input, $exp); + } + + public function provideStrings() { + $a_bytes = []; + $a_codes = []; + for ($a = 0; $a < 0x80; $a++) { + $a_bytes[] = strtoupper(bin2hex(chr($a))); + $a_codes[] = $a; + } + $p_bytes = []; + $p_codes = []; + for ($a = 0; $a < 0x80; $a++) { + $p_bytes[] = strtoupper(bin2hex(chr(0x80 + $a))); + $p_codes[] = 0xF780 + $a; + } + $a_bytes = implode(" ", $a_bytes); + $p_bytes = implode(" ", $p_bytes); + return [ + 'empty string' => ["", []], + 'ASCI bytes' => [$a_bytes, $a_codes], + 'private-use bytes' => [$p_bytes, $p_codes], + ]; + } +} diff --git a/tests/lib/CoderDecoderTest.php b/tests/lib/CoderDecoderTest.php new file mode 100644 index 0000000..5efb93f --- /dev/null +++ b/tests/lib/CoderDecoderTest.php @@ -0,0 +1,23 @@ +testedClass; + if ($exp instanceof \Throwable) { + $this->expectException(get_class($exp)); + $this->expectExceptionCode($exp->getCode()); + } else { + $exp = strtolower(str_replace(" ", "", $exp)); + } + $out = $class::encode($input, $fatal); + $this->assertSame($exp, bin2hex($out)); + } +} diff --git a/tests/lib/EncodingTest.php b/tests/lib/DecoderTest.php similarity index 94% rename from tests/lib/EncodingTest.php rename to tests/lib/DecoderTest.php index c5b9808..b1ccd30 100644 --- a/tests/lib/EncodingTest.php +++ b/tests/lib/DecoderTest.php @@ -6,21 +6,9 @@ declare(strict_types=1); namespace MensBeam\Intl\Test; -use MensBeam\Intl\Encoding\EncoderException; use MensBeam\Intl\Encoding\DecoderException; -abstract class EncodingTest extends \PHPUnit\Framework\TestCase { - public function testEncodeCodePoints(bool $fatal, $input, $exp) { - $class = $this->testedClass; - if ($exp instanceof \Throwable) { - $this->expectException(get_class($exp)); - $this->expectExceptionCode($exp->getCode()); - } else { - $exp = strtolower(str_replace(" ", "", $exp)); - } - $out = $class::encode($input, $fatal); - $this->assertSame($exp, bin2hex($out)); - } +abstract class DecoderTest extends \PHPUnit\Framework\TestCase { public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { $class = $this->testedClass; diff --git a/tests/phpunit.xml b/tests/phpunit.xml index 7bf4948..2188102 100644 --- a/tests/phpunit.xml +++ b/tests/phpunit.xml @@ -22,6 +22,7 @@ cases/Encoding/TestUTF16LE.php cases/Encoding/TestUTF16BE.php cases/Encoding/TestSingleByte.php + cases/Encoding/TestXUserDefined.php cases/Encoding/TestGB18030.php