From 808b4128dd721b1d45ec6b56337ea7caac4d3e27 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 17 Oct 2020 13:52:32 -0400 Subject: [PATCH] Tests for replacement encoding; readme correction --- README.md | 13 +- lib/Encoding/Replacement.php | 6 +- tests/cases/Encoding/TestReplacement.php | 201 +++++++++++++++++++++++ tests/phpunit.xml | 1 + 4 files changed, 206 insertions(+), 15 deletions(-) create mode 100644 tests/cases/Encoding/TestReplacement.php diff --git a/README.md b/README.md index 6cc7199..2d1b11b 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,7 @@ While PHP's [internationalization extension][PHP_INTL] offers excellent and extensive functionality for dealing with human languages, character encodings, and various related things, it is not always available. Moreover, its character decoder does not yield the same results as [WHATWG's Encoding standard][ENCODING], making it unsuitable for implementing parsers for URLs or HTML. The more widely used [multi-byte string extension][PHP_MBSTRING] not only suffers the same problems, but is also very slow. -Included here is a partial suite of WHATWG-compatible seekable string decoders which are reasonably performant while requiring no external dependencies or PHP extensions. At present it includes the following encodings: - -* UTF-8 -* UTF-16 -* gb18030 -* GBK -* Big5 -* EUC-KR -* all single-byte encodings -* x-user-defined - -Where applicable, code point encoders are also included. In time it will be extended to cover the entire suite of WHATWG character encodings, and may also provide other character-centric internationalization functionality. +Included here is a complete suite of WHATWG-compatible seekable string decoders which are reasonably performant while requiring no external dependencies or PHP extensions. Where applicable, code point encoders are also included. In time it may also provide other character-centric internationalization functionality. [PHP_INTL]: https://php.net/manual/en/book.intl.php [PHP_MBSTRING]: https://php.net/manual/en/book.mbstring.php diff --git a/lib/Encoding/Replacement.php b/lib/Encoding/Replacement.php index 0845852..318698b 100644 --- a/lib/Encoding/Replacement.php +++ b/lib/Encoding/Replacement.php @@ -51,7 +51,7 @@ class Replacement implements Encoding { public function nextCode() { if (!$this->eof()) { try { - return $this->peekCode(); + return $this->peekCode()[0]; } finally { $this->done = true; $this->posErr = 1; @@ -80,7 +80,7 @@ class Replacement implements Encoding { } public function peekChar(int $num = 1): string { - if (!$this->eof()) { + if (!$this->eof() && $num > 0) { if ($this->fatal) { throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE); } @@ -90,7 +90,7 @@ class Replacement implements Encoding { } public function peekCode(int $num = 1): array { - if (!$this->eof()) { + if (!$this->eof() && $num > 0) { if ($this->fatal) { throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE); } diff --git a/tests/cases/Encoding/TestReplacement.php b/tests/cases/Encoding/TestReplacement.php new file mode 100644 index 0000000..76fe3fd --- /dev/null +++ b/tests/cases/Encoding/TestReplacement.php @@ -0,0 +1,201 @@ + ["", []], + 'Arbitrary string 1' => ["20", [0xFFFD]], + 'Arbitrary string 2' => ["64 8B 20 00 FF A5", [0xFFFD]], + ]; + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::__construct + * @covers MensBeam\Intl\Encoding\Replacement::nextCode + */ + public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { + return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::__construct + * @covers MensBeam\Intl\Encoding\Replacement::nextChar + */ + public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { + return parent::testDecodeMultipleCharactersAsStrings($input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::seek + */ + public function testSTepBackThroughAString(string $input, array $exp) { + return parent::testSTepBackThroughAString($input, $exp); + } + + /** + * @coversNothing + */ + public function testSeekThroughAString() { + $this->assertTrue(true); + } + + /** + * @covers MensBeam\Intl\Encoding\Replacement::posChar + * @covers MensBeam\Intl\Encoding\Replacement::posByte + * @covers MensBeam\Intl\Encoding\Replacement::seek + * @covers MensBeam\Intl\Encoding\Replacement::eof + */ + public function testTraversePastTheEndOfAString() { + $d = new Replacement("a"); + $this->assertFalse($d->eof()); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + $d->seek(1); + $this->assertTrue($d->eof()); + $this->assertSame(1, $d->posChar()); + $this->assertSame(1, $d->posByte()); + $d->seek(1); + $this->assertTrue($d->eof()); + $this->assertSame(1, $d->posChar()); + $this->assertSame(1, $d->posByte()); + } + + /** + * @covers MensBeam\Intl\Encoding\Replacement::peekChar + * @covers MensBeam\Intl\Encoding\Replacement::posChar + * @covers MensBeam\Intl\Encoding\Replacement::posByte + */ + public function testPeekAtCharacters() { + $d = new Replacement("A"); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + $this->assertSame("\u{FFFD}", $d->peekChar(2112)); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + $this->assertSame("", $d->peekChar(0)); + $this->assertSame("", $d->peekChar(-2112)); + } + + /** + * @covers MensBeam\Intl\Encoding\Replacement::peekCode + * @covers MensBeam\Intl\Encoding\Replacement::posChar + * @covers MensBeam\Intl\Encoding\Replacement::posByte + */ + public function testPeekAtCodePoints() { + $d = new Replacement("A"); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + $this->assertSame([0xFFFD], $d->peekCode(2112)); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + $this->assertSame([], $d->peekCode(0)); + $this->assertSame([], $d->peekCode(-2112)); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::lenChar + * @covers MensBeam\Intl\Encoding\Replacement::lenByte + */ + public function testGetStringLength(string $input, array $points) { + return parent::testGetStringLength($input, $points); + } + + /** + * @covers MensBeam\Intl\Encoding\Replacement::nextChar + * @covers MensBeam\Intl\Encoding\Replacement::nextCode + * @covers MensBeam\Intl\Encoding\Replacement::peekChar + * @covers MensBeam\Intl\Encoding\Replacement::peekCode + * @covers MensBeam\Intl\Encoding\Replacement::rewind + * @covers MensBeam\Intl\Encoding\Replacement::posChar + * @covers MensBeam\Intl\Encoding\Replacement::posByte + */ + public function testReplacementModes() { + $d = new Replacement("VVVVVV", true); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + try { + $p = $d->peekCode(); + } catch (\Exception $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(0, $d->posErr); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + try { + $p = $d->nextCode(); + } catch (\Exception $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(1, $d->posErr); + $this->assertSame(1, $d->posChar()); + $this->assertSame(6, $d->posByte()); + $d->rewind(); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + try { + $p = $d->peekChar(); + } catch (\Exception $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(1, $d->posErr); + $this->assertSame(0, $d->posChar()); + $this->assertSame(0, $d->posByte()); + try { + $p = $d->nextChar(); + } catch (\Exception $e) { + $p = $e; + } finally { + $this->assertInstanceOf(DecoderException::class, $p); + } + $this->assertSame(1, $d->posErr); + $this->assertSame(1, $d->posChar()); + $this->assertSame(6, $d->posByte()); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::rewind + * @covers MensBeam\Intl\Encoding\Replacement::chars + * @covers MensBeam\Intl\Encoding\Replacement::codes + */ + public function testIterateThroughAString(string $input, array $exp) { + return parent::testIterateThroughAString($input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\Replacement::nextCode + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + + /** + * @coversNothing + */ + public function testSeekBackOverRandomData() { + return parent::testSeekBackOverRandomData(); + } +} diff --git a/tests/phpunit.xml b/tests/phpunit.xml index 4296878..ccdb1f6 100644 --- a/tests/phpunit.xml +++ b/tests/phpunit.xml @@ -28,6 +28,7 @@ cases/Encoding/TestEUCKR.php cases/Encoding/TestShiftJIS.php cases/Encoding/TestISO2022JP.php + cases/Encoding/TestReplacement.php cases/TestEncoding.php