From cdd1c0182bb4f7c70975e6d8244b633672d1c209 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 14 Oct 2020 12:29:19 -0400 Subject: [PATCH] Corrected ISO 2022-JP decoder and seeker --- lib/Encoding/AbstractEncoding.php | 5 +- lib/Encoding/ISO2022JP.php | 112 ++++++++++---- tests/cases/Encoding/TestISO2022JP.php | 200 +++++++++++++++++++++++++ tests/lib/DecoderTest.php | 9 +- tests/phpunit.xml | 1 + tools/test-iso2022jp.html | 24 +++ 6 files changed, 314 insertions(+), 37 deletions(-) create mode 100644 tests/cases/Encoding/TestISO2022JP.php create mode 100644 tools/test-iso2022jp.html diff --git a/lib/Encoding/AbstractEncoding.php b/lib/Encoding/AbstractEncoding.php index ab602aa..f7790b8 100644 --- a/lib/Encoding/AbstractEncoding.php +++ b/lib/Encoding/AbstractEncoding.php @@ -78,12 +78,9 @@ abstract class AbstractEncoding implements Encoding { public function seek(int $distance): int { if ($distance > 0) { - if ($this->posByte == strlen($this->string)) { - return $distance; - } do { $p = $this->nextCode(); - } while (--$distance && $p !== false); + } while ($p !== false && --$distance); return $distance; } elseif ($distance < 0) { $distance = abs($distance); diff --git a/lib/Encoding/ISO2022JP.php b/lib/Encoding/ISO2022JP.php index ce84af0..fc1e985 100644 --- a/lib/Encoding/ISO2022JP.php +++ b/lib/Encoding/ISO2022JP.php @@ -25,18 +25,24 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { protected $mode = self::ASCII_STATE; protected $modeMark = \PHP_INT_MIN; protected $modeStack = []; + protected $dirtyEOF = 0; + + public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { + parent::__construct($string, $fatal, $allowSurrogates); + $this->stateProps[] = "dirtyEOF"; + } + public function nextChar(): string { + $code = $this->nextCode(); + if ($code !== false) { + return UTF8::encode($code); + } + return ""; + } - /** Decodes the next character from the string and returns its code point number - * - * If the end of the string has been reached, false is returned - * - * @return int|bool - */ public function nextCode() { $this->posChar++; $state = $this->mode; - assert($state < self::TRAIL_BYTE_STATE, "Invalid base state $state"); while (true) { $b = @$this->string[$this->posByte++]; $eof = ($b === ""); @@ -45,6 +51,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { if ($state < self::TRAIL_BYTE_STATE) { if ($eof) { $this->posByte--; + $this->posChar--; return false; } elseif ($b === 0x1B) { $state = self::ESCAPE_START_STATE; @@ -55,7 +62,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { } if ($state === self::ASCII_STATE) { return $b; - } elseif ($this->state === self::ROMAN_STATE) { + } elseif ($state === self::ROMAN_STATE) { if ($b === 0x5C) { return 0xA5; } elseif ($b === 0x7E) { @@ -63,26 +70,25 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { } else { return $b; } - } elseif ($this->state === self::KATAKANA_STATE) { + } elseif ($state === self::KATAKANA_STATE) { if ($b >= 0x21 && $b <= 0x5F) { return 0xFF61 - 0x21 + $b; } else { return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } - } elseif ($this->state === self::LEAD_BYTE_STATE) { - assert(!isset($lead), "Lead byte is set when it shouldn't be"); - if ($b >= 0x21 && $b <= 0x5F) { + } elseif ($state === self::LEAD_BYTE_STATE) { + if ($b >= 0x21 && $b <= 0x7E) { $lead = $b; + $state = self::TRAIL_BYTE_STATE; continue; } else { return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } - } elseif ($this->state === self::TRAIL_BYTE_STATE) { - assert(isset($lead), "Trail byte without lead byte"); + } elseif ($state === self::TRAIL_BYTE_STATE) { if ($eof || $b === 0x1B) { return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1); - } elseif ($b >= 0x21 && $b <= 0x5F) { - $pointer = ($lead - 0x21) * 94 + $b - 0x21; + } elseif ($b >= 0x21 && $b <= 0x7E) { + $pointer = (($lead - 0x21) * 94) + $b - 0x21; $codePoint = self::TABLE_JIS0208[$pointer] ?? null; if (!is_null($codePoint)) { return $codePoint; @@ -93,8 +99,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2); } } elseif ($state === self::ESCAPE_START_STATE) { - assert(!isset($lead), "Lead byte is set when it shouldn't be"); - if ($b === 0x24 || $b ===0x28) { + if ($b === 0x24 || $b === 0x28) { $lead = $b; $state = self::ESCAPE_STATE; continue; @@ -102,7 +107,6 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1); } } elseif ($state === self::ESCAPE_STATE) { - assert(isset($lead), "Trail byte without lead byte"); if ($lead === 0x28 && $b === 0x42) { $newState = self::ASCII_STATE; } elseif ($lead === 0x28 && $b === 0x4A) { @@ -120,34 +124,80 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding { return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 3); } else { $state = $this->modeSet($newState); + unset($lead); + // if we're at the end of the string, mark the string as dirty + if ($this->posByte === $this->lenByte) { + $this->dirtyEOF = 3; + } continue; } } - assert(false, "Process failed to continue"); } - assert(false, "Process failed to return a code point"); } protected function modeSet(int $mode): int { - assert($mode < self::TRAIL_BYTE_STATE, "Mode $mode is invalid"); - $this->modeStack = [$this->modeMark, $this->mode]; + $this->modeStack[] = [$this->modeMark, $this->mode]; $this->mode = $mode; $this->modeMark = $this->posByte; return $mode; } - - /** Returns the encoding of $codePoint as a byte string - * - * If $codePoint is less than 0 or greater than 1114111, an exception is thrown - * - * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted - */ + public static function encode(array $codePoints, bool $fatal = true): string { return ""; } - /** Implements backward seeking $distance characters */ protected function seekBack(int $distance): int { + if ($this->dirtyEOF && $this->posByte === $this->lenByte) { + list($this->modeMark, $this->mode) = array_pop($this->modeStack); + $this->posByte -= $this->dirtyEOF; + $this->dirtyEOF = 0; + } + while ($distance > 0 && $this->posByte > 0) { + $this->posChar--; + $distance--; + if ($this->posByte === $this->errMark) { // the previous character was malformed + // if the position also marks a mode change, pop the mode stack + if ($this->posByte === $this->modeMark) { + list($this->modeMark, $this->mode) = array_pop($this->modeStack); + } + // move to the correct sync position, pop the error stack, and continue + $this->posByte = $this->errSync; + list($this->errMark, $this->errSync) = array_pop($this->errStack); + } else { + $this->posByte -= ($this->mode === self::LEAD_BYTE_STATE ? 2 : 1); + } + // check for a mode change that is not also an error character + if ($this->posByte === $this->modeMark && $this->posByte !== $this->errMark) { + $this->posByte -= 3; + list($this->modeMark, $this->mode) = array_pop($this->modeStack); + } + } return $distance; } + + protected function stateSave(): array { + $out = parent::stateSave(); + $out['modeCount'] = sizeof($this->modeStack); + return $out; + } + + protected function stateApply(array $state) { + while (sizeof($this->modeStack) > $state['modeCount']) { + list($this->modeMark, $this->mode) = array_pop($this->modeStack); + } + unset($state['modeCount']); + parent::stateApply($state); + } + + public function rewind() { + $this->modeStack = []; + $this->modeMark = \PHP_INT_MIN; + $this->mode = self::ASCII_STATE; + $this->dirtyEOF = 0; + parent::rewind(); + } + + public function eof(): bool { + return $this->posByte === $this->lenByte || ($this->posByte === ($this->lenByte - 3) && $this->peekCode() === false); + } } diff --git a/tests/cases/Encoding/TestISO2022JP.php b/tests/cases/Encoding/TestISO2022JP.php new file mode 100644 index 0000000..e4b1ffc --- /dev/null +++ b/tests/cases/Encoding/TestISO2022JP.php @@ -0,0 +1,200 @@ + ["", []], + 'Implied ASCII mode' => ["00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]], + 'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]], + 'Roman mode' => ["1B284A 00 30 5C 7E 21 5F", [0, 48, 165, 8254, 33, 95]], + 'Katakana mode' => ["1B2849 00 30 5C 7E 21 5F", [65533, 65392, 65436, 65533, 65377, 65439]], + 'Double-byte mode 1' => ["1B2440 00 305C 7E21 5F", [65533, 31227, 65533, 65533]], + 'Double-byte mode 2' => ["1B2442 00 305C 7E21 5F", [65533, 31227, 65533, 65533]], + 'Multiple modes' => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [92, 65377, 31227, 165, 92]], + 'Double escape' => ["1B2849 1B2842 5C", [65533, 92]], + 'Triple escape' => ["1B2849 1B2842 1B284A 5C", [65533, 65533, 165]], + 'Trailing escape' => ["20 1B284A 30 33 1B2849", [32, 48, 51]], + 'Truncated escape 1' => ["1B", [65533]], + 'Truncated escape 2' => ["1B28", [65533, 40]], + 'Truncated escape 3' => ["1B2820", [65533, 40, 32]], + 'Truncated escape 4' => ["1B2020", [65533, 32, 32]], + 'Invalid escape 1' => ["1B2840", [65533, 40, 64]], + 'Invalid escape 2' => ["1B244A", [65533, 36, 74]], + 'Invalid bytes' => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533]], + ]; + } + + /** + * @dataProvider provideCodePoints + * @covers MensBeam\Intl\Encoding\ISO2022JP::encode + * @covers MensBeam\Intl\Encoding\ISO2022JP::errEnc + */ + public function testEncodeCodePoints(bool $fatal, $input, $exp) { + return parent::testEncodeCodePoints($fatal, $input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct + * @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode + * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet + */ + public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { + return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct + * @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar + * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet + */ + public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { + return parent::testDecodeMultipleCharactersAsStrings($input, $exp); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack + */ + public function testSTepBackThroughAString(string $input, array $exp) { + return parent::testSTepBackThroughAString($input, $exp); + } + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::seek + * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar + * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte + * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind + */ + public function testSeekThroughAString() { + return parent::testSeekThroughAString(); + } + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar + * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte + * @covers MensBeam\Intl\Encoding\ISO2022JP::eof + */ + public function testTraversePastTheEndOfAString() { + return parent::testTraversePastTheEndOfAString(); + } + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply + */ + public function testPeekAtCharacters() { + return parent::testPeekAtCharacters(); + } + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply + */ + public function testPeekAtCodePoints() { + return parent::testPeekAtCodePoints(); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar + * @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave + * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply + */ + public function testGetStringLength(string $input, array $points) { + return parent::testGetStringLength($input, $points); + } + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::errDec + */ + public function testReplacementModes() { + return parent::testReplacementModes(); + } + + /** + * @dataProvider provideStrings + * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind + * @covers MensBeam\Intl\Encoding\ISO2022JP::chars + * @covers MensBeam\Intl\Encoding\ISO2022JP::codes + */ + public function testIterateThroughAString(string $input, array $exp) { + return parent::testIterateThroughAString($input, $exp); + } + + /** + * @dataProvider provideStrings + * @coversNothing + */ + public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { + return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); + } + + + /** + * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack + */ + public function testSeekBackOverRandomData() { + return parent::testSeekBackOverRandomData(); + } + + /** + * @group optional + */ + public function testPedanticallyDecodeSingleCharactersAsCodePoint() { + $series = [ + ]; + foreach ($series as $test) { + foreach ($test[0] as $a => $input) { + $class = $this->testedClass; + $char = hex2bin($input); + $exp = $test[1][$a]; + $s = new $class($char); + $this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp."); + $this->assertFalse($s->nextCode(), "Sequence $input did not end after one character"); + } + } + } +} + + diff --git a/tests/lib/DecoderTest.php b/tests/lib/DecoderTest.php index fcf069e..839e622 100644 --- a/tests/lib/DecoderTest.php +++ b/tests/lib/DecoderTest.php @@ -7,6 +7,7 @@ declare(strict_types=1); namespace MensBeam\Intl\Test; use MensBeam\Intl\Encoding\DecoderException; +use MensBeam\Intl\Encoding\ISO2022JP; abstract class DecoderTest extends \PHPUnit\Framework\TestCase { protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjGrZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k="; @@ -93,10 +94,14 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase { $this->assertSame(0, $s->seek(4)); $this->assertSame(7, $s->posChar()); $this->assertSame($off[7], $s->posByte()); - + $this->assertSame(1, $s->seek(1)); $this->assertSame(7, $s->posChar()); - $this->assertSame($off[7], $s->posByte()); + if ($this->testedClass !== ISO2022JP::class) { + $this->assertSame($off[7], $s->posByte()); + } else { + $this->assertSame($off[7] + 3, $s->posByte()); + } $this->assertSame(0, $s->seek(-3)); $this->assertSame(4, $s->posChar()); diff --git a/tests/phpunit.xml b/tests/phpunit.xml index 9c45978..4296878 100644 --- a/tests/phpunit.xml +++ b/tests/phpunit.xml @@ -27,6 +27,7 @@ cases/Encoding/TestBig5.php cases/Encoding/TestEUCKR.php cases/Encoding/TestShiftJIS.php + cases/Encoding/TestISO2022JP.php cases/TestEncoding.php diff --git a/tools/test-iso2022jp.html b/tools/test-iso2022jp.html new file mode 100644 index 0000000..c481ba2 --- /dev/null +++ b/tools/test-iso2022jp.html @@ -0,0 +1,24 @@ + + + + +