diff --git a/lib/Encoding/AbstractEncoding.php b/lib/Encoding/AbstractEncoding.php
index ab602aa..f7790b8 100644
--- a/lib/Encoding/AbstractEncoding.php
+++ b/lib/Encoding/AbstractEncoding.php
@@ -78,12 +78,9 @@ abstract class AbstractEncoding implements Encoding {
public function seek(int $distance): int {
if ($distance > 0) {
- if ($this->posByte == strlen($this->string)) {
- return $distance;
- }
do {
$p = $this->nextCode();
- } while (--$distance && $p !== false);
+ } while ($p !== false && --$distance);
return $distance;
} elseif ($distance < 0) {
$distance = abs($distance);
diff --git a/lib/Encoding/ISO2022JP.php b/lib/Encoding/ISO2022JP.php
index ce84af0..fc1e985 100644
--- a/lib/Encoding/ISO2022JP.php
+++ b/lib/Encoding/ISO2022JP.php
@@ -25,18 +25,24 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
protected $mode = self::ASCII_STATE;
protected $modeMark = \PHP_INT_MIN;
protected $modeStack = [];
+ protected $dirtyEOF = 0; // number of bytes of an escape sequence consumed at the very end of the string (nextCode() sets it to 3); 0 when there is none
+
+ public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { // passes all three arguments to the parent constructor unchanged
+ parent::__construct($string, $fatal, $allowSurrogates);
+ $this->stateProps[] = "dirtyEOF"; // NOTE(review): presumably stateProps lists the properties the parent's stateSave()/stateApply() snapshot - confirm against AbstractEncoding
+ }
+ public function nextChar(): string { // returns the next character as produced by UTF8::encode(), or an empty string once nextCode() reports false
+ $code = $this->nextCode();
+ if ($code !== false) {
+ return UTF8::encode($code);
+ }
+ return "";
+ }
- /** Decodes the next character from the string and returns its code point number
- *
- * If the end of the string has been reached, false is returned
- *
- * @return int|bool
- */
public function nextCode() {
$this->posChar++;
$state = $this->mode;
- assert($state < self::TRAIL_BYTE_STATE, "Invalid base state $state");
while (true) {
$b = @$this->string[$this->posByte++];
$eof = ($b === "");
@@ -45,6 +51,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
if ($state < self::TRAIL_BYTE_STATE) {
if ($eof) {
$this->posByte--;
+ $this->posChar--;
return false;
} elseif ($b === 0x1B) {
$state = self::ESCAPE_START_STATE;
@@ -55,7 +62,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
}
if ($state === self::ASCII_STATE) {
return $b;
- } elseif ($this->state === self::ROMAN_STATE) {
+ } elseif ($state === self::ROMAN_STATE) {
if ($b === 0x5C) {
return 0xA5;
} elseif ($b === 0x7E) {
@@ -63,26 +70,25 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
} else {
return $b;
}
- } elseif ($this->state === self::KATAKANA_STATE) {
+ } elseif ($state === self::KATAKANA_STATE) {
if ($b >= 0x21 && $b <= 0x5F) {
return 0xFF61 - 0x21 + $b;
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
- } elseif ($this->state === self::LEAD_BYTE_STATE) {
- assert(!isset($lead), "Lead byte is set when it shouldn't be");
- if ($b >= 0x21 && $b <= 0x5F) {
+ } elseif ($state === self::LEAD_BYTE_STATE) {
+ if ($b >= 0x21 && $b <= 0x7E) {
$lead = $b;
+ $state = self::TRAIL_BYTE_STATE;
continue;
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
- } elseif ($this->state === self::TRAIL_BYTE_STATE) {
- assert(isset($lead), "Trail byte without lead byte");
+ } elseif ($state === self::TRAIL_BYTE_STATE) {
if ($eof || $b === 0x1B) {
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
- } elseif ($b >= 0x21 && $b <= 0x5F) {
- $pointer = ($lead - 0x21) * 94 + $b - 0x21;
+ } elseif ($b >= 0x21 && $b <= 0x7E) {
+ $pointer = (($lead - 0x21) * 94) + $b - 0x21;
$codePoint = self::TABLE_JIS0208[$pointer] ?? null;
if (!is_null($codePoint)) {
return $codePoint;
@@ -93,8 +99,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
}
} elseif ($state === self::ESCAPE_START_STATE) {
- assert(!isset($lead), "Lead byte is set when it shouldn't be");
- if ($b === 0x24 || $b ===0x28) {
+ if ($b === 0x24 || $b === 0x28) {
$lead = $b;
$state = self::ESCAPE_STATE;
continue;
@@ -102,7 +107,6 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
}
} elseif ($state === self::ESCAPE_STATE) {
- assert(isset($lead), "Trail byte without lead byte");
if ($lead === 0x28 && $b === 0x42) {
$newState = self::ASCII_STATE;
} elseif ($lead === 0x28 && $b === 0x4A) {
@@ -120,34 +124,80 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 3);
} else {
$state = $this->modeSet($newState);
+ unset($lead);
+ // if we're at the end of the string, mark the string as dirty
+ if ($this->posByte === $this->lenByte) {
+ $this->dirtyEOF = 3;
+ }
continue;
}
}
- assert(false, "Process failed to continue");
}
- assert(false, "Process failed to return a code point");
}
protected function modeSet(int $mode): int {
- assert($mode < self::TRAIL_BYTE_STATE, "Mode $mode is invalid");
- $this->modeStack = [$this->modeMark, $this->mode];
+ $this->modeStack[] = [$this->modeMark, $this->mode]; // append the outgoing [modeMark, mode] pair so seekBack(), stateApply() and similar callers can pop it again later
$this->mode = $mode;
$this->modeMark = $this->posByte;
return $mode;
}
-
- /** Returns the encoding of $codePoint as a byte string
- *
- * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
- *
- * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
- */
+
public static function encode(array $codePoints, bool $fatal = true): string {
return "";
}
- /** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
+ if ($this->dirtyEOF && $this->posByte === $this->lenByte) { // an escape sequence was consumed at the very end of the string: undo its mode change and step back over its bytes first
+ list($this->modeMark, $this->mode) = array_pop($this->modeStack);
+ $this->posByte -= $this->dirtyEOF;
+ $this->dirtyEOF = 0;
+ }
+ while ($distance > 0 && $this->posByte > 0) { // one character per iteration, until the request is satisfied or the start of the string is reached; whatever is left of $distance is returned
+ $this->posChar--;
+ $distance--;
+ if ($this->posByte === $this->errMark) { // the previous character was malformed
+ // if the position also marks a mode change, pop the mode stack
+ if ($this->posByte === $this->modeMark) {
+ list($this->modeMark, $this->mode) = array_pop($this->modeStack);
+ }
+ // move to the correct sync position, pop the error stack, and continue
+ $this->posByte = $this->errSync;
+ list($this->errMark, $this->errSync) = array_pop($this->errStack);
+ } else {
+ $this->posByte -= ($this->mode === self::LEAD_BYTE_STATE ? 2 : 1); // a well-formed character occupies two bytes in LEAD_BYTE_STATE mode and one byte in every other mode
+ }
+ // check for a mode change that is not also an error character
+ if ($this->posByte === $this->modeMark && $this->posByte !== $this->errMark) {
+ $this->posByte -= 3; // step back over the escape sequence; assumes it is always three bytes long, the same length nextCode() records in dirtyEOF
+ list($this->modeMark, $this->mode) = array_pop($this->modeStack);
+ }
+ }
return $distance;
}
+
+ protected function stateSave(): array { // returns the parent's saved state plus a 'modeCount' entry holding the current depth of the mode stack
+ $out = parent::stateSave();
+ $out['modeCount'] = sizeof($this->modeStack);
+ return $out;
+ }
+
+ protected function stateApply(array $state) { // unwinds any mode changes pushed since the state was saved, then hands the remaining state to the parent
+ while (sizeof($this->modeStack) > $state['modeCount']) { // pop until the stack is back at its saved depth, restoring modeMark and mode on each pop
+ list($this->modeMark, $this->mode) = array_pop($this->modeStack);
+ }
+ unset($state['modeCount']); // remove the key added by this class's stateSave() before delegating
+ parent::stateApply($state);
+ }
+
+ public function rewind() { // resets the mode-tracking properties to their declared defaults, then lets the parent rewind
+ $this->modeStack = [];
+ $this->modeMark = \PHP_INT_MIN;
+ $this->mode = self::ASCII_STATE;
+ $this->dirtyEOF = 0;
+ parent::rewind();
+ }
+
+ public function eof(): bool { // true when the byte position is at the end, or when exactly three bytes remain and peekCode() finds no further character (e.g. only a trailing escape sequence is left)
+ return $this->posByte === $this->lenByte || ($this->posByte === ($this->lenByte - 3) && $this->peekCode() === false);
+ }
}
diff --git a/tests/cases/Encoding/TestISO2022JP.php b/tests/cases/Encoding/TestISO2022JP.php
new file mode 100644
index 0000000..e4b1ffc
--- /dev/null
+++ b/tests/cases/Encoding/TestISO2022JP.php
@@ -0,0 +1,200 @@
+ ["", []],
+ 'Implied ASCII mode' => ["00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
+ 'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
+ 'Roman mode' => ["1B284A 00 30 5C 7E 21 5F", [0, 48, 165, 8254, 33, 95]],
+ 'Katakana mode' => ["1B2849 00 30 5C 7E 21 5F", [65533, 65392, 65436, 65533, 65377, 65439]],
+ 'Double-byte mode 1' => ["1B2440 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
+ 'Double-byte mode 2' => ["1B2442 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
+ 'Multiple modes' => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [92, 65377, 31227, 165, 92]],
+ 'Double escape' => ["1B2849 1B2842 5C", [65533, 92]],
+ 'Triple escape' => ["1B2849 1B2842 1B284A 5C", [65533, 65533, 165]],
+ 'Trailing escape' => ["20 1B284A 30 33 1B2849", [32, 48, 51]],
+ 'Truncated escape 1' => ["1B", [65533]],
+ 'Truncated escape 2' => ["1B28", [65533, 40]],
+ 'Truncated escape 3' => ["1B2820", [65533, 40, 32]],
+ 'Truncated escape 4' => ["1B2020", [65533, 32, 32]],
+ 'Invalid escape 1' => ["1B2840", [65533, 40, 64]],
+ 'Invalid escape 2' => ["1B244A", [65533, 36, 74]],
+ 'Invalid bytes' => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533]],
+ ];
+ }
+
+ /**
+ * Delegates to the parent's testEncodeCodePoints(), passing the provideCodePoints data through unchanged
+ *
+ * @dataProvider provideCodePoints
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::encode
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::errEnc
+ */
+ public function testEncodeCodePoints(bool $fatal, $input, $exp) {
+ return parent::testEncodeCodePoints($fatal, $input, $exp);
+ }
+
+ /**
+ * Delegates to the parent's testDecodeMultipleCharactersAsCodePoints(), passing the provideStrings data through unchanged
+ *
+ * @dataProvider provideStrings
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
+ */
+ public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
+ return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
+ }
+
+ /**
+ * Delegates to the parent's testDecodeMultipleCharactersAsStrings(), passing the provideStrings data through unchanged
+ *
+ * @dataProvider provideStrings
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
+ */
+ public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
+ return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
+ }
+
+ /**
+ * Delegates to the parent's testSTepBackThroughAString(), passing the provideStrings data through unchanged
+ *
+ * @dataProvider provideStrings
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
+ */
+ public function testSTepBackThroughAString(string $input, array $exp) {
+ return parent::testSTepBackThroughAString($input, $exp);
+ }
+
+ /**
+ * Delegates to the parent's testSeekThroughAString(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::seek
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
+ */
+ public function testSeekThroughAString() {
+ return parent::testSeekThroughAString();
+ }
+
+ /**
+ * Delegates to the parent's testTraversePastTheEndOfAString(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::eof
+ */
+ public function testTraversePastTheEndOfAString() {
+ return parent::testTraversePastTheEndOfAString();
+ }
+
+ /**
+ * Delegates to the parent's testPeekAtCharacters(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
+ */
+ public function testPeekAtCharacters() {
+ return parent::testPeekAtCharacters();
+ }
+
+ /**
+ * Delegates to the parent's testPeekAtCodePoints(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
+ */
+ public function testPeekAtCodePoints() {
+ return parent::testPeekAtCodePoints();
+ }
+
+ /**
+ * Delegates to the parent's testGetStringLength(), passing the provideStrings data through unchanged
+ *
+ * @dataProvider provideStrings
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
+ */
+ public function testGetStringLength(string $input, array $points) {
+ return parent::testGetStringLength($input, $points);
+ }
+
+ /**
+ * Delegates to the parent's testReplacementModes(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::errDec
+ */
+ public function testReplacementModes() {
+ return parent::testReplacementModes();
+ }
+
+ /**
+ * Delegates to the parent's testIterateThroughAString(), passing the provideStrings data through unchanged
+ *
+ * @dataProvider provideStrings
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::chars
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::codes
+ */
+ public function testIterateThroughAString(string $input, array $exp) {
+ return parent::testIterateThroughAString($input, $exp);
+ }
+
+ /**
+ * Delegates to the parent's testIterateThroughAStringAllowingSurrogates(), passing the provideStrings data through unchanged
+ *
+ * The visible provideStrings rows supply two values each, so $relaxedExp is left at its null default for them.
+ *
+ * @dataProvider provideStrings
+ * @coversNothing
+ */
+ public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+ return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+ }
+
+
+ /**
+ * Delegates to the parent's testSeekBackOverRandomData(); takes no arguments of its own
+ *
+ * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
+ */
+ public function testSeekBackOverRandomData() {
+ return parent::testSeekBackOverRandomData();
+ }
+
+ /**
+ * Checks that each hex-encoded input decodes to exactly one expected code point followed by end of input
+ *
+ * Each $series entry is a pair: index 0 holds the hex inputs and index 1 holds the
+ * expected code points under the same keys. $series is currently empty, so the loops
+ * never run and no assertion is made.
+ *
+ * @group optional
+ */
+ public function testPedanticallyDecodeSingleCharactersAsCodePoint() {
+ $series = [
+ ];
+ foreach ($series as $test) {
+ foreach ($test[0] as $a => $input) {
+ $class = $this->testedClass;
+ $char = hex2bin($input);
+ $exp = $test[1][$a];
+ $s = new $class($char);
+ $this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp.");
+ $this->assertFalse($s->nextCode(), "Sequence $input did not end after one character");
+ }
+ }
+ }
+}
+
+
diff --git a/tests/lib/DecoderTest.php b/tests/lib/DecoderTest.php
index fcf069e..839e622 100644
--- a/tests/lib/DecoderTest.php
+++ b/tests/lib/DecoderTest.php
@@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\Intl\Test;
use MensBeam\Intl\Encoding\DecoderException;
+use MensBeam\Intl\Encoding\ISO2022JP;
abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjGrZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k=";
@@ -93,10 +94,14 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$this->assertSame(0, $s->seek(4));
$this->assertSame(7, $s->posChar());
$this->assertSame($off[7], $s->posByte());
-
+
$this->assertSame(1, $s->seek(1));
$this->assertSame(7, $s->posChar());
- $this->assertSame($off[7], $s->posByte());
+ if ($this->testedClass !== ISO2022JP::class) {
+ $this->assertSame($off[7], $s->posByte());
+ } else {
+ $this->assertSame($off[7] + 3, $s->posByte());
+ }
$this->assertSame(0, $s->seek(-3));
$this->assertSame(4, $s->posChar());
diff --git a/tests/phpunit.xml b/tests/phpunit.xml
index 9c45978..4296878 100644
--- a/tests/phpunit.xml
+++ b/tests/phpunit.xml
@@ -27,6 +27,7 @@
cases/Encoding/TestBig5.php
cases/Encoding/TestEUCKR.php
cases/Encoding/TestShiftJIS.php
+ cases/Encoding/TestISO2022JP.php
cases/TestEncoding.php
diff --git a/tools/test-iso2022jp.html b/tools/test-iso2022jp.html
new file mode 100644
index 0000000..c481ba2
--- /dev/null
+++ b/tools/test-iso2022jp.html
@@ -0,0 +1,24 @@
+
+
+
+
+