Browse Source

Corrected ISO 2022-JP decoder and seeker

multi-byte
J. King 4 years ago
parent
commit
cdd1c0182b
  1. 5
      lib/Encoding/AbstractEncoding.php
  2. 112
      lib/Encoding/ISO2022JP.php
  3. 200
      tests/cases/Encoding/TestISO2022JP.php
  4. 9
      tests/lib/DecoderTest.php
  5. 1
      tests/phpunit.xml
  6. 24
      tools/test-iso2022jp.html

5
lib/Encoding/AbstractEncoding.php

@ -78,12 +78,9 @@ abstract class AbstractEncoding implements Encoding {
public function seek(int $distance): int {
if ($distance > 0) {
if ($this->posByte == strlen($this->string)) {
return $distance;
}
do {
$p = $this->nextCode();
} while (--$distance && $p !== false);
} while ($p !== false && --$distance);
return $distance;
} elseif ($distance < 0) {
$distance = abs($distance);

112
lib/Encoding/ISO2022JP.php

@ -25,18 +25,24 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
protected $mode = self::ASCII_STATE;
protected $modeMark = \PHP_INT_MIN;
protected $modeStack = [];
protected $dirtyEOF = 0;
/** Constructs a new decoder
 *
 * The three arguments are passed unchanged to the parent constructor; "dirtyEOF" is then appended to the list of state properties
 *
 * NOTE(review): the list of state properties is presumably what stateSave() and stateApply() use to snapshot and restore the decoder; confirm in AbstractEncoding
 */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
parent::__construct($string, $fatal, $allowSurrogates);
// register the trailing-escape marker as part of the decoder's state
$this->stateProps[] = "dirtyEOF";
}
/** Decodes the next character from the string and returns it as a UTF-8 string
 *
 * If nextCode() reports the end of the string (false), an empty string is returned
 */
public function nextChar(): string {
    $codePoint = $this->nextCode();
    // false signals that there is nothing left to decode
    return ($codePoint === false) ? "" : UTF8::encode($codePoint);
}
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
$this->posChar++;
$state = $this->mode;
assert($state < self::TRAIL_BYTE_STATE, "Invalid base state $state");
while (true) {
$b = @$this->string[$this->posByte++];
$eof = ($b === "");
@ -45,6 +51,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
if ($state < self::TRAIL_BYTE_STATE) {
if ($eof) {
$this->posByte--;
$this->posChar--;
return false;
} elseif ($b === 0x1B) {
$state = self::ESCAPE_START_STATE;
@ -55,7 +62,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
}
if ($state === self::ASCII_STATE) {
return $b;
} elseif ($this->state === self::ROMAN_STATE) {
} elseif ($state === self::ROMAN_STATE) {
if ($b === 0x5C) {
return 0xA5;
} elseif ($b === 0x7E) {
@ -63,26 +70,25 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
} else {
return $b;
}
} elseif ($this->state === self::KATAKANA_STATE) {
} elseif ($state === self::KATAKANA_STATE) {
if ($b >= 0x21 && $b <= 0x5F) {
return 0xFF61 - 0x21 + $b;
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
} elseif ($this->state === self::LEAD_BYTE_STATE) {
assert(!isset($lead), "Lead byte is set when it shouldn't be");
if ($b >= 0x21 && $b <= 0x5F) {
} elseif ($state === self::LEAD_BYTE_STATE) {
if ($b >= 0x21 && $b <= 0x7E) {
$lead = $b;
$state = self::TRAIL_BYTE_STATE;
continue;
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
} elseif ($this->state === self::TRAIL_BYTE_STATE) {
assert(isset($lead), "Trail byte without lead byte");
} elseif ($state === self::TRAIL_BYTE_STATE) {
if ($eof || $b === 0x1B) {
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
} elseif ($b >= 0x21 && $b <= 0x5F) {
$pointer = ($lead - 0x21) * 94 + $b - 0x21;
} elseif ($b >= 0x21 && $b <= 0x7E) {
$pointer = (($lead - 0x21) * 94) + $b - 0x21;
$codePoint = self::TABLE_JIS0208[$pointer] ?? null;
if (!is_null($codePoint)) {
return $codePoint;
@ -93,8 +99,7 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
}
} elseif ($state === self::ESCAPE_START_STATE) {
assert(!isset($lead), "Lead byte is set when it shouldn't be");
if ($b === 0x24 || $b ===0x28) {
if ($b === 0x24 || $b === 0x28) {
$lead = $b;
$state = self::ESCAPE_STATE;
continue;
@ -102,7 +107,6 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
}
} elseif ($state === self::ESCAPE_STATE) {
assert(isset($lead), "Trail byte without lead byte");
if ($lead === 0x28 && $b === 0x42) {
$newState = self::ASCII_STATE;
} elseif ($lead === 0x28 && $b === 0x4A) {
@ -120,34 +124,80 @@ class ISO2022JP extends AbstractEncoding implements StatefulEncoding {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 3);
} else {
$state = $this->modeSet($newState);
unset($lead);
// if we're at the end of the string, mark the string as dirty
if ($this->posByte === $this->lenByte) {
$this->dirtyEOF = 3;
}
continue;
}
}
assert(false, "Process failed to continue");
}
assert(false, "Process failed to return a code point");
}
/** Switches the decoder to a new base mode, remembering the one being replaced
 *
 * The current mode and the byte offset at which it took effect are pushed onto the mode stack as a pair, so that seekBack() and stateApply() can later pop them to undo the change
 *
 * @param int $mode The mode to switch to; must be lower than TRAIL_BYTE_STATE
 * @return int The mode which was set
 */
protected function modeSet(int $mode): int {
    assert($mode < self::TRAIL_BYTE_STATE, "Mode $mode is invalid");
    // push a [mark, mode] pair; the stack must never be overwritten here, otherwise
    // earlier mode changes are lost and popped entries are no longer pairs
    $this->modeStack[] = [$this->modeMark, $this->mode];
    $this->mode = $mode;
    // the new mode takes effect at the current byte offset
    $this->modeMark = $this->posByte;
    return $mode;
}
/** Returns the encoding of $codePoints as a byte string
 *
 * NOTE: this is currently a stub: it ignores both arguments and always returns an empty string
 *
 * The intended contract appears to be the following (TODO confirm once implemented):
 *
 * If a code point is less than 0 or greater than 1114111, an exception is thrown
 *
 * If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
 */
public static function encode(array $codePoints, bool $fatal = true): string {
return "";
}
/** Implements backward seeking $distance characters
 *
 * Walks the byte position backwards one character at a time, undoing mode changes and decoding errors recorded on the mode and error stacks along the way
 *
 * Returns the number of characters which could not be traversed because the start of the string was reached
 */
protected function seekBack(int $distance): int {
// nextCode() sets dirtyEOF to 3 when a mode-changing escape sequence is the last thing
// in the string; in that case undo the mode change and step back over the escape
// sequence before doing anything else
if ($this->dirtyEOF && $this->posByte === $this->lenByte) {
list($this->modeMark, $this->mode) = array_pop($this->modeStack);
$this->posByte -= $this->dirtyEOF;
$this->dirtyEOF = 0;
}
// each iteration steps back over exactly one character
while ($distance > 0 && $this->posByte > 0) {
$this->posChar--;
$distance--;
if ($this->posByte === $this->errMark) { // the previous character was malformed
// if the position also marks a mode change, pop the mode stack
if ($this->posByte === $this->modeMark) {
list($this->modeMark, $this->mode) = array_pop($this->modeStack);
}
// move to the correct sync position, pop the error stack, and continue
$this->posByte = $this->errSync;
list($this->errMark, $this->errSync) = array_pop($this->errStack);
} else {
// well-formed characters are two bytes wide in double-byte mode and one byte wide otherwise
$this->posByte -= ($this->mode === self::LEAD_BYTE_STATE ? 2 : 1);
}
// check for a mode change that is not also an error character
if ($this->posByte === $this->modeMark && $this->posByte !== $this->errMark) {
// step back over the three-byte escape sequence and restore the mode which preceded it
$this->posByte -= 3;
list($this->modeMark, $this->mode) = array_pop($this->modeStack);
}
}
return $distance;
}
/** Captures the decoder's state
 *
 * The parent's snapshot is extended with "modeCount", the current depth of the mode stack, which stateApply() uses to unwind the stack
 */
protected function stateSave(): array {
    $snapshot = parent::stateSave();
    // only the depth is recorded; the stack entries themselves stay on the decoder
    $snapshot['modeCount'] = count($this->modeStack);
    return $snapshot;
}
/** Restores a state previously captured by stateSave()
 *
 * The mode stack is first popped back down to the recorded depth, restoring the mode and mode mark at each step; the remaining state is then handed to the parent
 */
protected function stateApply(array $state) {
    $savedDepth = $state['modeCount'];
    // the parent does not receive the "modeCount" entry
    unset($state['modeCount']);
    while (count($this->modeStack) > $savedDepth) {
        list($this->modeMark, $this->mode) = array_pop($this->modeStack);
    }
    parent::stateApply($state);
}
/** Returns the decoder to its initial state
 *
 * The trailing-escape marker, mode, mode mark and mode stack are reset to their initial values before the parent performs its own rewinding
 */
public function rewind() {
    $this->dirtyEOF = 0;
    $this->mode = self::ASCII_STATE;
    $this->modeMark = \PHP_INT_MIN;
    $this->modeStack = [];
    parent::rewind();
}
/** Reports whether the end of the string has been reached
 *
 * Besides the byte position being at the very end of the string, this is also true when exactly three bytes remain and peeking finds no further character
 */
public function eof(): bool {
    if ($this->posByte === $this->lenByte) {
        return true;
    }
    // anything other than exactly three remaining bytes cannot be a lone trailing escape sequence
    if ($this->posByte !== ($this->lenByte - 3)) {
        return false;
    }
    return $this->peekCode() === false;
}
}

200
tests/cases/Encoding/TestISO2022JP.php

@ -0,0 +1,200 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\ISO2022JP;
use MensBeam\Intl\Encoding\Encoding;
use MensBeam\Intl\Encoding\EncoderException;
/** Test cases for the ISO-2022-JP decoder
 *
 * Nearly every test method delegates to the inherited method of the same name, so this class mainly supplies ISO2022JP-specific fixtures and coverage annotations
 *
 * Byte strings in the fixtures are written in hexadecimal. NOTE(review): the inherited tests presumably strip the spaces and hex-decode them; confirm in CoderDecoderTest
 */
class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
// the class under test
protected $testedClass = ISO2022JP::class;
/*
Layout of the seek string below:

Char 0 U+007A (1 byte) Offset 0
Esc: Katakana (3 bytes) Offset 1
Char 1 U+FF9C (1 byte) Offset 4
Char 2 U+FF9F (1 byte) Offset 5
Esc: Double-byte (3 bytes) Offset 6
Char 3 U+79FB (2 bytes) Offset 9
Char 4 U+67B8 (2 bytes) Offset 11
Char 5 U+9B91 (2 bytes) Offset 13
Esc: ASCII (3 bytes) Offset 15
Char 6 U+007E (1 byte) Offset 18
Esc: Roman (3 bytes) Offset 19
End of string at char 7, offset 22
*/
protected $seekString = "7A 1B2849 5C 5F 1B2440 305C 5B4E 723A 1B2842 7E 1B284A";
// the code points of characters 0 through 6 of the seek string, in order
protected $seekCodes = [0x7A, 0xFF9C, 0xFF9F, 0x79FB, 0x67B8, 0x9B91, 0x7E];
// byte offsets after reading n characters; an escape sequence is counted with the character
// which follows it (e.g. index 2 is 1 + 3 + 1 = 5), so the trailing escape sequence is not
// included in the last entry (19 rather than 22)
protected $seekOffsets = [0, 1, 5, 6, 11, 13, 15, 19];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
// no encoder test data is defined yet; this feeds testEncodeCodePoints
public function provideCodePoints() {
return [
];
}
// each entry maps a label to a hexadecimal byte string and the code points it is expected to
// decode to; 65533 is U+FFFD
public function provideStrings() {
return [
'empty string' => ["", []],
'Implied ASCII mode' => ["00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
'Roman mode' => ["1B284A 00 30 5C 7E 21 5F", [0, 48, 165, 8254, 33, 95]],
'Katakana mode' => ["1B2849 00 30 5C 7E 21 5F", [65533, 65392, 65436, 65533, 65377, 65439]],
'Double-byte mode 1' => ["1B2440 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
'Double-byte mode 2' => ["1B2442 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
'Multiple modes' => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [92, 65377, 31227, 165, 92]],
'Double escape' => ["1B2849 1B2842 5C", [65533, 92]],
'Triple escape' => ["1B2849 1B2842 1B284A 5C", [65533, 65533, 165]],
'Trailing escape' => ["20 1B284A 30 33 1B2849", [32, 48, 51]],
'Truncated escape 1' => ["1B", [65533]],
'Truncated escape 2' => ["1B28", [65533, 40]],
'Truncated escape 3' => ["1B2820", [65533, 40, 32]],
'Truncated escape 4' => ["1B2020", [65533, 32, 32]],
'Invalid escape 1' => ["1B2840", [65533, 40, 64]],
'Invalid escape 2' => ["1B244A", [65533, 36, 74]],
'Invalid bytes' => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533]],
];
}
/**
* @dataProvider provideCodePoints
* @covers MensBeam\Intl\Encoding\ISO2022JP::encode
* @covers MensBeam\Intl\Encoding\ISO2022JP::errEnc
*/
public function testEncodeCodePoints(bool $fatal, $input, $exp) {
return parent::testEncodeCodePoints($fatal, $input, $exp);
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
* @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode
* @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
*/
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
* @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar
* @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
*/
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
*/
public function testSTepBackThroughAString(string $input, array $exp) {
return parent::testSTepBackThroughAString($input, $exp);
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::seek
* @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
* @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
* @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
*/
public function testSeekThroughAString() {
return parent::testSeekThroughAString();
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
* @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
* @covers MensBeam\Intl\Encoding\ISO2022JP::eof
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
*/
public function testPeekAtCharacters() {
return parent::testPeekAtCharacters();
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
*/
public function testPeekAtCodePoints() {
return parent::testPeekAtCodePoints();
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar
* @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
* @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
*/
public function testGetStringLength(string $input, array $points) {
return parent::testGetStringLength($input, $points);
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::errDec
*/
public function testReplacementModes() {
return parent::testReplacementModes();
}
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
* @covers MensBeam\Intl\Encoding\ISO2022JP::chars
* @covers MensBeam\Intl\Encoding\ISO2022JP::codes
*/
public function testIterateThroughAString(string $input, array $exp) {
return parent::testIterateThroughAString($input, $exp);
}
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
/**
* @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
*/
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
/**
* @group optional
*/
public function testPedanticallyDecodeSingleCharactersAsCodePoint() {
// no series are defined yet, so this test currently performs no assertions; each series is
// expected to be a pair of parallel arrays: hexadecimal inputs and their expected code points
$series = [
];
foreach ($series as $test) {
foreach ($test[0] as $a => $input) {
$class = $this->testedClass;
$char = hex2bin($input);
$exp = $test[1][$a];
$s = new $class($char);
// each input must decode to exactly one code point, followed by the end of the string
$this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp.");
$this->assertFalse($s->nextCode(), "Sequence $input did not end after one character");
}
}
}
}

9
tests/lib/DecoderTest.php

@ -7,6 +7,7 @@ declare(strict_types=1);
namespace MensBeam\Intl\Test;
use MensBeam\Intl\Encoding\DecoderException;
use MensBeam\Intl\Encoding\ISO2022JP;
abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjGrZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k=";
@ -93,10 +94,14 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
$this->assertSame(0, $s->seek(4));
$this->assertSame(7, $s->posChar());
$this->assertSame($off[7], $s->posByte());
$this->assertSame(1, $s->seek(1));
$this->assertSame(7, $s->posChar());
$this->assertSame($off[7], $s->posByte());
if ($this->testedClass !== ISO2022JP::class) {
$this->assertSame($off[7], $s->posByte());
} else {
$this->assertSame($off[7] + 3, $s->posByte());
}
$this->assertSame(0, $s->seek(-3));
$this->assertSame(4, $s->posChar());

1
tests/phpunit.xml

@ -27,6 +27,7 @@
<file>cases/Encoding/TestBig5.php</file>
<file>cases/Encoding/TestEUCKR.php</file>
<file>cases/Encoding/TestShiftJIS.php</file>
<file>cases/Encoding/TestISO2022JP.php</file>
<file>cases/TestEncoding.php</file>
</testsuite>
</testsuites>

24
tools/test-iso2022jp.html

@ -0,0 +1,24 @@
<!DOCTYPE html>
<meta charset=iso-2022-jp>
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data -->
<!-- The variables defined below are presumably consumed by test.js, which is loaded at the end of this page (not shown here; confirm) -->
<script>
// labelled sample inputs, written as hexadecimal bytes
var sampleStrings = {
'empty string': "",
'Implied ASCII mode': "00 30 5C 7E 21 5F",
'Explicit ASCII mode': "1B2842 00 30 5C 7E 21 5F",
'Roman mode': "1B284A 00 30 5C 7E 21 5F",
'Katakana mode': "1B2849 00 30 5C 7E 21 5F",
'Double-byte mode 1': "1B2440 00 30 5C 7E 21 5F",
'Double-byte mode 2': "1B2442 00 30 5C 7E 21 5F",
'Multiple modes': "5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C",
'Double escape': "1B2849 1B2842 5C",
'Triple escape': "1B2849 1B2842 1B284A 5C",
'Trailing escape': "20 1B284A 30 33 1B2849",
'Invalid bytes': "80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF",
};
// no single-character samples are defined yet
var sampleCharacters = {
};
// no seek code points are defined yet
var seekCodePoints = [
];
</script>
<script src="test.js"></script>
Loading…
Cancel
Save