Browse Source

Finally fix Shift_JIS seeker

multi-byte
J. King 4 years ago
parent
commit
915aa7ca93
  1. 10
      lib/Encoding/ShiftJIS.php
  2. 16
      tests/cases/Encoding/TestGB18030.php
  3. 16
      tests/cases/Encoding/TestShiftJIS.php
  4. 2
      tests/lib/DecoderTest.php
  5. 20
      tools/test-shiftjis.html

10
lib/Encoding/ShiftJIS.php

@ -120,19 +120,19 @@ class ShiftJIS extends AbstractEncoding implements StatelessEncoding {
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x40 || $b1 > 0xFC || $b1 == 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
if ($b1 < 0x40 || $b1 > 0xFC || $b1 === 0x7F || $this->posByte === 0 || $this->posByte === $this->errMark) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character
// the two bytes form a character
continue;
} elseif ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence
if ($b2 < 0x81 || $b2 > 0xFC || ($b2 >= 0xA0 && $b2 <= 0xDF)) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
continue;
} elseif ($this->posByte === $this->errMark || $this->posByte === 0) { // position is unambiguously the start of a character
// the two bytes form a character
continue;
} else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
$start = $this->posByte + 2;
$pos = $this->posByte;

16
tests/cases/Encoding/TestGB18030.php

@ -144,6 +144,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
/**
* Runs the inherited random-data seek-back test for this test class.
*
* The body only delegates to the parent implementation and returns its result;
* the override presumably exists so the coverage annotation below is attributed
* to GB18030::seekBack (confirm against the parent test class).
*
* @covers MensBeam\Intl\Encoding\GB18030::seekBack
*/
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
public function provideCodePoints() {
// bytes confirmed using Firefox
$series_gb18030 = [
@ -200,14 +208,6 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
}
}
/**
* Runs the inherited random-data seek-back test for this test class.
*
* The body only delegates to the parent implementation and returns its result;
* the override presumably exists so the coverage annotation below is attributed
* to GB18030::seekBack (confirm against the parent test class).
*
* @covers MensBeam\Intl\Encoding\GB18030::seekBack
*/
public function testSeekBackOverRandomData() {
return parent::testSeekBackOverRandomData();
}
public function provideStrings() {
return [
'empty string' => ["", []],

16
tests/cases/Encoding/TestShiftJIS.php

@ -12,16 +12,6 @@ use MensBeam\Intl\Encoding\EncoderException;
class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest {
protected $testedClass = ShiftJIS::class;
/*
Char 0 U+007A (1 byte) Offset 0
Char 1 U+86CC (2 bytes) Offset 1
Char 2 U+6C34 (2 bytes) Offset 3
Char 3 U+00CA (0 bytes) Offset 5
Char 4 U+0304 (2 bytes) Offset 5
Char 5 U+00EA (0 bytes) Offset 7
Char 6 U+030C (2 bytes) Offset 7
End of string at char 7, offset 9
*/
protected $seekString = "";
protected $seekCodes = [];
protected $seekOffsets = [];
@ -151,6 +141,12 @@ class TestShiftJIS extends \MensBeam\Intl\Test\CoderDecoderTest {
/**
* Supplies the named sample strings for the Shift_JIS tests.
*
* Each entry maps a descriptive label to a two-element array:
*  - a string of space-separated hexadecimal byte groups (the encoded input), and
*  - a list of integers, written in decimal, which match the Unicode code points
*    those bytes decode to under Shift_JIS.
*
* NOTE(review): how the parent test class consumes this provider is not visible
* here; confirm there before changing the entry shape.
*/
public function provideStrings() {
return [
// no bytes, no code points
'empty string' => ["", []],
// a single ASCII byte: 0x40 -> U+0040
'sanity check' => ["40", [64]],
// 0x5C and 0x7E are expected to map to themselves (U+005C, U+007E)
'former ASCII deviations' => ["5C 7E", [92, 126]],
// single bytes 0xA1 and 0xDF -> U+FF61 and U+FF9F
'JIS X 0201 range' => ["A1 DF", [65377, 65439]],
// two-byte sequences -> U+E000 and U+E757
'EUDC range' => ["F040 F9FC", [57344, 59223]],
// two-byte sequences -> U+3000, U+6BEF, U+00F7, U+6ECC
'JIS X 0208 assigned range' => ["8140 9F7E 8180 9FFC", [12288, 27631, 247, 28364]],
];
}

2
tests/lib/DecoderTest.php

@ -54,7 +54,7 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
}
$this->assertSame(sizeof($exp), $pos);
while ($s->posChar()) {
$this->assertSame(0, $s->seek(-1));
$this->assertSame(0, $s->seek(-1), "Error stepping back to position ".($pos - 1));
$this->assertSame(--$pos, $s->posChar());
$act[] = $s->nextCode();
$s->seek(-1);

20
tools/test-shiftjis.html

@ -0,0 +1,20 @@
<!DOCTYPE html>
<meta charset=shift_jis>
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data -->
<!--
  Input definitions for generating Shift_JIS test data in a browser.
  The inline script only declares three globals; test.js, loaded afterwards and
  not shown here, is presumably what reads them (confirm in test.js).
-->
<script>
// Label => string of space-separated hexadecimal byte groups to be interpreted as Shift_JIS.
var sampleStrings = {
'empty string': "",
// sanity checks
'sanity check': "40",
'former ASCII deviations': "5C 7E",
'JIS X 0201 range': "A1 DF",
'EUDC range': "F040 F9FC",
// JIS X 0208
'JIS X 0208 assigned range': "8140 9F7E 8180 9FFC",
};
// Intentionally empty: no sample characters are defined in this file.
var sampleCharacters = {
};
// Intentionally empty: no seek code points are defined in this file.
var seekCodePoints = [
];
</script>
<script src="test.js"></script>
Loading…
Cancel
Save