Browse Source

Fix gb18030 seeking; tidy up

multi-byte
J. King 4 years ago
parent
commit
f7246ccc34
  1. 88
      lib/Encoding/GBCommon.php
  2. 1
      lib/Encoding/SingleByteEncoding.php
  3. 2
      lib/Encoding/XUserDefined.php
  4. 8
      tests/cases/Encoding/TestUTF16LE.php
  5. 8
      tests/cases/Encoding/TestXUserDefined.php

88
lib/Encoding/GBCommon.php

@@ -146,49 +146,61 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding {
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence
// the first byte was a character
$this->posByte += 1;
if ($b1 > 0x80) { // only GBK characters end in high bytes
// the preceding byte starts the character
$this->posByte--;
continue;
} elseif ($b1 < 0x40 && $this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string
// the first byte was a character
$this->posByte += 1;
} elseif ($b1 < 0x30 || $this->errMark === $this->posByte || $this->posByte === 0) { // the byte is unambiguously a single-byte character
// the byte is a character
continue;
} elseif ($b1 > 0x39) { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
$start = $this->posByte + 2;
// go back bytes until a definite trail byte or start of string
while ($this->posByte > 0) {
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81 || $b2 == 0xFF) {
$this->posByte++;
break;
} elseif ($b1 >= 0x30 && $b1 <= 0x39) { // this can either be the last byte of a four-byte gb18030 character or an ASCII character
if ($this->posByte < 3) { // there are not enough bytes left for this to be a four-byte sequence
// the byte is a character
continue;
} elseif ($this->errMark > ($this->posByte - 3)) { // there was an error in what would otherwise be the four-byte sequence
// the byte is a character
continue;
}
// go back a second byte
$b2 = ord(@$this->string[$this->posByte - 1]);
if ($b2 > 0x80) {
// go back a third byte
$b3 = ord(@$this->string[$this->posByte - 2]);
if ($b3 >= 0x30 && $b3 <= 0x39) {
// the next byte starts the character
$this->posByte -= 3;
continue;
}
}
// if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte
$this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2);
continue;
}
// go back a third byte
$b3 = ord(@$this->string[--$this->posByte]);
if ($b3 > 0x39 || $b3 < 0x30) { // these bytes never appear in the second position of a four-byte sequence
// the first byte was a character
$this->posByte += 2;
// if the byte pattern doesn't match the first byte is a character
continue;
}
// go back a fourth byte
$b4 = ord(@$this->string[--$this->posByte]);
if (($b4 < 0x81 || $b4 == 0xFF)) { // these bytes never appear first in a four-byte sequence
// the first byte was a character
$this->posByte += 3;
continue;
} else {
// this is a four-byte character
} else { // this can either be the trail of a two-byte GBK character, or a single-byte character
// go back a second byte
$b2 = ord(@$this->string[--$this->posByte]);
if ($b2 < 0x81) { // these bytes never appear in the lead of a sequence
// the first byte was a character
$this->posByte += 1;
continue;
} else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
$start = $this->posByte + 2;
$pos = $this->posByte;
// go back bytes until an error mark, an ASCII byte, or start of string
while ($pos > 0 && $pos > $this->errMark) {
$b = ord(@$this->string[--$pos]);
if ($b < 0x81) {
$pos++;
break;
}
}
if (($start - $pos) % 2) { // the number of bytes is odd
// the first byte was a character
$this->posByte += 1;
continue;
} else { // the number of bytes is even
// the second byte was a character
continue;
}
}
}
}
return $distance;

1
lib/Encoding/SingleByteEncoding.php

@@ -70,6 +70,7 @@ abstract class SingleByteEncoding extends AbstractEncoding implements StatelessE
}
}
/** @codeCoverageIgnore */
protected function seekBack(int $distance): int {
// stub: not used
return 0;

2
lib/Encoding/XUserDefined.php

@@ -77,6 +77,8 @@ class XUserDefined extends AbstractEncoding implements Encoding {
}
}
/** @codeCoverageIgnore */
protected function seekBack(int $distance): int {
// stub: not used
return 0;

8
tests/cases/Encoding/TestUTF16LE.php

@@ -126,6 +126,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
/**
* Re-declares the inherited testSeekBackOverRandomData() test for this test class.
*
* The method delegates straight to the parent implementation and returns
* whatever the parent returns; it adds no assertions of its own.
* NOTE(review): the override presumably exists only so the coverage
* annotation below can be attached to the inherited test -- confirm.
*
* @covers MensBeam\Intl\Encoding\UTF16::seekBack
*/
public function testSeekBackOverRandomData() {
// defer entirely to the shared test in the parent class
return parent::testSeekBackOverRandomData();
}
public function provideStrings() {
return [
// control samples

8
tests/cases/Encoding/TestXUserDefined.php

@@ -117,6 +117,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
/**
* Re-declares the inherited testSeekBackOverRandomData() test for this test class.
*
* The method delegates straight to the parent implementation and returns
* whatever the parent returns; it adds no assertions of its own.
* The annotation below tells PHPUnit not to credit this test with covering any code.
* NOTE(review): presumably this is because the decoder's seekBack() is an
* unused stub excluded from coverage -- confirm against the decoder class.
*
* @coversNothing
*/
public function testSeekBackOverRandomData() {
// defer entirely to the shared test in the parent class
return parent::testSeekBackOverRandomData();
}
public function provideStrings() {
$a_bytes = [];
$a_codes = [];

Loading…
Cancel
Save