diff --git a/lib/Encoding/GBCommon.php b/lib/Encoding/GBCommon.php index 76391c6..0228509 100644 --- a/lib/Encoding/GBCommon.php +++ b/lib/Encoding/GBCommon.php @@ -146,49 +146,61 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding { } // go back one byte $b1 = ord(@$this->string[--$this->posByte]); - if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence - // the byte is a character - continue; - } - // go back a second byte - $b2 = ord(@$this->string[--$this->posByte]); - if ($b2 < 0x81 || $b2 == 0xFF) { // these bytes never appear second-to-last in a sequence - // the first byte was a character - $this->posByte += 1; + if ($b1 > 0x80) { // only GBK characters end in high bytes + // the preceding byte starts the character + $this->posByte--; continue; - } elseif ($b1 < 0x40 && $this->posByte < 2) { // byte values indicate a four-byte character, but there are insufficient bytes in the string - // the first byte was a character - $this->posByte += 1; + } elseif ($b1 < 0x30 || $this->errMark === $this->posByte || $this->posByte === 0) { // the byte is unambiguously a single-byte character + // the byte is a character continue; - } elseif ($b1 > 0x39) { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte - $start = $this->posByte + 2; - // go back bytes until a definite trail byte or start of string - while ($this->posByte > 0) { - $b2 = ord(@$this->string[--$this->posByte]); - if ($b2 < 0x81 || $b2 == 0xFF) { - $this->posByte++; - break; + } elseif ($b1 >= 0x30 && $b1 <= 0x39) { // this can either be the last byte of a four-byte gb18030 character or an ASCII character + if ($this->posByte < 3) { // there are not enough bytes left for this to be a four-byte sequence + // the byte is a character + continue; + } elseif ($this->errMark > ($this->posByte - 3)) { // there was an error in what
would otherwise be the four-byte sequence + // the byte is a character + continue; + } + // go back a second byte + $b2 = ord(@$this->string[$this->posByte - 1]); + if ($b2 > 0x80) { + // go back a third byte + $b3 = ord(@$this->string[$this->posByte - 2]); + if ($b3 >= 0x30 && $b3 <= 0x39) { + // the byte before that starts the four-byte character + $this->posByte -= 3; + continue; } } - // if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte - $this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2); - continue; - } - // go back a third byte - $b3 = ord(@$this->string[--$this->posByte]); - if ($b3 > 0x39 || $b3 < 0x30) { // these bytes never appear in the second position of a four-byte sequence - // the first byte was a character - $this->posByte += 2; + // if the byte pattern doesn't match, the first byte is a character continue; - } - // go back a fourth byte - $b4 = ord(@$this->string[--$this->posByte]); - if (($b4 < 0x81 || $b4 == 0xFF)) { // these bytes never appear first in a four-byte sequence - // the first byte was a character - $this->posByte += 3; - continue; - } else { - // this is a four-byte character + } else { // this can either be the trail of a two-byte GBK character, or a single-byte character + // go back a second byte + $b2 = ord(@$this->string[--$this->posByte]); + if ($b2 < 0x81) { // these bytes never appear in the lead of a sequence + // the first byte was a character + $this->posByte += 1; + continue; + } else { // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte + $start = $this->posByte + 2; + $pos = $this->posByte; + // go back bytes until an error mark, a byte below 0x81, or start of string + while ($pos > 0 && $pos > $this->errMark) { + $b = ord(@$this->string[--$pos]); + if ($b < 0x81) { + $pos++; + break; + } + } + if (($start - $pos) % 2) { // the number of bytes is odd + // the first byte was a character + $this->posByte += 1; + continue;
+ } else { // the number of bytes is even + // the second byte starts a two-byte character + continue; + } + } } } return $distance; diff --git a/lib/Encoding/SingleByteEncoding.php b/lib/Encoding/SingleByteEncoding.php index 5629e20..7867ebf 100644 --- a/lib/Encoding/SingleByteEncoding.php +++ b/lib/Encoding/SingleByteEncoding.php @@ -70,6 +70,7 @@ abstract class SingleByteEncoding extends AbstractEncoding implements StatelessE } } + /** @codeCoverageIgnore */ protected function seekBack(int $distance): int { // stub: not used return 0; diff --git a/lib/Encoding/XUserDefined.php b/lib/Encoding/XUserDefined.php index 63c63db..8266a35 100644 --- a/lib/Encoding/XUserDefined.php +++ b/lib/Encoding/XUserDefined.php @@ -77,6 +77,8 @@ class XUserDefined extends AbstractEncoding implements Encoding { } } + + /** @codeCoverageIgnore */ protected function seekBack(int $distance): int { // stub: not used return 0; diff --git a/tests/cases/Encoding/TestUTF16LE.php b/tests/cases/Encoding/TestUTF16LE.php index 0d4f4ad..493d8a1 100644 --- a/tests/cases/Encoding/TestUTF16LE.php +++ b/tests/cases/Encoding/TestUTF16LE.php @@ -126,6 +126,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); } + + /** + * @covers MensBeam\Intl\Encoding\UTF16::seekBack + */ + public function testSeekBackOverRandomData() { + return parent::testSeekBackOverRandomData(); + } + public function provideStrings() { return [ // control samples diff --git a/tests/cases/Encoding/TestXUserDefined.php b/tests/cases/Encoding/TestXUserDefined.php index 42caeb5..80b0504 100644 --- a/tests/cases/Encoding/TestXUserDefined.php +++ b/tests/cases/Encoding/TestXUserDefined.php @@ -117,6 +117,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest { return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); } + + /** + * @coversNothing + */ + public function
testSeekBackOverRandomData() { + return parent::testSeekBackOverRandomData(); + } + public function provideStrings() { $a_bytes = []; $a_codes = [];