From 0eb2a8ac245654f5013f027b2ca1e9d5431a47a8 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sun, 4 Oct 2020 13:41:26 -0400 Subject: [PATCH] Fix bugs in gb18030 and UTF-16 - UTF-16 needs to restore dirtyEOF after seeking - gb18030 now tracks errors like other non-synchronizing encodings - gb18030 could produce null when asked for a character --- lib/Encoding/GBCommon.php | 23 ++++++++++++++++------- lib/Encoding/UTF16.php | 5 +++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/lib/Encoding/GBCommon.php b/lib/Encoding/GBCommon.php index dad1558..76391c6 100644 --- a/lib/Encoding/GBCommon.php +++ b/lib/Encoding/GBCommon.php @@ -34,14 +34,18 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding { $second = $b; continue; } else { + $codePoint = null; if (($b > 0x3A && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) { $offset = ($b < 0x7F) ? 0x40 : 0x41; $pointer = ($first - 0x81) * 190 + ($b - $offset); - return self::TABLE_GBK[$pointer]; + $codePoint = self::TABLE_GBK[$pointer] ?? null; + } + if (!is_null($codePoint)) { + return $codePoint; } elseif ($b < 0x80) { - return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte); + return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1); } else { - return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2); } } } elseif ($third === 0) { @@ -69,7 +73,7 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding { if (isset($codePointOffset)) { return $codePointOffset + $pointer - $offset; } else { - return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 4); } } else { $this->posByte -= 3; @@ -83,9 +87,8 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding { $this->posChar--; return false; } else { - // dirty EOF; note how many bytes the last character had - $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); - return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF); + // dirty EOF + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - ($third ? 3 : ($second ? 2 : 1))); } } @@ -135,6 +138,12 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding { while ($distance > 0 && $this->posByte > 0) { $distance--; $this->posChar--; + if ($this->posByte === $this->errMark) { // the previous character was malformed + // move to the correct sync position, pop the error stack, and continue + $this->posByte = $this->errSync; + list($this->errMark, $this->errSync) = array_pop($this->errStack); + continue; + } // go back one byte $b1 = ord(@$this->string[--$this->posByte]); if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php index dc3a8f7..b01c756 100644 --- a/lib/Encoding/UTF16.php +++ b/lib/Encoding/UTF16.php @@ -10,6 +10,11 @@ abstract class UTF16 extends AbstractEncoding { protected $selfSynchronizing = true; protected $dirtyEOF = 0; + public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { + $this->stateProps[] = "dirtyEOF"; + parent::__construct($string, $fatal, $allowSurrogates); + } + public function nextCode() { $lead_b = null; $lead_s = null;