Browse Source

Fix bugs in gb18030 and UTF-16

- UTF-16 needs to restore dirtyEOF after seeking
- gb18030 now tracks errors like other non-synchronizing encodings
- gb18030 could produce null when asked for a character
multi-byte
J. King 4 years ago
parent
commit
0eb2a8ac24
  1. 23
      lib/Encoding/GBCommon.php
  2. 5
      lib/Encoding/UTF16.php

23
lib/Encoding/GBCommon.php

@ -34,14 +34,18 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding {
$second = $b;
continue;
} else {
$codePoint = null;
if (($b > 0x3A && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) {
$offset = ($b < 0x7F) ? 0x40 : 0x41;
$pointer = ($first - 0x81) * 190 + ($b - $offset);
return self::TABLE_GBK[$pointer];
$codePoint = self::TABLE_GBK[$pointer] ?? null;
}
if (!is_null($codePoint)) {
return $codePoint;
} elseif ($b < 0x80) {
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte);
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte - 1);
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
}
}
} elseif ($third === 0) {
@ -69,7 +73,7 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding {
if (isset($codePointOffset)) {
return $codePointOffset + $pointer - $offset;
} else {
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 4);
}
} else {
$this->posByte -= 3;
@ -83,9 +87,8 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding {
$this->posChar--;
return false;
} else {
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
// dirty EOF
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - ($third ? 3 : ($second ? 2 : 1)));
}
}
@ -135,6 +138,12 @@ abstract class GBCommon extends AbstractEncoding implements StatelessEncoding {
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;
if ($this->posByte === $this->errMark) { // the previous character was malformed
// move to the correct sync position, pop the error stack, and continue
$this->posByte = $this->errSync;
list($this->errMark, $this->errSync) = array_pop($this->errStack);
continue;
}
// go back one byte
$b1 = ord(@$this->string[--$this->posByte]);
if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) { // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence

5
lib/Encoding/UTF16.php

@ -10,6 +10,11 @@ abstract class UTF16 extends AbstractEncoding {
protected $selfSynchronizing = true;
protected $dirtyEOF = 0;
/**
 * Adds "dirtyEOF" to the $stateProps list, then delegates to the parent constructor.
 *
 * NOTE(review): presumably $stateProps names the properties that the parent class
 * saves and restores around seeking, so that dirtyEOF is restored after a seek;
 * confirm against AbstractEncoding.
 *
 * @param string $string Forwarded unchanged to the parent constructor
 * @param bool $fatal Forwarded unchanged to the parent constructor
 * @param bool $allowSurrogates Forwarded unchanged to the parent constructor
 */
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
// the property name is appended before the parent constructor is invoked
$this->stateProps[] = "dirtyEOF";
parent::__construct($string, $fatal, $allowSurrogates);
}
public function nextCode() {
$lead_b = null;
$lead_s = null;

Loading…
Cancel
Save