Browse Source

Split error handlers

multi-byte
J. King 4 years ago
parent
commit
7339176e3e
  1. 10
      lib/Encoding/Big5.php
  2. 10
      lib/Encoding/EUCKR.php
  3. 7
      lib/Encoding/Encoding.php
  4. 18
      lib/Encoding/GBCommon.php
  5. 33
      lib/Encoding/GenericEncoding.php
  6. 6
      lib/Encoding/SingleByteEncoding.php
  7. 6
      lib/Encoding/UTF16.php
  8. 4
      lib/Encoding/UTF8.php

10
lib/Encoding/Big5.php

@ -39,7 +39,7 @@ class Big5 implements StatelessEncoding {
if ($b < 0x80) {
return $b;
} elseif ($b == 0x80 || $b == 0xFF) {
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1);
} else {
$lead = $b;
continue;
@ -63,10 +63,10 @@ class Big5 implements StatelessEncoding {
} else {
if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2);
}
}
}
@ -80,7 +80,7 @@ class Big5 implements StatelessEncoding {
// dirty EOF
$this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
}
}
@ -97,7 +97,7 @@ class Big5 implements StatelessEncoding {
$offset = ($trail < 0x3F) ? 0x40 : 0x62;
return chr($lead).chr($trail + $offset);
} else {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
return self::errEnc(!$fatal, $codePoint);
}
}
}

10
lib/Encoding/EUCKR.php

@ -34,7 +34,7 @@ class EUCKR implements StatelessEncoding {
return $b;
} elseif ($b == 0x80 || $b == 0xFF) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1);
} else {
$lead = $b;
continue;
@ -50,10 +50,10 @@ class EUCKR implements StatelessEncoding {
} else {
if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2);
}
}
}
@ -67,7 +67,7 @@ class EUCKR implements StatelessEncoding {
// dirty EOF
$this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
}
}
@ -83,7 +83,7 @@ class EUCKR implements StatelessEncoding {
$trail = ($pointer % 190) + 0x41;
return chr($lead).chr($trail);
} else {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
return self::errEnc(!$fatal, $codePoint);
}
}
}

7
lib/Encoding/Encoding.php

@ -9,14 +9,11 @@ namespace MensBeam\Intl\Encoding;
interface Encoding {
const MODE_NULL = 0;
const MODE_REPLACE = 1;
const MODE_HTML = 2;
const MODE_FATAL_DEC = 3;
const MODE_FATAL_ENC = 4;
const MODE_FATAL = 2;
const E_INVALID_CODE_POINT = 1;
const E_INVALID_BYTE = 2;
const E_INVALID_MODE = 3;
const E_UNAVAILABLE_CODE_POINT = 4;
const E_UNAVAILABLE_CODE_POINT = 3;
/** Constructs a new decoder
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead

18
lib/Encoding/GBCommon.php

@ -30,7 +30,7 @@ abstract class GBCommon implements StatelessEncoding {
continue;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
} elseif ($second === 0) {
if ($b > 0x2F && $b < 0x3A) {
@ -43,10 +43,10 @@ abstract class GBCommon implements StatelessEncoding {
return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte);
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
}
} elseif ($third === 0) {
@ -56,7 +56,7 @@ abstract class GBCommon implements StatelessEncoding {
} else {
$this->posByte -= 2;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
} else {
if ($b > 0x2F && $b < 0x3A) {
@ -76,12 +76,12 @@ abstract class GBCommon implements StatelessEncoding {
return $codePointOffset + $pointer - $offset;
} else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
} else {
$this->posByte -= 3;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
}
}
}
@ -94,7 +94,7 @@ abstract class GBCommon implements StatelessEncoding {
// dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
}
}
@ -104,7 +104,7 @@ abstract class GBCommon implements StatelessEncoding {
} elseif ($codePoint < 128) {
return chr($codePoint);
} elseif ($codePoint == 0xE5E5) {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
return self::errEnc(!$fatal, $codePoint);
} elseif (static::GBK && $codePoint == 0x20AC) {
return "\x80";
} else {
@ -115,7 +115,7 @@ abstract class GBCommon implements StatelessEncoding {
$offset = ($trail < 0x3F) ? 0x40 : 0x41;
return chr($lead).chr($trail + $offset);
} elseif (static::GBK) {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
return self::errEnc(!$fatal, $codePoint);
} else {
if ($codePoint == 0xE7C7) {
$pointer = 7457;

33
lib/Encoding/GenericEncoding.php

@ -21,7 +21,7 @@ trait GenericEncoding {
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
$this->string = $string;
$this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
$this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates;
}
@ -158,27 +158,28 @@ trait GenericEncoding {
}
}
/** Handles decoding and encoding errors */
protected static function err(int $mode, $data = null) {
/** Handles decoding errors */
protected function errDec(int $mode, int $charOffset = -1, int $byteOffset = -1) {
assert(in_array($mode, [self::MODE_NULL, self::MODE_REPLACE, self::MODE_FATAL]), "Invalid error mode $mode");
assert($mode !== self::MODE_FATAL || ($charOffset > -1 && $byteOffset > -1), "Offsets for error reporting not supplied");
switch ($mode) {
case self::MODE_NULL:
// used internally during backward seeking for some encodings
return null; // @codeCoverageIgnore
case self::MODE_REPLACE:
// standard "replace" mode
return 0xFFFD;
case self::MODE_HTML:
// the "html" replacement mode; not applicable to Unicode transformation formats
return "&#".(string) $data.";";
case self::MODE_FATAL_DEC:
// fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC:
// fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
default:
// indicative of internal bug; should never be triggered
throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
case self::MODE_FATAL:
throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE);
}
}
/** Handles encoding errors
 *
 * In HTML mode the offending value is returned as a numeric character reference; otherwise an EncoderException is thrown
 *
 * @param bool $htmlMode Whether to return a numeric character reference (true) or throw an exception (false)
 * @param mixed $data The code point which could not be encoded; it is interpolated into the reference or the exception message
 * @throws EncoderException When $htmlMode is false
 */
protected static function errEnc(bool $htmlMode, $data = null) {
    if (!$htmlMode) {
        // fatal replacement mode for encoders; not applicable to Unicode transformation formats
        throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
    }
    // the "html" replacement mode: emit the value as a numeric character reference
    return "&#".(string) $data.";";
}
}

6
lib/Encoding/SingleByteEncoding.php

@ -21,7 +21,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
// if the byte is an ASCII character or end of input, simply return it
return $b;
} else {
return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode(static::err($this->errMode, [$this->posChar, $this->posChar]));
return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode($this->errDec($this->errMode, $this->posChar, $this->posChar));
}
}
@ -37,7 +37,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
// if the byte is an ASCII character or end of input, simply return it
return $p;
} else {
return static::TABLE_DEC_CODE[$p - 128] ?? static::err($this->errMode, [$this->posChar, $this->posChar]);
return static::TABLE_DEC_CODE[$p - 128] ?? $this->errDec($this->errMode, $this->posChar, $this->posChar);
}
}
@ -47,7 +47,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} elseif ($codePoint < 128) {
return chr($codePoint);
} else {
return static::TABLE_ENC[$codePoint] ?? static::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
return static::TABLE_ENC[$codePoint] ?? static::errEnc(!$fatal, $codePoint);
}
}

6
lib/Encoding/UTF16.php

@ -33,7 +33,7 @@ abstract class UTF16 implements Encoding {
return $lead_s;
} else {
$this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
}
} else {
if ($code >= 0xD800 && $code <= 0xDBFF) {
@ -43,7 +43,7 @@ abstract class UTF16 implements Encoding {
if ($this->allowSurrogates) {
return $code;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
}
} else {
return $code;
@ -60,7 +60,7 @@ abstract class UTF16 implements Encoding {
// dirty EOF; note how many bytes the last character had
// properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
$this->dirtyEOF = ($lead_s && $lead_b ? 3 : ($lead_s ? 2 : 1));
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
}
}

4
lib/Encoding/UTF8.php

@ -53,11 +53,11 @@ class UTF8 implements StatelessEncoding {
$point = $b & 0x7;
} else { // invalid byte
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]);
return $this->errDec($this->errMode, $this->posChar, $this->posByte);
}
} elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]);
return $this->errDec($this->errMode, $this->posChar, $this->posByte--);
} else {
$lower = 0x80;
$upper = 0xBF;

Loading…
Cancel
Save