diff --git a/lib/Encoding/Big5.php b/lib/Encoding/Big5.php index 1ba5324..cb237a4 100644 --- a/lib/Encoding/Big5.php +++ b/lib/Encoding/Big5.php @@ -39,7 +39,7 @@ class Big5 implements StatelessEncoding { if ($b < 0x80) { return $b; } elseif ($b == 0x80 || $b == 0xFF) { - return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1); } else { $lead = $b; continue; @@ -63,10 +63,10 @@ class Big5 implements StatelessEncoding { } else { if ($b < 0x80) { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1); } else { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); + return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2); } } } @@ -80,7 +80,7 @@ class Big5 implements StatelessEncoding { // dirty EOF $this->dirtyEOF = 1; $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF); } } @@ -97,7 +97,7 @@ class Big5 implements StatelessEncoding { $offset = ($trail < 0x3F) ? 0x40 : 0x62; return chr($lead).chr($trail + $offset); } else { - return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + return self::errEnc(!$fatal, $codePoint); } } } diff --git a/lib/Encoding/EUCKR.php b/lib/Encoding/EUCKR.php index 9c8c1f3..2345f3b 100644 --- a/lib/Encoding/EUCKR.php +++ b/lib/Encoding/EUCKR.php @@ -34,7 +34,7 @@ class EUCKR implements StatelessEncoding { return $b; } elseif ($b == 0x80 || $b == 0xFF) { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1); } else { $lead = $b; continue; @@ -50,10 +50,10 @@ class EUCKR implements StatelessEncoding { } else { if ($b < 0x80) { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1); } else { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); + return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2); } } } @@ -67,7 +67,7 @@ class EUCKR implements StatelessEncoding { // dirty EOF $this->dirtyEOF = 1; $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF); } } @@ -83,7 +83,7 @@ class EUCKR implements StatelessEncoding { $trail = ($pointer % 190) + 0x41; return chr($lead).chr($trail); } else { - return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + return self::errEnc(!$fatal, $codePoint); } } } diff --git a/lib/Encoding/Encoding.php b/lib/Encoding/Encoding.php index d3e0f30..6e8a13c 100644 --- a/lib/Encoding/Encoding.php +++ b/lib/Encoding/Encoding.php @@ -9,14 +9,11 @@ namespace MensBeam\Intl\Encoding; interface Encoding { const MODE_NULL = 0; const MODE_REPLACE = 1; - const MODE_HTML = 2; - const MODE_FATAL_DEC = 3; - const MODE_FATAL_ENC = 4; + const MODE_FATAL = 2; const E_INVALID_CODE_POINT = 1; const E_INVALID_BYTE = 2; - const E_INVALID_MODE = 3; - const E_UNAVAILABLE_CODE_POINT = 4; + const E_UNAVAILABLE_CODE_POINT = 3; /** Constructs a new decoder * @param bool $fatal If true, throw enceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead diff --git a/lib/Encoding/GBCommon.php b/lib/Encoding/GBCommon.php index daeca9d..29a2d31 100644 --- a/lib/Encoding/GBCommon.php +++ b/lib/Encoding/GBCommon.php @@ -30,7 +30,7 @@ abstract class GBCommon implements StatelessEncoding { continue; } else { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } } elseif ($second === 0) { if ($b > 0x2F && $b < 0x3A) { @@ -43,10 +43,10 @@ abstract class GBCommon implements StatelessEncoding { return self::TABLE_GBK[$pointer]; } elseif ($b < 0x80) { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); + return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte); } else { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } } } elseif ($third === 0) { @@ -56,7 +56,7 @@ abstract class GBCommon implements StatelessEncoding { } else { $this->posByte -= 2; $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } } else { if ($b > 0x2F && $b < 0x3A) { @@ -76,12 +76,12 @@ abstract class GBCommon implements StatelessEncoding { return $codePointOffset + $pointer - $offset; } else { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } } else { $this->posByte -= 3; $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1); } } } @@ -94,7 +94,7 @@ abstract class GBCommon implements StatelessEncoding { // dirty EOF; note how many bytes the last character had $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF); } } @@ -104,7 +104,7 @@ abstract class GBCommon implements StatelessEncoding { } elseif ($codePoint < 128) { return chr($codePoint); } elseif ($codePoint == 0xE5E5) { - return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + return self::errEnc(!$fatal, $codePoint); } elseif (static::GBK && $codePoint == 0x20AC) { return "\x80"; } else { @@ -115,7 +115,7 @@ abstract class GBCommon implements StatelessEncoding { $offset = ($trail < 0x3F) ? 0x40 : 0x41; return chr($lead).chr($trail + $offset); } elseif (static::GBK) { - return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + return self::errEnc(!$fatal, $codePoint); } else { if ($codePoint == 0xE7C7) { $pointer = 7457; diff --git a/lib/Encoding/GenericEncoding.php b/lib/Encoding/GenericEncoding.php index 97d7be1..e6435d9 100644 --- a/lib/Encoding/GenericEncoding.php +++ b/lib/Encoding/GenericEncoding.php @@ -21,7 +21,7 @@ trait GenericEncoding { public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { $this->string = $string; $this->lenByte = strlen($string); - $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; + $this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE; $this->allowSurrogates = $allowSurrogates; } @@ -158,27 +158,28 @@ trait GenericEncoding { } } - /** Handles decoding and encoding errors */ - protected static function err(int $mode, $data = null) { + /** Handles decoding errors */ + protected function errDec(int $mode, int $charOffset = -1, int $byteOffset = -1) { + assert(in_array($mode, [self::MODE_NULL, self::MODE_REPLACE, self::MODE_FATAL]), "Invalid error mode $mode"); + assert($mode !== self::MODE_FATAL || ($charOffset > -1 && $byteOffset > -1), "Offsets for error reporting not supplied"); switch ($mode) { case self::MODE_NULL: // used internally during backward seeking for some encodings return null; // @codeCoverageIgnore case self::MODE_REPLACE: - // standard "replace" mode return 0xFFFD; - case self::MODE_HTML: - // the "html" replacement mode; not applicable to Unicode transformation formats - return "&#".(string) $data.";"; - case self::MODE_FATAL_DEC: - // fatal replacement mode for decoders - throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); - case self::MODE_FATAL_ENC: - // fatal replacement mode for encoders; not applicable to Unicode transformation formats - throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); - default: - // indicative of internal bug; should never be triggered - throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore + case self::MODE_FATAL: + throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE); + } + } + + /** Handles encoding errors */ + protected static function errEnc(bool $htmlMode, $data = null) { + if ($htmlMode) { + return "&#".(string) $data.";"; + } else { + // fatal replacement mode for encoders; not applicable to Unicode transformation formats + throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); } } } diff --git a/lib/Encoding/SingleByteEncoding.php b/lib/Encoding/SingleByteEncoding.php index 0b48684..19a868c 100644 --- a/lib/Encoding/SingleByteEncoding.php +++ b/lib/Encoding/SingleByteEncoding.php @@ -21,7 +21,7 @@ abstract class SingleByteEncoding implements StatelessEncoding { // if the byte is an ASCII character or end of input, simply return it return $b; } else { - return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode(static::err($this->errMode, [$this->posChar, $this->posChar])); + return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode($this->errDec($this->errMode, $this->posChar, $this->posChar)); } } @@ -37,7 +37,7 @@ abstract class SingleByteEncoding implements StatelessEncoding { // if the byte is an ASCII character or end of input, simply return it return $p; } else { - return static::TABLE_DEC_CODE[$p - 128] ?? static::err($this->errMode, [$this->posChar, $this->posChar]); + return static::TABLE_DEC_CODE[$p - 128] ?? $this->errDec($this->errMode, $this->posChar, $this->posChar); } } @@ -47,7 +47,7 @@ abstract class SingleByteEncoding implements StatelessEncoding { } elseif ($codePoint < 128) { return chr($codePoint); } else { - return static::TABLE_ENC[$codePoint] ?? static::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); + return static::TABLE_ENC[$codePoint] ?? static::errEnc(!$fatal, $codePoint); } } diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php index 94bebf9..7f1bad9 100644 --- a/lib/Encoding/UTF16.php +++ b/lib/Encoding/UTF16.php @@ -33,7 +33,7 @@ abstract class UTF16 implements Encoding { return $lead_s; } else { $this->posByte -= 2; - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2); } } else { if ($code >= 0xD800 && $code <= 0xDBFF) { @@ -43,7 +43,7 @@ abstract class UTF16 implements Encoding { if ($this->allowSurrogates) { return $code; } else { - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2); } } else { return $code; @@ -60,7 +60,7 @@ abstract class UTF16 implements Encoding { // dirty EOF; note how many bytes the last character had // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier $this->dirtyEOF = ($lead_s && $lead_b ? 3 : ($lead_s ? 2 : 1)); - return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); + return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF); } } diff --git a/lib/Encoding/UTF8.php b/lib/Encoding/UTF8.php index ea39156..c3cebb3 100644 --- a/lib/Encoding/UTF8.php +++ b/lib/Encoding/UTF8.php @@ -53,11 +53,11 @@ class UTF8 implements StatelessEncoding { $point = $b & 0x7; } else { // invalid byte $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar, $this->posByte]); + return $this->errDec($this->errMode, $this->posChar, $this->posByte); } } elseif ($b < $lower || $b > $upper) { $this->posErr = $this->posChar; - return self::err($this->errMode, [$this->posChar, $this->posByte--]); + return $this->errDec($this->errMode, $this->posChar, $this->posByte--); } else { $lower = 0x80; $upper = 0xBF;