Browse Source

Split error handlers

multi-byte
J. King 4 years ago
parent
commit
7339176e3e
  1. 10
      lib/Encoding/Big5.php
  2. 10
      lib/Encoding/EUCKR.php
  3. 7
      lib/Encoding/Encoding.php
  4. 18
      lib/Encoding/GBCommon.php
  5. 33
      lib/Encoding/GenericEncoding.php
  6. 6
      lib/Encoding/SingleByteEncoding.php
  7. 6
      lib/Encoding/UTF16.php
  8. 4
      lib/Encoding/UTF8.php

10
lib/Encoding/Big5.php

@ -39,7 +39,7 @@ class Big5 implements StatelessEncoding {
if ($b < 0x80) { if ($b < 0x80) {
return $b; return $b;
} elseif ($b == 0x80 || $b == 0xFF) { } elseif ($b == 0x80 || $b == 0xFF) {
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1);
} else { } else {
$lead = $b; $lead = $b;
continue; continue;
@ -63,10 +63,10 @@ class Big5 implements StatelessEncoding {
} else { } else {
if ($b < 0x80) { if ($b < 0x80) {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1);
} else { } else {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2);
} }
} }
} }
@ -80,7 +80,7 @@ class Big5 implements StatelessEncoding {
// dirty EOF // dirty EOF
$this->dirtyEOF = 1; $this->dirtyEOF = 1;
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
} }
} }
@ -97,7 +97,7 @@ class Big5 implements StatelessEncoding {
$offset = ($trail < 0x3F) ? 0x40 : 0x62; $offset = ($trail < 0x3F) ? 0x40 : 0x62;
return chr($lead).chr($trail + $offset); return chr($lead).chr($trail + $offset);
} else { } else {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); return self::errEnc(!$fatal, $codePoint);
} }
} }
} }

10
lib/Encoding/EUCKR.php

@ -34,7 +34,7 @@ class EUCKR implements StatelessEncoding {
return $b; return $b;
} elseif ($b == 0x80 || $b == 0xFF) { } elseif ($b == 0x80 || $b == 0xFF) {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 1);
} else { } else {
$lead = $b; $lead = $b;
continue; continue;
@ -50,10 +50,10 @@ class EUCKR implements StatelessEncoding {
} else { } else {
if ($b < 0x80) { if ($b < 0x80) {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar -1, --$this->posByte - 1);
} else { } else {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); return $this->errDec($this->errMode, $this->posChar -1, $this->posByte - 2);
} }
} }
} }
@ -67,7 +67,7 @@ class EUCKR implements StatelessEncoding {
// dirty EOF // dirty EOF
$this->dirtyEOF = 1; $this->dirtyEOF = 1;
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
} }
} }
@ -83,7 +83,7 @@ class EUCKR implements StatelessEncoding {
$trail = ($pointer % 190) + 0x41; $trail = ($pointer % 190) + 0x41;
return chr($lead).chr($trail); return chr($lead).chr($trail);
} else { } else {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); return self::errEnc(!$fatal, $codePoint);
} }
} }
} }

7
lib/Encoding/Encoding.php

@ -9,14 +9,11 @@ namespace MensBeam\Intl\Encoding;
interface Encoding { interface Encoding {
const MODE_NULL = 0; const MODE_NULL = 0;
const MODE_REPLACE = 1; const MODE_REPLACE = 1;
const MODE_HTML = 2; const MODE_FATAL = 2;
const MODE_FATAL_DEC = 3;
const MODE_FATAL_ENC = 4;
const E_INVALID_CODE_POINT = 1; const E_INVALID_CODE_POINT = 1;
const E_INVALID_BYTE = 2; const E_INVALID_BYTE = 2;
const E_INVALID_MODE = 3; const E_UNAVAILABLE_CODE_POINT = 3;
const E_UNAVAILABLE_CODE_POINT = 4;
/** Constructs a new decoder /** Constructs a new decoder
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead

18
lib/Encoding/GBCommon.php

@ -30,7 +30,7 @@ abstract class GBCommon implements StatelessEncoding {
continue; continue;
} else { } else {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
} }
} elseif ($second === 0) { } elseif ($second === 0) {
if ($b > 0x2F && $b < 0x3A) { if ($b > 0x2F && $b < 0x3A) {
@ -43,10 +43,10 @@ abstract class GBCommon implements StatelessEncoding {
return self::TABLE_GBK[$pointer]; return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) { } elseif ($b < 0x80) {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); return $this->errDec($this->errMode, $this->posChar - 1, --$this->posByte);
} else { } else {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
} }
} }
} elseif ($third === 0) { } elseif ($third === 0) {
@ -56,7 +56,7 @@ abstract class GBCommon implements StatelessEncoding {
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
} }
} else { } else {
if ($b > 0x2F && $b < 0x3A) { if ($b > 0x2F && $b < 0x3A) {
@ -76,12 +76,12 @@ abstract class GBCommon implements StatelessEncoding {
return $codePointOffset + $pointer - $offset; return $codePointOffset + $pointer - $offset;
} else { } else {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
} }
} else { } else {
$this->posByte -= 3; $this->posByte -= 3;
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 1);
} }
} }
} }
@ -94,7 +94,7 @@ abstract class GBCommon implements StatelessEncoding {
// dirty EOF; note how many bytes the last character had // dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
} }
} }
@ -104,7 +104,7 @@ abstract class GBCommon implements StatelessEncoding {
} elseif ($codePoint < 128) { } elseif ($codePoint < 128) {
return chr($codePoint); return chr($codePoint);
} elseif ($codePoint == 0xE5E5) { } elseif ($codePoint == 0xE5E5) {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); return self::errEnc(!$fatal, $codePoint);
} elseif (static::GBK && $codePoint == 0x20AC) { } elseif (static::GBK && $codePoint == 0x20AC) {
return "\x80"; return "\x80";
} else { } else {
@ -115,7 +115,7 @@ abstract class GBCommon implements StatelessEncoding {
$offset = ($trail < 0x3F) ? 0x40 : 0x41; $offset = ($trail < 0x3F) ? 0x40 : 0x41;
return chr($lead).chr($trail + $offset); return chr($lead).chr($trail + $offset);
} elseif (static::GBK) { } elseif (static::GBK) {
return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); return self::errEnc(!$fatal, $codePoint);
} else { } else {
if ($codePoint == 0xE7C7) { if ($codePoint == 0xE7C7) {
$pointer = 7457; $pointer = 7457;

33
lib/Encoding/GenericEncoding.php

@ -21,7 +21,7 @@ trait GenericEncoding {
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
$this->string = $string; $this->string = $string;
$this->lenByte = strlen($string); $this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; $this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates; $this->allowSurrogates = $allowSurrogates;
} }
@ -158,27 +158,28 @@ trait GenericEncoding {
} }
} }
/** Handles decoding and encoding errors */ /** Handles decoding errors */
protected static function err(int $mode, $data = null) { protected function errDec(int $mode, int $charOffset = -1, int $byteOffset = -1) {
assert(in_array($mode, [self::MODE_NULL, self::MODE_REPLACE, self::MODE_FATAL]), "Invalid error mode $mode");
assert($mode !== self::MODE_FATAL || ($charOffset > -1 && $byteOffset > -1), "Offsets for error reporting not supplied");
switch ($mode) { switch ($mode) {
case self::MODE_NULL: case self::MODE_NULL:
// used internally during backward seeking for some encodings // used internally during backward seeking for some encodings
return null; // @codeCoverageIgnore return null; // @codeCoverageIgnore
case self::MODE_REPLACE: case self::MODE_REPLACE:
// standard "replace" mode
return 0xFFFD; return 0xFFFD;
case self::MODE_HTML: case self::MODE_FATAL:
// the "html" replacement mode; not applicable to Unicode transformation formats throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE);
return "&#".(string) $data.";"; }
case self::MODE_FATAL_DEC: }
// fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); /** Handles encoding errors */
case self::MODE_FATAL_ENC: protected static function errEnc(bool $htmlMode, $data = null) {
// fatal replacement mode for encoders; not applicable to Unicode transformation formats if ($htmlMode) {
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); return "&#".(string) $data.";";
default: } else {
// indicative of internal bug; should never be triggered // fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
} }
} }
} }

6
lib/Encoding/SingleByteEncoding.php

@ -21,7 +21,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
// if the byte is an ASCII character or end of input, simply return it // if the byte is an ASCII character or end of input, simply return it
return $b; return $b;
} else { } else {
return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode(static::err($this->errMode, [$this->posChar, $this->posChar])); return static::TABLE_DEC_CHAR[$p - 128] ?? UTF8::encode($this->errDec($this->errMode, $this->posChar, $this->posChar));
} }
} }
@ -37,7 +37,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
// if the byte is an ASCII character or end of input, simply return it // if the byte is an ASCII character or end of input, simply return it
return $p; return $p;
} else { } else {
return static::TABLE_DEC_CODE[$p - 128] ?? static::err($this->errMode, [$this->posChar, $this->posChar]); return static::TABLE_DEC_CODE[$p - 128] ?? $this->errDec($this->errMode, $this->posChar, $this->posChar);
} }
} }
@ -47,7 +47,7 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} elseif ($codePoint < 128) { } elseif ($codePoint < 128) {
return chr($codePoint); return chr($codePoint);
} else { } else {
return static::TABLE_ENC[$codePoint] ?? static::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint); return static::TABLE_ENC[$codePoint] ?? static::errEnc(!$fatal, $codePoint);
} }
} }

6
lib/Encoding/UTF16.php

@ -33,7 +33,7 @@ abstract class UTF16 implements Encoding {
return $lead_s; return $lead_s;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
} }
} else { } else {
if ($code >= 0xD800 && $code <= 0xDBFF) { if ($code >= 0xD800 && $code <= 0xDBFF) {
@ -43,7 +43,7 @@ abstract class UTF16 implements Encoding {
if ($this->allowSurrogates) { if ($this->allowSurrogates) {
return $code; return $code;
} else { } else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
} }
} else { } else {
return $code; return $code;
@ -60,7 +60,7 @@ abstract class UTF16 implements Encoding {
// dirty EOF; note how many bytes the last character had // dirty EOF; note how many bytes the last character had
// properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
$this->dirtyEOF = ($lead_s && $lead_b ? 3 : ($lead_s ? 2 : 1)); $this->dirtyEOF = ($lead_s && $lead_b ? 3 : ($lead_s ? 2 : 1));
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
} }
} }

4
lib/Encoding/UTF8.php

@ -53,11 +53,11 @@ class UTF8 implements StatelessEncoding {
$point = $b & 0x7; $point = $b & 0x7;
} else { // invalid byte } else { // invalid byte
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]); return $this->errDec($this->errMode, $this->posChar, $this->posByte);
} }
} elseif ($b < $lower || $b > $upper) { } elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar; $this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]); return $this->errDec($this->errMode, $this->posChar, $this->posByte--);
} else { } else {
$lower = 0x80; $lower = 0x80;
$upper = 0xBF; $upper = 0xBF;

Loading…
Cancel
Save