Compare commits
93 Commits
91 changed files with 7579 additions and 2555 deletions
File diff suppressed because one or more lines are too long
@ -0,0 +1,250 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
abstract class AbstractEncoding implements Decoder {
    /** Error mode in which errDec() records nothing and returns null; seek() selects it while seeking backwards */
    protected const MODE_NULL = 0;
    /** Error mode in which errDec() returns U+FFFD for an invalid sequence */
    protected const MODE_REPLACE = 1;
    /** Error mode in which errDec() throws a DecoderException for an invalid sequence */
    protected const MODE_FATAL = 2;

    /** Every byte value from 0x80 to 0xFF; asciiSpanNot() appends these to its mask so that a span always stops at the first non-ASCII byte */
    protected const HIGH_BYTES = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

    /** @var string $string The string being decoded */
    protected $string;
    /** @var int $posByte The current byte position in the string */
    protected $posByte = 0;
    /** @var int $posChar The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int|null $lenByte The length of the string, in bytes; set by the constructor */
    protected $lenByte = null;
    /** @var int|null $lenChar The length of the string in characters, if known; computed lazily by lenChar() */
    protected $lenChar = null;
    /** @var array $errStack A list of error data to aid in backwards seeking; the most recent error is kept off the stack */
    protected $errStack = [];
    /** @var int $errMark The byte position marking the most recent error. The one or more bytes previous to this position constitute an invalid character */
    protected $errMark = -1;
    /** @var int $errSync The byte position to which to move to skip over the most recent erroneous character */
    protected $errSync = -2;
    /** @var int $errMode The selected error mode (fatal or replace) */
    protected $errMode = self::MODE_REPLACE;
    /** @var bool $allowSurrogates Whether surrogates in encodings other than UTF-16 should be passed through */
    protected $allowSurrogates = false;
    /** @var bool $selfSynchronizing Whether the concrete class represents a self-synchronizing decoder. Such decoders do not use the error stack */
    protected $selfSynchronizing = false;
    /** @var string[] $stateProps The list of properties which constitute state which must be saved when peeking/seeking; some encodings may add to this list for their own purposes */
    protected $stateProps = ["posChar", "posByte", "posErr"];

    /** @var int $posErr The value of $posChar recorded by errDec() when it last reported a decoding error */
    public $posErr = 0;

    /** Seeks backwards through the string the specified number of characters.
     *
     * If the beginning of the string is reached before the requested number
     * of characters has been skipped over, the number of remaining characters
     * is returned.
     *
     * @param int $distance The number of characters to move back; seek() always passes a positive number
     */
    abstract protected function seekBack(int $distance): int;

    /** Prepares a string for decoding
     *
     * @param string $string The byte string to decode
     * @param bool $fatal If true the fatal error mode is selected, otherwise the replacement mode
     * @param bool $allowSurrogates Stored in $allowSurrogates for use by concrete decoders
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE;
        $this->allowSurrogates = $allowSurrogates;
    }

    /** Returns the current byte position in the string */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position in the string */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Returns to the start of the string and discards all recorded error positions */
    public function rewind(): void {
        $this->posByte = 0;
        $this->posChar = 0;
        $this->errMark = -1;
        $this->errSync = -2;
        $this->errStack = [];
    }

    /** Consumes the next character and returns it as a UTF-8 string, or an empty string at the end of input
     *
     * ASCII bytes are returned directly; anything else is decoded with
     * nextCode() (which this class does not define itself) and re-encoded
     * as UTF-8
     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } elseif (ord($b) < 0x80) {
            // if the byte is an ASCII character, simply return it
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Moves forward (positive $distance) or backward (negative $distance) by the given number of characters
     *
     * @param int $distance The number of characters to skip; zero does nothing
     * @return int The number of characters which could not be skipped because an end of the string was reached; never negative
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            // the end-of-input test must come first so that a failed read is not counted as a skipped character
            do {
                $p = $this->nextCode();
            } while ($p !== false && --$distance);
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            if (!$this->posChar) {
                // already at the start: nothing can be skipped
                return $distance;
            }
            // errors met while moving backwards must neither be recorded nor reported
            $mode = $this->errMode;
            $this->errMode = self::MODE_NULL;
            $out = $this->seekBack($distance);
            $this->errMode = $mode;
            return $out;
        } else {
            return 0;
        }
    }

    /** Returns up to $num upcoming characters as a UTF-8 string without changing the decoder's state
     *
     * The state is restored even if decoding throws
     *
     * @param int $num The maximum number of characters to return
     */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns up to $num upcoming code points as a list of integers without changing the decoder's state
     *
     * The state is restored even if decoding throws
     *
     * @param int $num The maximum number of code points to return
     */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

    /** Returns the length of the string in characters
     *
     * The length is found on first use by decoding from the current position
     * to the end of the string, and is cached afterwards. The decoder's state
     * is restored when counting finishes, including when decoding throws
     * (as it may in fatal mode); in that case nothing is cached
     */
    public function lenChar(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                // mirror peekChar()/peekCode(): never leave the decoder stranded mid-string
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Returns whether the byte position has reached the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

    /** Generates the remaining characters as UTF-8 strings, keyed by character position */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates the remaining code points as integers, keyed by character position */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Consumes and returns the run of bytes at the current position which all appear in $mask
     *
     * Bytes 0x80 through 0xFF are removed from the mask first, so only ASCII
     * bytes can ever be matched; the byte and character positions therefore
     * advance by the same amount
     *
     * @param string $mask The set of bytes which may appear in the span
     * @param int|null $length If not null, passed to strspn() as the maximum number of bytes to examine
     */
    public function asciiSpan(string $mask, ?int $length = null): string {
        $mask = preg_replace('/[\x80-\xFF]/s', "", $mask);
        if ($length !== null) {
            $len = strspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strspn($this->string, $mask, $this->posByte);
        }
        if (!$len) {
            return "";
        }
        $out = substr($this->string, $this->posByte, $len);
        $this->posByte += $len;
        $this->posChar += $len;
        return $out;
    }

    /** Consumes and returns the run of ASCII bytes at the current position which do not appear in $mask
     *
     * All bytes from 0x80 through 0xFF are added to the mask, so the span
     * always ends at the first non-ASCII byte; the byte and character
     * positions therefore advance by the same amount
     *
     * @param string $mask The set of bytes which end the span
     * @param int|null $length If not null, passed to strcspn() as the maximum number of bytes to examine
     */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        $mask .= self::HIGH_BYTES;
        if ($length !== null) {
            $len = strcspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strcspn($this->string, $mask, $this->posByte);
        }
        if (!$len) {
            return "";
        }
        $out = substr($this->string, $this->posByte, $len);
        $this->posByte += $len;
        $this->posChar += $len;
        return $out;
    }

    /** Returns a copy of the decoder's state to keep in memory
     *
     * The snapshot holds every property named in $stateProps plus the
     * current depth of the error stack under the key 'errCount'
     */
    protected function stateSave(): array {
        $out = ['errCount' => count($this->errStack)];
        foreach ($this->stateProps as $prop) {
            $out[$prop] = $this->$prop;
        }
        return $out;
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state A snapshot previously produced by stateSave()
     */
    protected function stateApply(array $state): void {
        // unwind any errors recorded since the snapshot was taken
        while (count($this->errStack) > $state['errCount']) {
            [$this->errMark, $this->errSync] = array_pop($this->errStack);
        }
        unset($state['errCount']);
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding errors
     *
     * @param int $mode One of the MODE_* constants
     * @param int $charOffset The character offset to report in the exception message
     * @param int $byteOffset The byte offset to report in the exception message; also stored in $errSync unless the decoder is self-synchronizing
     * @return int|null U+FFFD in replacement mode, or null in null mode
     * @throws DecoderException in fatal mode
     */
    protected function errDec(int $mode, int $charOffset, int $byteOffset): ?int {
        if ($mode === self::MODE_NULL) {
            return null;
        }
        // expose the error to the user; this disambiguates a literal replacement character
        $this->posErr = $this->posChar;
        // unless the decoder is self-synchronizing, mark the error so that it can be skipped when seeking back
        if (!$this->selfSynchronizing) {
            $this->errStack[] = [$this->errMark, $this->errSync];
            $this->errMark = $this->posByte;
            $this->errSync = $byteOffset;
        }
        if ($mode === self::MODE_FATAL) {
            throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE);
        }
        return 0xFFFD;
    }

    /** Handles encoding errors
     *
     * @param bool $htmlMode If true a numeric HTML character reference built from $data is returned; otherwise an exception is thrown
     * @param mixed $data The code point which could not be encoded
     * @throws EncoderException when $htmlMode is false
     */
    protected static function errEnc(bool $htmlMode, $data = null): string {
        if ($htmlMode) {
            return "&#".(string) $data.";";
        } else {
            // fatal replacement mode for encoders; not applicable to Unicode transformation formats
            throw new EncoderException("Code point $data not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
        }
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,20 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface Coder {
    /** Exception code; presumably signals a code point outside the Unicode range (not used in the visible code) */
    public const E_INVALID_CODE_POINT = 1;
    /** Exception code for a code point which is not available in the target encoding */
    public const E_UNAVAILABLE_CODE_POINT = 3;
    /** Exception code for an encoding label which has no encoder */
    public const E_UNAVAILABLE_ENCODER = 4;

    /** Returns the encoding of $codePoint as a byte string
     *
     * @param int $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown
     * @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
     * @return string The bytes representing the code point
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,322 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding as Matcher; |
||||
|
|
||||
|
class Encoder {
    /** Maps the name of each stateless encoding to the class whose static encode() method implements it
     *
     * ISO-2022-JP is deliberately absent: its encoder takes an additional
     * mode argument and is handled separately wherever this table is used
     */
    private const ENCODER_CLASSES = [
        "UTF-8"          => UTF8::class,
        "Big5"           => Big5::class,
        "EUC-JP"         => EUCJP::class,
        "EUC-KR"         => EUCKR::class,
        "gb18030"        => GB18030::class,
        "GBK"            => GBK::class,
        "IBM866"         => IBM866::class,
        "ISO-8859-2"     => ISO88592::class,
        "ISO-8859-3"     => ISO88593::class,
        "ISO-8859-4"     => ISO88594::class,
        "ISO-8859-5"     => ISO88595::class,
        "ISO-8859-6"     => ISO88596::class,
        "ISO-8859-7"     => ISO88597::class,
        "ISO-8859-8"     => ISO88598::class,
        "ISO-8859-8-I"   => ISO88598I::class,
        "ISO-8859-10"    => ISO885910::class,
        "ISO-8859-13"    => ISO885913::class,
        "ISO-8859-14"    => ISO885914::class,
        "ISO-8859-15"    => ISO885915::class,
        "ISO-8859-16"    => ISO885916::class,
        "KOI8-R"         => KOI8R::class,
        "KOI8-U"         => KOI8U::class,
        "macintosh"      => Macintosh::class,
        "Shift_JIS"      => ShiftJIS::class,
        "windows-1250"   => Windows1250::class,
        "windows-1251"   => Windows1251::class,
        "windows-1252"   => Windows1252::class,
        "windows-1253"   => Windows1253::class,
        "windows-1254"   => Windows1254::class,
        "windows-1255"   => Windows1255::class,
        "windows-1256"   => Windows1256::class,
        "windows-1257"   => Windows1257::class,
        "windows-1258"   => Windows1258::class,
        "windows-874"    => Windows874::class,
        "x-mac-cyrillic" => XMacCyrillic::class,
        "x-user-defined" => XUserDefined::class,
    ];

    /** @var string The name of the encoding, as reported by the label matcher */
    protected $name;
    /** @var bool Whether unencodable code points throw (true) or are replaced by HTML character references (false) */
    protected $fatal = true;
    /** @var mixed The ISO-2022-JP encoder mode carried between encodeChar() calls and consumed by finalize() */
    protected $mode = null;

    /** Constructs a new encoder for the specified $label
     *
     * @param string $label One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS"
     * @param bool $fatal If true (the default) exceptions will be thrown when a character cannot be represented in the target encoding; if false HTML character references will be substituted instead
     * @throws EncoderException if the label is unknown or its encoding has no encoder
     *
     * @see https://encoding.spec.whatwg.org#names-and-labels
     */
    public function __construct(string $label, bool $fatal = true) {
        $l = Matcher::matchLabel($label);
        if (!$l || !$l['encoder']) {
            throw new EncoderException("Label '$label' does not have an encoder", Coder::E_UNAVAILABLE_ENCODER);
        }
        $this->name = $l['name'];
        $this->fatal = $fatal;
    }

    /** Encodes a series of code point numbers into a string
     *
     * The returned string is complete: for ISO-2022-JP the terminating bytes
     * are appended, and the mode tracked for encodeChar() is left untouched
     *
     * @param iterable $codePoints An iterable set of integers representing code points in the Unicode range
     */
    public function encode(iterable $codePoints): string {
        $out = "";
        if ($this->name === "ISO-2022-JP") {
            // the modal encoder threads its state through $mode, which starts out unset
            $mode = null;
            foreach ($codePoints as $codePoint) {
                $out .= ISO2022JP::encode($codePoint, $this->fatal, $mode);
            }
            // a null code point signals the end of the string
            $out .= ISO2022JP::encode(null, $this->fatal, $mode);
        } elseif (isset(self::ENCODER_CLASSES[$this->name])) {
            $class = self::ENCODER_CLASSES[$this->name];
            foreach ($codePoints as $codePoint) {
                $out .= $class::encode($codePoint, $this->fatal);
            }
        }
        return $out;
    }

    /** Encodes a single character into a string
     *
     * When using this method to encode a string, the finalize() method should be called to terminate the string
     *
     * @param int $codePoint An integer representing the Unicode code point number to encode
     */
    public function encodeChar(int $codePoint): string {
        if ($this->name === "ISO-2022-JP") {
            return ISO2022JP::encode($codePoint, $this->fatal, $this->mode);
        }
        if (isset(self::ENCODER_CLASSES[$this->name])) {
            $class = self::ENCODER_CLASSES[$this->name];
            return $class::encode($codePoint, $this->fatal);
        }
        // the constructor only accepts labels which have an encoder, so this should never be reached
        throw new EncoderException("Encoding '{$this->name}' does not have an encoder", Coder::E_UNAVAILABLE_ENCODER); // @codeCoverageIgnore
    }

    /** Finalizes a string, returning any terminal bytes to append to the output
     *
     * For the ISO-2022-JP encoding, this method must be called after the last character is encoded to correctly encode a string; for other encodings this is a no-op
     */
    public function finalize(): string {
        if ($this->name !== "ISO-2022-JP") {
            return "";
        }
        return ISO2022JP::encode(null, $this->fatal, $this->mode);
    }
}
File diff suppressed because one or more lines are too long
@ -1,176 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
trait GenericEncoding {
    /** @var string The string being decoded */
    protected $string;
    /** @var int The current byte position in the string */
    protected $posByte = 0;
    /** @var int The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int|null The length of the string in bytes; set by the constructor */
    protected $lenByte = null;
    /** @var int|null The length of the string in characters, if known; computed lazily by lenChar() */
    protected $lenChar = null;
    /** @var int The selected error mode */
    protected $errMode = self::MODE_REPLACE;
    /** @var bool The surrogate setting passed to the constructor */
    protected $allowSurrogates = false;

    /** @var int Saved and restored together with the positions when peeking; never written by this trait itself */
    public $posErr = 0;

    /** Prepares a string for decoding
     *
     * @param string $string The byte string to decode
     * @param bool $fatal If true the fatal decoder error mode is selected, otherwise the replacement mode
     * @param bool $allowSurrogates Stored in $allowSurrogates for use by the class using this trait
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
        $this->allowSurrogates = $allowSurrogates;
    }

    /** Returns the current byte position in the string */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position in the string */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Returns to the start of the string */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

    /** Consumes the next character and returns it as a UTF-8 string, or an empty string at the end of input
     *
     * ASCII bytes are returned directly; anything else is decoded with
     * nextCode() (which this trait does not define itself) and re-encoded
     * as UTF-8
     */
    public function nextChar(): string {
        // get the byte at the current position; reading past the end yields an empty string
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } elseif (ord($b) < 0x80) {
            // if the byte is an ASCII character, simply return it
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Moves forward (positive $distance) or backward (negative $distance) by the given number of characters
     *
     * @param int $distance The number of characters to skip; zero does nothing
     * @return int The number of characters which could not be skipped because an end of the string was reached
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            // test for end of input before decrementing, so that a failed
            // read is never counted as a skipped character
            do {
                $p = $this->nextCode();
            } while ($p !== false && --$distance);
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            if (!$this->posChar) {
                // already at the start: nothing can be skipped
                return $distance;
            }
            // errors met while moving backwards must not be reported
            $mode = $this->errMode;
            $this->errMode = self::MODE_NULL;
            $out = $this->seekBack($distance);
            $this->errMode = $mode;
            return $out;
        } else {
            return 0;
        }
    }

    /** Returns up to $num upcoming characters as a UTF-8 string without changing the decoder's state
     *
     * @param int $num The maximum number of characters to return
     */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns up to $num upcoming code points as a list of integers without changing the decoder's state
     *
     * @param int $num The maximum number of code points to return
     */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

    /** Returns the length of the string in characters
     *
     * The length is found on first use by decoding from the current position
     * to the end of the string, and is cached afterwards
     */
    public function lenChar(): int {
        return $this->lenChar ?? (function() {
            $state = $this->stateSave();
            while ($this->nextCode() !== false);
            $this->lenChar = $this->posChar;
            $this->stateApply($state);
            return $this->lenChar;
        })();
    }

    /** Returns whether the byte position has reached the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

    /** Generates the remaining characters as UTF-8 strings, keyed by character position */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates the remaining code points as integers, keyed by character position */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Returns a copy of the decoder's state to keep in memory */
    protected function stateSave(): array {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
            'posErr'  => $this->posErr,
        ];
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state A snapshot previously produced by stateSave()
     */
    protected function stateApply(array $state) {
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding and encoding errors
     *
     * @param int $mode One of the MODE_* constants of the class using this trait
     * @param mixed $data For MODE_FATAL_DEC a two-member array holding the character offset and byte offset; for MODE_HTML and MODE_FATAL_ENC the offending code point
     * @return int|string|null U+FFFD, an HTML character reference, or null, depending on the mode
     * @throws DecoderException in MODE_FATAL_DEC, or when $mode is not a known mode
     * @throws EncoderException in MODE_FATAL_ENC
     */
    protected static function err(int $mode, $data = null) {
        switch ($mode) {
            case self::MODE_NULL:
                // used internally during backward seeking for some encodings
                return null; // @codeCoverageIgnore
            case self::MODE_REPLACE:
                // standard "replace" mode
                return 0xFFFD;
            case self::MODE_HTML:
                // the "html" replacement mode; not applicable to Unicode transformation formats
                return "&#".(string) $data.";";
            case self::MODE_FATAL_DEC:
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC:
                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
            default:
                // indicative of internal bug; should never be triggered
                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
        }
    }
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,17 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface ModalCoder {
    /** Returns the encoding of $codePoint as a byte string
     *
     * @param int|null $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown; if $codePoint is null this signals end-of-file
     * @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
     * @param mixed &$mode A reference keeping track of the current encoder mode. An uninitialized variable should be passed on first invocation, and that variable used for further invocations.
     * @return string The bytes to append to the output
     */
    public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string;
}
@ -0,0 +1,133 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
class Replacement implements Decoder {
    public const NAME = "replacement";
    public const LABELS = [
        "csiso2022kr",
        "hz-gb-2312",
        "iso-2022-cn",
        "iso-2022-cn-ext",
        "iso-2022-kr",
        "replacement",
    ];

    /** @var int The length of the input string in bytes; the string itself is not retained */
    protected $len = 0;
    /** @var bool Whether the single character a non-empty input decodes to has been consumed */
    protected $done = false;
    /** @var bool Whether decoding throws instead of yielding U+FFFD */
    protected $fatal = false;

    /** @var int Set to 1 whenever nextChar() or nextCode() consumes the character */
    public $posErr = 0;

    /** Prepares a string for decoding
     *
     * Any non-empty input decodes to a single U+FFFD (or a single error in
     * fatal mode), so only the length of the input is kept
     *
     * @param string $string The byte string to decode
     * @param bool $fatal If true, decoding throws a DecoderException rather than yielding U+FFFD
     * @param bool $allowSurrogates Accepted but not used by this decoder
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->len = strlen($string);
        $this->fatal = $fatal;
    }

    /** Returns the byte position: the full length once the character has been consumed, zero before */
    public function posByte(): int {
        return $this->done ? $this->len : 0;
    }

    /** Returns the character position: 1 once the character has been consumed, zero before */
    public function posChar(): int {
        return $this->done ? 1 : 0;
    }

    /** Consumes the character and returns it as a UTF-8 string, or returns an empty string at the end of input
     *
     * The character counts as consumed even when fatal mode makes this throw
     *
     * @throws DecoderException in fatal mode
     */
    public function nextChar(): string {
        if (!$this->eof()) {
            try {
                return $this->peekChar();
            } finally {
                $this->done = true;
                $this->posErr = 1;
            }
        }
        return "";
    }

    /** Consumes the character and returns its code point, or returns false at the end of input
     *
     * The character counts as consumed even when fatal mode makes this throw
     *
     * @return int|false
     * @throws DecoderException in fatal mode
     */
    public function nextCode() {
        if (!$this->eof()) {
            try {
                return $this->peekCode()[0];
            } finally {
                $this->done = true;
                $this->posErr = 1;
            }
        }
        return false;
    }

    /** Moves forward (positive $distance) or backward (negative $distance) by the given number of characters
     *
     * @param int $distance The number of characters to skip; zero does nothing
     * @return int The number of characters which could not be skipped because an end of the string was reached; never negative
     * @throws DecoderException in fatal mode when moving forward over the character
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            if (!$this->eof()) {
                $this->nextCode();
                $distance--;
            }
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            // only a consumed character can be stepped back over; an empty
            // string reports end-of-file too, but has nothing to step over
            if ($this->done) {
                $this->rewind();
                $distance--;
            }
            return $distance;
        }
        return 0;
    }

    /** Returns to the start of the string */
    public function rewind(): void {
        $this->done = false;
    }

    /** Returns the upcoming character as a UTF-8 string without consuming it
     *
     * @param int $num The maximum number of characters to return; at most one is ever available
     * @throws DecoderException in fatal mode if a character is available and $num is positive
     */
    public function peekChar(int $num = 1): string {
        if ($this->eof() || $num <= 0) {
            return "";
        }
        if ($this->fatal) {
            throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE);
        }
        return "\u{FFFD}";
    }

    /** Returns the upcoming code point in a list without consuming it
     *
     * @param int $num The maximum number of code points to return; at most one is ever available
     * @throws DecoderException in fatal mode if a character is available and $num is positive
     */
    public function peekCode(int $num = 1): array {
        if ($this->eof() || $num <= 0) {
            return [];
        }
        if ($this->fatal) {
            throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE);
        }
        return [0xFFFD];
    }

    /** Returns the length of the input string in bytes */
    public function lenByte(): int {
        return $this->len;
    }

    /** Returns the length in characters: 1 for any non-empty input, otherwise 0 */
    public function lenChar(): int {
        return (int) ($this->len > 0);
    }

    /** Returns whether the character has been consumed or the input was empty */
    public function eof(): bool {
        return $this->done || $this->len === 0;
    }

    /** Generates the character, if it has not yet been consumed, as a UTF-8 string keyed by position 0 */
    public function chars(): \Generator {
        if (!$this->eof()) {
            yield 0 => $this->nextChar();
        }
    }

    /** Generates the code point, if it has not yet been consumed, keyed by position 0 */
    public function codes(): \Generator {
        if (!$this->eof()) {
            yield 0 => $this->nextCode();
        }
    }

    /** Always returns an empty string and consumes nothing */
    public function asciiSpan(string $mask, ?int $length = null): string {
        return "";
    }

    /** Always returns an empty string and consumes nothing */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        return "";
    }
}
File diff suppressed because one or more lines are too long
@ -1,18 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
interface StatefulEncoding extends Encoding {
    /** Returns the encoding of the code points in $codePoints as a byte string
     *
     * If any member of $codePoints is less than 0 or greater than 1114111, an exception is thrown
     *
     * If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
     *
     * @param array $codePoints The list of code points to encode
     * @param bool $fatal Whether unencodable code points raise an exception instead of being substituted
     */
    public static function encode(array $codePoints, bool $fatal = true): string;
}
|
@ -1,18 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
interface StatelessEncoding extends Encoding {
    /** Encodes a single code point and returns the resulting byte string
     *
     * An exception is thrown when $codePoint lies outside the range 0 to 1114111 inclusive
     *
     * When the code point has no representation in the encoding, the outcome depends on $fatal:
     * if true an exception is thrown, otherwise an HTML character reference is substituted
     *
     * @param int $codePoint The code point to encode
     * @param bool $fatal Whether an unencodable code point raises an exception instead of being substituted
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,279 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\ISO2022JP; |
||||
|
use MensBeam\Intl\Encoding\Coder; |
||||
|
use MensBeam\Intl\Encoding\EncoderException; |
||||
|
|
||||
|
/**
 * Test case for the ISO-2022-JP coder/decoder.
 *
 * Most tests delegate to the shared parent implementation; they are redeclared
 * here so that the data providers and coverage annotations specific to
 * ISO2022JP can be attached to them.
 */
class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
    protected $testedClass = ISO2022JP::class;
    /*
        Char 0 U+007A (1 byte) Offset 0
        Esc: Katakana (3 bytes) Offset 1
        Char 1 U+FF9C (1 byte) Offset 4
        Char 2 U+FF9F (1 byte) Offset 5
        Esc: Double-byte (3 bytes) Offset 6
        Char 3 U+79FB (2 bytes) Offset 9
        Char 4 U+67B8 (2 bytes) Offset 11
        Char 5 U+9B91 (2 bytes) Offset 13
        Esc: ASCII (3 bytes) Offset 15
        Char 6 U+007E (1 byte) Offset 18
        Esc: Roman (3 bytes) Offset 19
        End of string at char 7, offset 22
    */
    protected $seekString = "7A 1B2849 5C 5F 1B2440 305C 5B4E 723A 1B2842 7E 1B284A";
    protected $seekCodes = [0x7A, 0xFF9C, 0xFF9F, 0x79FB, 0x67B8, 0x9B91, 0x7E];
    protected $seekOffsets = [0, 1, 5, 6, 11, 13, 15, 19];
    /* This string contains an invalid character sequence sandwiched between two null characters */
    protected $brokenChar = "00 FF 00";
    /* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
    protected $spanString = "1B284A 41 5A 1B2849 5C 5F 1B2842 30 39";

    /** Supplies rows of [fatal flag, input code points, expected hex output or expected exception] */
    public function provideCodePoints() {
        return [
            'U+0020 (HTML)'             => [false, [0x20], "20"],
            'U+0020 (fatal)'            => [true,  [0x20], "20"],
            'U+005C (HTML)'             => [false, [0x5C], "5C"],
            'U+005C (fatal)'            => [true,  [0x5C], "5C"],
            'U+007E (HTML)'             => [false, [0x7E], "7E"],
            'U+007E (fatal)'            => [true,  [0x7E], "7E"],
            'U+00A5 (HTML)'             => [false, [0xA5], "1B 28 4A 5C 1B 28 42"],
            'U+00A5 (fatal)'            => [true,  [0xA5], "1B 28 4A 5C 1B 28 42"],
            'U+203E (HTML)'             => [false, [0x203E], "1B 28 4A 7E 1B 28 42"],
            'U+203E (fatal)'            => [true,  [0x203E], "1B 28 4A 7E 1B 28 42"],
            'U+FF61 (HTML)'             => [false, [0xFF61], "1B 24 42 21 23 1B 28 42"],
            'U+FF61 (fatal)'            => [true,  [0xFF61], "1B 24 42 21 23 1B 28 42"],
            'U+FF9F (HTML)'             => [false, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
            'U+FF9F (fatal)'            => [true,  [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
            'U+2212 (HTML)'             => [false, [0x2212], "1B 24 42 21 5D 1B 28 42"],
            'U+2212 (fatal)'            => [true,  [0x2212], "1B 24 42 21 5D 1B 28 42"],
            'U+2116 (HTML)'             => [false, [0x2116], "1B 24 42 2D 62 1B 28 42"],
            'U+2116 (fatal)'            => [true,  [0x2116], "1B 24 42 2D 62 1B 28 42"],
            'U+FFE2 (HTML)'             => [false, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
            'U+FFE2 (fatal)'            => [true,  [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
            'U+00C6 (HTML)'             => [false, [0xC6], "26 23 31 39 38 3B"],
            'U+00C6 (fatal)'            => [true,  [0xC6], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'U+FFFD (HTML)'             => [false, [0xFFFD], "26 23 36 35 35 33 33 3B"],
            'U+FFFD (fatal)'            => [true,  [0xFFFD], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'Roman (HTML)'              => [false, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
            'Roman (fatal)'             => [true,  [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
            'Roman to ASCII (HTML)'     => [false, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
            'Roman to ASCII (fatal)'    => [true,  [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
            'Roman to error (HTML)'     => [false, [0xA5, 0x80], "1B 28 4A 5C 26 23 31 32 38 3B 1B 28 42"],
            'Roman to error (fatal)'    => [true,  [0xA5, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'JIS (HTML)'                => [false, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
            'JIS (fatal)'               => [true,  [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
            'JIS to Roman (HTML)'       => [false, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
            'JIS to Roman (fatal)'      => [true,  [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
            'JIS to ASCII 1 (HTML)'     => [false, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
            'JIS to ASCII 1 (fatal)'    => [true,  [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
            'JIS to ASCII 2 (HTML)'     => [false, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
            'JIS to ASCII 2 (fatal)'    => [true,  [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
            'JIS to error 1 (HTML)'     => [false, [0x2116, 0x80], "1B 24 42 2D 62 1B 28 42 26 23 31 32 38 3B"],
            'JIS to error 1 (fatal)'    => [true,  [0x2116, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'JIS to error 2 (HTML)'     => [false, [0x2116, 0x1B], "1B 24 42 2D 62 1B 28 42 26 23 36 35 35 33 33 3B"],
            'JIS to error 2 (fatal)'    => [true,  [0x2116, 0x1B], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'Escape characters (HTML)'  => [false, [0x1B, 0xE, 0xF], "26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B"],
            'Escape characters (fatal)' => [true,  [0x1B, 0xE, 0xF], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            '-1 (HTML)'                 => [false, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '-1 (fatal)'                => [true,  [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (HTML)'           => [false, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (fatal)'          => [true,  [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
        ];
    }

    /** Supplies rows of [hex-encoded input bytes, expected decoded code points] */
    public function provideStrings() {
        return [
            'empty string'        => ["", []],
            'Implied ASCII mode'  => ["00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
            'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
            'Roman mode'          => ["1B284A 00 30 5C 7E 21 5F", [0, 48, 165, 8254, 33, 95]],
            'Katakana mode'       => ["1B2849 00 30 5C 7E 21 5F", [65533, 65392, 65436, 65533, 65377, 65439]],
            'Double-byte mode 1'  => ["1B2440 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
            'Double-byte mode 2'  => ["1B2442 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
            'Multiple modes'      => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [92, 65377, 31227, 165, 92]],
            'Double escape'       => ["1B2849 1B2842 5C", [65533, 92]],
            'Triple escape'       => ["1B2849 1B2842 1B284A 5C", [65533, 65533, 165]],
            'Trailing escape'     => ["20 1B284A 30 33 1B2849", [32, 48, 51]],
            'Truncated escape 1'  => ["1B", [65533]],
            'Truncated escape 2'  => ["1B28", [65533, 40]],
            'Truncated escape 3'  => ["1B2820", [65533, 40, 32]],
            'Truncated escape 4'  => ["1B2020", [65533, 32, 32]],
            'Invalid escape 1'    => ["1B2840", [65533, 40, 64]],
            'Invalid escape 2'    => ["1B244A", [65533, 36, 74]],
            'Invalid bytes'       => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533]],
        ];
    }

    /**
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\Encoder
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        return parent::testEncodeCodePoints($fatal, $input, $exp);
    }

    /**
     * Encodes each code point through the static encoder, threading the mode
     * through $mode, then passes null as the code point to obtain any closing output.
     *
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\ISO2022JP::encode
     */
    public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
        $out = "";
        if ($exp instanceof \Throwable) {
            $this->expectException(get_class($exp));
            $this->expectExceptionCode($exp->getCode());
        } else {
            // normalize the spaced, upper-case hex of the provider to bin2hex() form
            $exp = strtolower(str_replace(" ", "", $exp));
        }
        foreach ($input as $char) {
            $out .= ISO2022JP::encode($char, $fatal, $mode);
        }
        $out .= ISO2022JP::encode(null, $fatal, $mode);
        $this->assertSame($exp, bin2hex($out));
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
     * @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode
     * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
     * @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seek
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::eof
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::errDec
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
     * @covers MensBeam\Intl\Encoding\ISO2022JP::chars
     * @covers MensBeam\Intl\Encoding\ISO2022JP::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @coversNothing
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpan
     */
    public function testExtractAsciiSpans() {
        parent::testExtractAsciiSpans();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        parent::testExtractNegativeAsciiSpans();
    }

    /**
     * Decodes each listed byte sequence on its own and checks that it yields
     * exactly one expected code point.
     *
     * Each series entry is a pair of parallel lists: hex inputs at index 0 and
     * expected code points at index 1. The series is currently empty, so no
     * assertions are executed.
     *
     * @group optional
     */
    public function testPedanticallyDecodeSingleCharactersAsCodePoint() {
        $series = [
        ];
        foreach ($series as $test) {
            foreach ($test[0] as $a => $input) {
                $class = $this->testedClass;
                $char = hex2bin($input);
                $exp = $test[1][$a];
                $s = new $class($char);
                $this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp.");
                $this->assertFalse($s->nextCode(), "Sequence $input did not end after one character");
            }
        }
    }
}
@ -0,0 +1,221 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\Replacement; |
||||
|
use MensBeam\Intl\Encoding\DecoderException; |
||||
|
|
||||
|
/**
 * Test case for the replacement decoder.
 *
 * Tests which the shared parent implementation can handle are delegated to it;
 * the remainder are written out here against short literal inputs.
 */
class TestReplacement extends \MensBeam\Intl\Test\DecoderTest {
    protected $testedClass = Replacement::class;

    /** Supplies rows of [hex-encoded input bytes, expected decoded code points] */
    public function provideStrings() {
        return [
            // control samples
            'empty string'       => ["", []],
            'Arbitrary string 1' => ["20", [0xFFFD]],
            'Arbitrary string 2' => ["64 8B 20 00 FF A5", [0xFFFD]],
        ];
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Overrides the parent test with a trivially passing assertion; the parent
     * implementation is not invoked for this decoder.
     *
     * @coversNothing
     */
    public function testSeekThroughAString() {
        $this->assertTrue(true);
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     * @covers MensBeam\Intl\Encoding\Replacement::eof
     */
    public function testTraversePastTheEndOfAString() {
        $d = new Replacement("a");
        $this->assertFalse($d->eof());
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $d->seek(1);
        $this->assertTrue($d->eof());
        $this->assertSame(1, $d->posChar());
        $this->assertSame(1, $d->posByte());
        // seeking again at the end of the string must not move the position
        $d->seek(1);
        $this->assertTrue($d->eof());
        $this->assertSame(1, $d->posChar());
        $this->assertSame(1, $d->posByte());
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCharacters() {
        $d = new Replacement("A");
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // however many characters are requested, only one is available
        $this->assertSame("\u{FFFD}", $d->peekChar(2112));
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $this->assertSame("", $d->peekChar(0));
        $this->assertSame("", $d->peekChar(-2112));
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCodePoints() {
        $d = new Replacement("A");
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // however many code points are requested, only one is available
        $this->assertSame([0xFFFD], $d->peekCode(2112));
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $this->assertSame([], $d->peekCode(0));
        $this->assertSame([], $d->peekCode(-2112));
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::lenChar
     * @covers MensBeam\Intl\Encoding\Replacement::lenByte
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Checks that a decoder constructed in fatal mode throws from each of the
     * peek and next methods, and that only the next methods advance the position.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testReplacementModes() {
        $d = new Replacement("VVVVVV", true);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // peeking throws but leaves position and error marker untouched
        try {
            $p = $d->peekCode();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(0, $d->posErr);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // advancing throws and still consumes the whole input
        try {
            $p = $d->nextCode();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(1, $d->posChar());
        $this->assertSame(6, $d->posByte());
        $d->rewind();
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // the error marker is expected to survive the rewind
        try {
            $p = $d->peekChar();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        try {
            $p = $d->nextChar();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(1, $d->posChar());
        $this->assertSame(6, $d->posByte());
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::chars
     * @covers MensBeam\Intl\Encoding\Replacement::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * @coversNothing
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpan
     */
    public function testExtractAsciiSpans() {
        $d = new Replacement("VVVVVV");
        $this->assertSame("", $d->asciiSpan($this->allBytes()));
        $d->nextChar();
        $this->assertTrue($d->eof());
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        $d = new Replacement("VVVVVV");
        $this->assertSame("", $d->asciiSpanNot(""));
        $d->nextChar();
        $this->assertTrue($d->eof());
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,92 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding; |
||||
|
use MensBeam\Intl\Encoding\Encoder; |
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
use MensBeam\Intl\Encoding\UTF8; |
||||
|
|
||||
|
/**
 * Test case for the Encoding front end: label matching and the creation of
 * decoders and encoders from labels.
 */
class TestEncoding extends \PHPUnit\Framework\TestCase {
    /**
     * Each label must match as given, in upper case, and with surrounding whitespace.
     *
     * @dataProvider provideLabelData
     */
    public function testMatchALabelToAnEncoding(string $label, array $exp) {
        $this->assertSame($exp, Encoding::matchLabel($label));
        $this->assertSame($exp, Encoding::matchLabel(strtoupper($label)));
        $this->assertSame($exp, Encoding::matchLabel(" $label\n\n\r\t"));
    }

    public function testFailToMatchALabelToAnEncoding() {
        $this->assertNull(Encoding::matchLabel("Not a label"));
    }

    /** @dataProvider provideLabelData */
    public function testCreateADecoderFromALabel(string $label, array $data) {
        $this->assertInstanceOf($data['class'], Encoding::createDecoder($label, ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(strtoupper($label), ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(" $label\n\n\r\t", ""));
    }

    /** @dataProvider provideBOMSniffings */
    public function testCreateADecoderWhileSniffingBOM(string $label, string $string, string $class) {
        $this->assertInstanceOf($class, Encoding::createDecoder($label, $string));
    }

    public function testFailToCreateADecoderFromALabel() {
        $this->assertNull(Encoding::createDecoder("Not a label", ""));
    }

    /**
     * Encodings flagged as having an encoder must yield one; the others must yield null.
     *
     * @dataProvider provideLabelData
     */
    public function testCreateAnEncoderFromALabel(string $label, array $data) {
        if ($data['encoder']) {
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder($label));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(strtoupper($label)));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(" $label\n\n\r\t"));
        } else {
            $this->assertNull(Encoding::createEncoder($label));
            $this->assertNull(Encoding::createEncoder(strtoupper($label)));
            $this->assertNull(Encoding::createEncoder(" $label\n\n\r\t"));
        }
    }

    public function testFailToCreateAnEncoderFromALabel() {
        $this->assertNull(Encoding::createEncoder("Not a label"));
    }

    /**
     * Builds the label data set by reflecting on every instantiable Decoder
     * implementation found in the library's Encoding directory.
     *
     * Yields rows of [label, ['label' => ..., 'name' => ..., 'class' => ..., 'encoder' => ...]]
     */
    public function provideLabelData() {
        $ns = "MensBeam\\Intl\\Encoding\\";
        $labels = [];
        $names = [];
        foreach (new \GlobIterator(\MensBeam\Intl\BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
            $file = basename($file, ".php");
            $className = $ns.$file;
            $class = new \ReflectionClass($className);
            if ($class->implementsInterface(\MensBeam\Intl\Encoding\Decoder::class) && $class->isInstantiable()) {
                $name = $class->getConstant("NAME");
                $names[$name] = $className;
                foreach ($class->getConstant("LABELS") as $label) {
                    $labels[$label] = $name;
                }
            }
        }
        foreach ($labels as $label => $name) {
            $class = $names[$name];
            // these encodings are decode-only; compare strictly so no type juggling can interfere
            $encoder = !in_array($name, ["UTF-16LE", "UTF-16BE", "replacement"], true);
            // array keys which look like integers are converted by PHP, hence the string casts
            yield [(string) $label, ['label' => (string) $label, 'name' => $name, 'class' => $class, 'encoder' => $encoder]];
        }
    }

    /** Supplies rows of [requested label, input string, expected decoder class] */
    public function provideBOMSniffings() {
        return [
            'No BOM'       => ["UTF-8", "Hello world!", UTF8::class],
            'UTF-8 BOM'    => ["Shift_JIS", "\xEF\xBB\xBFA", UTF8::class],
            'UTF-16BE BOM' => ["UTF-8", "\xFE\xFF\x00A", UTF16BE::class],
            'UTF-16LE BOM' => ["UTF-8", "\xFF\xFEA\x00", UTF16LE::class],
            'GB18030 BOM'  => ["UTF-8", "\x84\x31\x95\x33A", UTF8::class],
        ];
    }
}
@ -0,0 +1,40 @@ |
|||||
|
<?php |
||||
|
// this script reads the names and labels from each concrete |
||||
|
// class in the Encoding set and generates tables mapping labels |
||||
|
// to names and names to classes |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\Decoder; |
||||
|
|
||||
|
// The project root, terminated by a directory separator
define("BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";

$ns = "\\MensBeam\\Intl\\Encoding\\";
$labels = [];
$names = [];
// Inspect every class file in the Encoding directory, keeping only
// concrete classes which implement the Decoder interface
$pattern = BASE."/lib/Encoding/*.php";
foreach (new \GlobIterator($pattern, \FilesystemIterator::CURRENT_AS_PATHNAME) as $path) {
    $className = $ns.basename($path, ".php");
    $class = new \ReflectionClass($className);
    if (!$class->implementsInterface(Decoder::class) || !$class->isInstantiable()) {
        continue;
    }
    $name = $class->getConstant("NAME");
    $names[$name] = $className;
    foreach ($class->getConstant("LABELS") as $label) {
        $labels[$label] = $name;
    }
}

// Render the label-to-name table as a PHP constant declaration
$labelEntries = [];
foreach ($labels as $label => $encodingName) {
    $labelEntries[] = "'".$label."'=>\"".$encodingName."\"";
}
$labelList = "const LABEL_MAP = [".implode(",", $labelEntries)."];";

// Render the name-to-class table as a PHP constant declaration
$nameEntries = [];
foreach ($names as $encodingName => $fullClassName) {
    $nameEntries[] = "'".$encodingName."'=>".$fullClassName."::class";
}
$nameList = "const NAME_MAP = [".implode(",", $nameEntries)."];";

echo $labelList."\n";
echo $nameList."\n";
@ -0,0 +1,57 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=euc-jp> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// sanity checks |
||||
|
'sanity check': "40", |
||||
|
'former ASCII deviations': "5C 7E", |
||||
|
'changed multibyte index': "A1DD", |
||||
|
// JIS X 0201 |
||||
|
'JIS X 0201 range': "8EA1 8EDF", |
||||
|
'JIS X 0201 bogus range': "8EA0 8EE0", |
||||
|
'JIS X 0201 truncated character 1': "8E", |
||||
|
'JIS X 0201 truncated character 2': "8E 20", |
||||
|
'JIS X 0201 truncated character 3': "8E FF", |
||||
|
// JIS X 0212 |
||||
|
'JIS X 0212 assigned range': "8FA2AF 8FEDE3", |
||||
|
'JIS X 0212 total range': "8FA1A1 8FFEFE", |
||||
|
'JIS X 0212 bogus range 1': "8FA0A1 8FFFFE", |
||||
|
'JIS X 0212 bogus range 2': "8FA1A0 8FFEFF", |
||||
|
'JIS X 0212 truncated character 1': "8FA2", |
||||
|
'JIS X 0212 truncated character 2': "8FA2 20", |
||||
|
'JIS X 0212 truncated character 3': "8FA2 FF", |
||||
|
// JIS X 0208 |
||||
|
'JIS X 0208 assigned range': "A1A1 FCFE", |
||||
|
'JIS X 0208 total range': "A1A1 FEFE", |
||||
|
'JIS X 0208 bogus range': "A1A0 A0FE", |
||||
|
'JIS X 0208 truncated character 1': "A1", |
||||
|
'JIS X 0208 truncated character 2': "A1 20", |
||||
|
'JIS X 0208 truncated character 3': "A1 FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00A5': 0xA5, |
||||
|
'U+203E': 0x203E, |
||||
|
'U+3088': 0x3088, |
||||
|
'U+FF96': 0xFF96, |
||||
|
'U+2212': 0x2212, |
||||
|
'U+00E6': 0xE6, |
||||
|
'U+FFE2': 0xFFE2, |
||||
|
'U+2116': 0x2116, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0xFF96, |
||||
|
0x3088, |
||||
|
0xFF0D, |
||||
|
0x005C, |
||||
|
0xFF9B, |
||||
|
/* This code point is not encodable and must be manually entered as 8FB0EF */ |
||||
|
0x4F58, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,46 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=iso-2022-jp> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
'Implied ASCII mode': "00 30 5C 7E 21 5F", |
||||
|
'Explicit ASCII mode': "1B2842 00 30 5C 7E 21 5F", |
||||
|
'Roman mode': "1B284A 00 30 5C 7E 21 5F", |
||||
|
'Katakana mode': "1B2849 00 30 5C 7E 21 5F", |
||||
|
'Double-byte mode 1': "1B2440 00 30 5C 7E 21 5F", |
||||
|
'Double-byte mode 2': "1B2442 00 30 5C 7E 21 5F", |
||||
|
'Multiple modes': "5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", |
||||
|
'Double escape': "1B2849 1B2842 5C", |
||||
|
'Triple escape': "1B2849 1B2842 1B284A 5C", |
||||
|
'Trailing escape': "20 1B284A 30 33 1B2849", |
||||
|
'Invalid bytes': "80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0020': [0x20], |
||||
|
'U+005C': [0x5C], |
||||
|
'U+007E': [0x7E], |
||||
|
'U+00A5': [0xA5], |
||||
|
'U+203E': [0x203E], |
||||
|
'U+FF61': [0xFF61], |
||||
|
'U+FF9F': [0xFF9F], |
||||
|
'U+2212': [0x2212], |
||||
|
'U+2116': [0x2116], |
||||
|
'U+FFE2': [0xFFE2], |
||||
|
'U+00C6': [0xC6], |
||||
|
'U+FFFD': [0xFFFD], |
||||
|
'Roman': [0xA5, 0x20, 0x203E], |
||||
|
'Roman to ASCII': [0xA5, 0x5C], |
||||
|
'Roman to error': [0xA5, 0x80], |
||||
|
'JIS': [0x2116, 0xFFE2, 0x2212], |
||||
|
'JIS to Roman': [0x2116, 0xA5], |
||||
|
'JIS to ASCII 1': [0x2116, 0x20], |
||||
|
'JIS to ASCII 2': [0x2116, 0x5C], |
||||
|
'JIS to error 1': [0x2116, 0x80], |
||||
|
'JIS to error 2': [0x2116, 0x1B], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
||||
|
'Escape characters': [0x1B, 0xE, 0xF], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
||||
|
'-1': [-1], |
||||
|
'0x110000': [0x110000], |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,42 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=shift_jis> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
'sanity check': "40", |
||||
|
'former ASCII deviations': "5C 7E", |
||||
|
'JIS X 0201 range': "A1 DF", |
||||
|
'EUDC range': "F040 F9FC", |
||||
|
'JIS X 0208 assigned range': "8140 FC4B", |
||||
|
'JIS X 0208 total range': "8140 FCFC", |
||||
|
'JIS X 0208 truncated character 1': "81", |
||||
|
'JIS X 0208 truncated character 2': "81 20", |
||||
|
'JIS X 0208 truncated character 3': "81 FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00A5': 0xA5, |
||||
|
'U+203E': 0x203E, |
||||
|
'U+3088': 0x3088, |
||||
|
'U+FF96': 0xFF96, |
||||
|
'U+2212': 0x2212, |
||||
|
'U+00E6': 0xE6, |
||||
|
'U+FFE2': 0xFFE2, |
||||
|
'U+2116': 0x2116, |
||||
|
'U+E000': 0xE000, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0xFF96, |
||||
|
0x3088, |
||||
|
0xFF0D, |
||||
|
0x005C, |
||||
|
0xFF9B, |
||||
|
/* This code point is not encodable and must be manually entered as F040 */ |
||||
|
0xE000, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||||
{ |
{ |
||||
"require": { |
"require": { |
||||
"phpunit/phpunit": "^8.5" |
"phpunit/phpunit": "^8.5 | ^9.0" |
||||
} |
} |
||||
} |
} |
||||
|
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||||
{ |
{ |
||||
"require": { |
"require": { |
||||
"consolidation/robo": "^1.1" |
"consolidation/robo": "^4.0" |
||||
} |
} |
||||
} |
} |
||||
|
File diff suppressed because it is too large
Loading…
Reference in new issue