Compare commits
94 Commits
91 changed files with 7674 additions and 2658 deletions
File diff suppressed because one or more lines are too long
@ -0,0 +1,250 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Shared implementation of the position tracking, peeking, seeking and error bookkeeping used by decoders.
 *
 * Concrete classes supply the encoding-specific parts: the code point reader
 * nextCode() called throughout this class, and seekBack().
 */
abstract class AbstractEncoding implements Decoder {
    /** Error mode in which errDec() does nothing and returns null; selected by seek() while seeking backwards */
    protected const MODE_NULL = 0;
    /** Error mode in which errDec() returns the replacement character U+FFFD */
    protected const MODE_REPLACE = 1;
    /** Error mode in which errDec() throws a DecoderException */
    protected const MODE_FATAL = 2;

    /** Every byte from 0x80 to 0xFF; appended to the mask in asciiSpanNot() so that a span always stops at a non-ASCII byte */
    protected const HIGH_BYTES = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

    /** @var string $string The string being decoded */
    protected $string;
    /** @var int $posByte The current byte position in the string */
    protected $posByte = 0;
    /** @var int $posChar The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int|null $lenByte The length of the string, in bytes; set by the constructor */
    protected $lenByte = null;
    /** @var int|null $lenChar The length of the string in characters, if known; computed lazily by lenChar() */
    protected $lenChar = null;
    /** @var array $errStack A list of error data to aid in backwards seeking; the most recent error is kept off the stack */
    protected $errStack = [];
    /** @var int $errMark The byte position marking the most recent error. The one or more bytes previous to this position constitute an invalid character */
    protected $errMark = -1;
    /** @var int $errSync The byte position to which to move to skip over the most recent erroneous character */
    protected $errSync = -2;
    /** @var int $errMode The selected error mode (fatal or replace) */
    protected $errMode = self::MODE_REPLACE;
    /** @var bool $allowSurrogates Whether surrogates in encodings other than UTF-16 should be passed through */
    protected $allowSurrogates = false;
    /** @var bool $selfSynchronizing Whether the concrete class represents a self-synchronizing decoder. Such decoders do not use the error stack */
    protected $selfSynchronizing = false;
    /** @var string[] $stateProps The list of properties which constitute state which must be saved when peeking/seeking; some encodings may add to this list for their own purposes */
    protected $stateProps = ["posChar", "posByte", "posErr"];

    /** @var int $posErr The character position recorded by errDec() for the most recent reported decoding error */
    public $posErr = 0;

    /** Seeks backwards through the string the specified number of characters.
     * If the beginning of the string is reached before the requested number
     * of characters has been skipped over, the number of remaining characters
     * is returned.
     */
    abstract protected function seekBack(int $distance): int;

    /** Constructs a new decoder
     *
     * @param string $string The string to decode
     * @param bool $fatal If true, errDec() throws on invalid input; otherwise it yields replacement characters
     * @param bool $allowSurrogates Stored in $allowSurrogates for use by concrete classes
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE;
        $this->allowSurrogates = $allowSurrogates;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Returns to the start of the string and discards all backward-seeking error bookkeeping */
    public function rewind(): void {
        $this->posByte = 0;
        $this->posChar = 0;
        $this->errMark = -1;
        $this->errSync = -2;
        $this->errStack = [];
    }

    /** Returns the next character as a UTF-8 string, or the empty string at end of input */
    public function nextChar(): string {
        // get the byte at the current position; reading past the end yields the empty string
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input
            return "";
        } elseif (ord($b) < 0x80) {
            // an ASCII byte is returned as-is, without going through nextCode()
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Moves $distance characters through the string; a negative distance seeks backwards
     *
     * @return int The number of requested characters which could not be skipped because the end (or start) of the string was reached
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            do {
                $p = $this->nextCode();
                // only count the step if a code point was actually consumed
            } while ($p !== false && --$distance);
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            if ($this->posChar === 0) {
                // already at the start: nothing can be skipped
                return $distance;
            }
            // errors encountered while seeking backwards must be neither reported nor recorded
            $mode = $this->errMode;
            $this->errMode = self::MODE_NULL;
            $out = $this->seekBack($distance);
            $this->errMode = $mode;
            return $out;
        } else {
            return 0;
        }
    }

    /** Returns up to the next $num characters as a UTF-8 string without moving the decoder */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            // restore the position even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns up to the next $num code points as a list of integers without moving the decoder */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            // restore the position even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

    /** Returns the length of the string in characters
     *
     * The first call decodes from the current position to the end of the
     * string; the result is cached. The decoder's position is restored
     * afterwards, including when decoding throws.
     */
    public function lenChar(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                // without this a decoding exception would leave the decoder stranded at the point of failure
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Returns whether the byte position is at or past the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

    /** Generates each remaining character as a UTF-8 string, keyed by its character position */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates each remaining code point, keyed by its character position */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Consumes and returns the run of bytes at the current position which all appear in $mask
     *
     * Bytes 0x80-0xFF are removed from the mask first, so only ASCII bytes are ever consumed.
     *
     * @param string $mask The set of bytes to accept
     * @param int|null $length The maximum number of bytes to examine, or null for no limit
     */
    public function asciiSpan(string $mask, ?int $length = null): string {
        $mask = preg_replace('/[\x80-\xFF]/s', "", $mask);
        if ($length !== null) {
            $len = strspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strspn($this->string, $mask, $this->posByte);
        }
        if ($len) {
            $out = substr($this->string, $this->posByte, $len);
            // every consumed byte is ASCII, so bytes and characters advance together
            $this->posByte += $len;
            $this->posChar += $len;
            return $out;
        } else {
            return "";
        }
    }

    /** Consumes and returns the run of ASCII bytes at the current position which do not appear in $mask
     *
     * All bytes 0x80-0xFF are added to the mask, so the run always stops at a non-ASCII byte.
     *
     * @param string $mask The set of bytes at which to stop
     * @param int|null $length The maximum number of bytes to examine, or null for no limit
     */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        $mask .= self::HIGH_BYTES;
        if ($length !== null) {
            $len = strcspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strcspn($this->string, $mask, $this->posByte);
        }
        if ($len) {
            $out = substr($this->string, $this->posByte, $len);
            // every consumed byte is ASCII, so bytes and characters advance together
            $this->posByte += $len;
            $this->posChar += $len;
            return $out;
        } else {
            return "";
        }
    }

    /** Returns a copy of the decoder's state to keep in memory
     *
     * The copy holds the value of each property named in $stateProps plus
     * the current depth of the error stack under the key 'errCount'.
     */
    protected function stateSave(): array {
        $out = ['errCount' => count($this->errStack)];
        foreach ($this->stateProps as $prop) {
            $out[$prop] = $this->$prop;
        }
        return $out;
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state A state previously returned by stateSave()
     */
    protected function stateApply(array $state): void {
        // unwind any errors recorded since the state was saved
        while (count($this->errStack) > $state['errCount']) {
            [$this->errMark, $this->errSync] = array_pop($this->errStack);
        }
        unset($state['errCount']);
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding errors
     *
     * @param int $mode The error mode to apply (one of the MODE_* constants)
     * @param int $charOffset The character offset reported in the exception message
     * @param int $byteOffset The byte offset reported in the exception message; also recorded as the new $errSync
     * @return int|null The replacement character U+FFFD, or null when $mode is MODE_NULL
     * @throws DecoderException When $mode is MODE_FATAL
     */
    protected function errDec(int $mode, int $charOffset, int $byteOffset): ?int {
        if ($mode !== self::MODE_NULL) {
            // expose the error to the user; this disambiguates a literal replacement character
            $this->posErr = $this->posChar;
            // unless the decoder is self-synchronizing, mark the error so that it can be skipped when seeking back
            if (!$this->selfSynchronizing) {
                $this->errStack[] = [$this->errMark, $this->errSync];
                $this->errMark = $this->posByte;
                $this->errSync = $byteOffset;
            }
            if ($mode === self::MODE_FATAL) {
                throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE);
            } else {
                return 0xFFFD;
            }
        }
        return null;
    }

    /** Handles encoding errors
     *
     * @param bool $htmlMode If true an HTML numeric character reference for $data is returned; otherwise an exception is thrown
     * @param mixed $data The code point which could not be encoded
     * @throws EncoderException When $htmlMode is false
     */
    protected static function errEnc(bool $htmlMode, $data = null): string {
        if ($htmlMode) {
            return "&#".(string) $data.";";
        } else {
            // fatal replacement mode for encoders; not applicable to Unicode transformation formats
            throw new EncoderException("Code point $data not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
        }
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,20 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Contract for encoders which turn one code point at a time into bytes, plus the error codes used by encoder exceptions */
interface Coder {
    /** Error code for an invalid code point; NOTE(review): presumably one outside the Unicode range — confirm against implementations */
    public const E_INVALID_CODE_POINT = 1;
    /** Error code used when a code point is not available in the target encoding */
    public const E_UNAVAILABLE_CODE_POINT = 3;
    /** Error code used when an encoding label does not have an encoder */
    public const E_UNAVAILABLE_ENCODER = 4;

    /** Returns the encoding of $codePoint as a byte string
     *
     * @param int $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown
     * @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
     * @return string The bytes which encode $codePoint
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,322 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
use MensBeam\Intl\Encoding as Matcher; |
|||
|
|||
/** Encodes Unicode code points into the byte representation of a named encoding */
class Encoder {
    /** @var array<string,string> Maps the name of each stateless encoding to the class whose static encode() method implements it
     *
     * ISO-2022-JP is deliberately absent: its encoder carries a mode between
     * calls and is handled separately.
     */
    protected const STATELESS = [
        "UTF-8"          => UTF8::class,
        "Big5"           => Big5::class,
        "EUC-JP"         => EUCJP::class,
        "EUC-KR"         => EUCKR::class,
        "gb18030"        => GB18030::class,
        "GBK"            => GBK::class,
        "IBM866"         => IBM866::class,
        "ISO-8859-2"     => ISO88592::class,
        "ISO-8859-3"     => ISO88593::class,
        "ISO-8859-4"     => ISO88594::class,
        "ISO-8859-5"     => ISO88595::class,
        "ISO-8859-6"     => ISO88596::class,
        "ISO-8859-7"     => ISO88597::class,
        "ISO-8859-8"     => ISO88598::class,
        "ISO-8859-8-I"   => ISO88598I::class,
        "ISO-8859-10"    => ISO885910::class,
        "ISO-8859-13"    => ISO885913::class,
        "ISO-8859-14"    => ISO885914::class,
        "ISO-8859-15"    => ISO885915::class,
        "ISO-8859-16"    => ISO885916::class,
        "KOI8-R"         => KOI8R::class,
        "KOI8-U"         => KOI8U::class,
        "macintosh"      => Macintosh::class,
        "Shift_JIS"      => ShiftJIS::class,
        "windows-1250"   => Windows1250::class,
        "windows-1251"   => Windows1251::class,
        "windows-1252"   => Windows1252::class,
        "windows-1253"   => Windows1253::class,
        "windows-1254"   => Windows1254::class,
        "windows-1255"   => Windows1255::class,
        "windows-1256"   => Windows1256::class,
        "windows-1257"   => Windows1257::class,
        "windows-1258"   => Windows1258::class,
        "windows-874"    => Windows874::class,
        "x-mac-cyrillic" => XMacCyrillic::class,
        "x-user-defined" => XUserDefined::class,
    ];

    /** @var string $name The name of the selected encoding, as reported by the label matcher */
    protected $name;
    /** @var bool $fatal Whether unencodable code points raise exceptions rather than being substituted */
    protected $fatal = true;
    /** @var mixed $mode The ISO-2022-JP encoder mode carried between encodeChar() calls and consumed by finalize() */
    protected $mode = null;

    /** Constructs a new encoder for the specified $label
     *
     * @param string $label One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS"
     * @param bool $fatal If true (the default) exceptions will be thrown when a character cannot be represented in the target encoding; if false HTML character references will be substituted instead
     * @throws EncoderException When the label is unknown or its encoding has no encoder
     *
     * @see https://encoding.spec.whatwg.org#names-and-labels
     */
    public function __construct(string $label, bool $fatal = true) {
        $l = Matcher::matchLabel($label);
        if (!$l || !$l['encoder']) {
            throw new EncoderException("Label '$label' does not have an encoder", Coder::E_UNAVAILABLE_ENCODER);
        } else {
            $this->name = $l['name'];
            $this->fatal = $fatal;
        }
    }

    /** Encodes a series of code point numbers into a string
     *
     * The output is a complete string: for ISO-2022-JP the terminal bytes
     * are appended automatically, and the mode used is local to this call,
     * independent of any encodeChar() sequence in progress.
     *
     * @param iterable $codePoints An iterable set of integers representing code points in the Unicode range
     */
    public function encode(iterable $codePoints): string {
        $out = "";
        if ($this->name === "ISO-2022-JP") {
            $mode = null;
            foreach ($codePoints as $codePoint) {
                $out .= ISO2022JP::encode($codePoint, $this->fatal, $mode);
            }
            // a null code point signals end-of-file
            $out .= ISO2022JP::encode(null, $this->fatal, $mode);
        } elseif (isset(self::STATELESS[$this->name])) {
            $class = self::STATELESS[$this->name];
            foreach ($codePoints as $codePoint) {
                $out .= $class::encode($codePoint, $this->fatal);
            }
        }
        return $out;
    }

    /** Encodes a single character into a string
     *
     * When using this method to encode a string, the finalize() method should be called to terminate the string
     *
     * @param int $codePoint An integer representing the Unicode code point number to encode
     */
    public function encodeChar(int $codePoint): string {
        if ($this->name === "ISO-2022-JP") {
            // the mode is kept on the instance so that it persists from one character to the next
            return ISO2022JP::encode($codePoint, $this->fatal, $this->mode);
        }
        $class = self::STATELESS[$this->name] ?? null;
        if ($class === null) {
            // the constructor only accepts names which have an encoder, so this is not expected to occur
            throw new EncoderException("Label '{$this->name}' does not have an encoder", Coder::E_UNAVAILABLE_ENCODER); // @codeCoverageIgnore
        }
        return $class::encode($codePoint, $this->fatal);
    }

    /** Finalizes a string, returning any terminal bytes to append to the output
     *
     * For the ISO-2022-JP encoding, this method must be called after the last character is encoded to correctly encode a string; for other encodings this is a no-op
     */
    public function finalize(): string {
        if ($this->name !== "ISO-2022-JP") {
            return "";
        }
        return ISO2022JP::encode(null, $this->fatal, $this->mode);
    }
}
File diff suppressed because one or more lines are too long
@ -1,201 +0,0 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Shared decoder implementation: position tracking, peeking, seeking and error handling
 *
 * The using class is expected to provide nextCode(), seekBack() and the
 * MODE_* and E_* constants referenced via self::.
 */
trait GenericEncoding {
    /** @var string $string The string being decoded */
    protected $string;
    /** @var int $posByte The current byte position in the string */
    protected $posByte = 0;
    /** @var int $posChar The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int|null $lenByte The length of the string in bytes; set by the constructor */
    protected $lenByte = null;
    /** @var int|null $lenChar The length of the string in characters, if known; computed lazily by lenChar() */
    protected $lenChar = null;
    /** @var int $errMode The selected error mode */
    protected $errMode = self::MODE_REPLACE;

    /** Constructs a new decoder
     *
     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
     */
    public function __construct(string $string, bool $fatal = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Seeks to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string {
        // get the byte at the current position; reading past the end yields the empty string without raising a diagnostic
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } elseif (ord($b) < 0x80) {
            // if the byte is an ASCII character, simply return it
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Advance $distance characters through the string
     *
     * If $distance is negative, the operation will be performed in reverse
     *
     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            do {
                $p = $this->nextCode();
                // test for end of input first so that a failed read is not counted as a skipped character
            } while ($p !== false && --$distance);
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            if (!$this->posChar) {
                return $distance;
            }
            // errors encountered while seeking backwards must not be reported
            $mode = $this->errMode;
            $this->errMode = self::MODE_NULL;
            $out = $this->seekBack($distance);
            $this->errMode = $mode;
            return $out;
        } else {
            return 0;
        }
    }

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Retrieves the next $num code points from the string, without advancing the character pointer */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Calculates the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

    /** Calculates the length of the string in code points
     *
     * Note that this may involve processing to the end of the string. The
     * decoder's position is restored afterwards, including when decoding throws.
     */
    public function lenChar(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Returns whether the character pointer is at the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

    /** Generates an iterator which steps through each character in the string */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates an iterator which steps through each code point in the string */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Returns a copy of the decoder's state to keep in memory */
    protected function stateSave(): array {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
        ];
    }

    /** Sets the decoder's state to the values specified */
    protected function stateApply(array $state) {
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding and encoding errors
     *
     * @param int $mode The error mode to apply (one of the MODE_* constants)
     * @param mixed $data For MODE_FATAL_DEC a two-member array of character offset and byte offset; for MODE_HTML and MODE_FATAL_ENC the code point concerned
     * @return int|string|null U+FFFD for MODE_REPLACE, an HTML character reference for MODE_HTML, or null for MODE_NULL
     * @throws DecoderException For MODE_FATAL_DEC, or an unrecognized mode
     * @throws EncoderException For MODE_FATAL_ENC
     */
    protected static function err(int $mode, $data = null) {
        switch ($mode) {
            case self::MODE_NULL:
                // used internally during backward seeking for some encodings
                return null; // @codeCoverageIgnore
            case self::MODE_REPLACE:
                // standard "replace" mode
                return 0xFFFD;
            case self::MODE_HTML:
                // the "html" replacement mode; not applicable to Unicode transformation formats
                return "&#".(string) $data.";";
            case self::MODE_FATAL_DEC:
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC:
                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
            default:
                // indicative of internal bug; should never be triggered
                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
        }
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,17 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Contract for encoders which must carry a mode from one encoded code point to the next */
interface ModalCoder {
    /** Returns the encoding of $codePoint as a byte string
     *
     * @param int $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown; if $codePoint is null this signals end-of-file
     * @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
     * @param mixed &$mode A reference keeping track of the current encoder mode. An uninitialized variable should be passed on first invocation, and that variable used for further invocations.
     * @return string The bytes to append to the output
     */
    public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string;
}
@ -0,0 +1,133 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
class Replacement implements Decoder { |
|||
public const NAME = "replacement"; |
|||
public const LABELS = [ |
|||
"csiso2022kr", |
|||
"hz-gb-2312", |
|||
"iso-2022-cn", |
|||
"iso-2022-cn-ext", |
|||
"iso-2022-kr", |
|||
"replacement", |
|||
]; |
|||
|
|||
protected $len = 0; |
|||
protected $done = false; |
|||
protected $fatal = false; |
|||
|
|||
public $posErr = 0; |
|||
|
|||
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { |
|||
$this->len = strlen($string); |
|||
$this->fatal = $fatal; |
|||
} |
|||
|
|||
public function posByte(): int { |
|||
return $this->done ? $this->len : 0; |
|||
} |
|||
|
|||
public function posChar(): int { |
|||
return $this->done ? 1 : 0; |
|||
} |
|||
|
|||
public function nextChar(): string { |
|||
if (!$this->eof()) { |
|||
try { |
|||
return $this->peekChar(); |
|||
} finally { |
|||
$this->done = true; |
|||
$this->posErr = 1; |
|||
} |
|||
} |
|||
return ""; |
|||
} |
|||
|
|||
public function nextCode() { |
|||
if (!$this->eof()) { |
|||
try { |
|||
return $this->peekCode()[0]; |
|||
} finally { |
|||
$this->done = true; |
|||
$this->posErr = 1; |
|||
} |
|||
} |
|||
return false; |
|||
} |
|||
|
|||
public function seek(int $distance): int { |
|||
if ($distance > 0) { |
|||
if (!$this->eof()) { |
|||
$distance--; |
|||
$this->nextCode(); |
|||
} |
|||
} elseif ($distance < 0) { |
|||
if ($this->eof()) { |
|||
$distance++; |
|||
$this->rewind(); |
|||
} |
|||
} |
|||
return $distance; |
|||
} |
|||
|
|||
public function rewind(): void { |
|||
$this->done = false; |
|||
} |
|||
|
|||
public function peekChar(int $num = 1): string { |
|||
if (!$this->eof() && $num > 0) { |
|||
if ($this->fatal) { |
|||
throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE); |
|||
} |
|||
return "\u{FFFD}"; |
|||
} |
|||
return ""; |
|||
} |
|||
|
|||
public function peekCode(int $num = 1): array { |
|||
if (!$this->eof() && $num > 0) { |
|||
if ($this->fatal) { |
|||
throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE); |
|||
} |
|||
return [0xFFFD]; |
|||
} |
|||
return []; |
|||
} |
|||
|
|||
public function lenByte(): int { |
|||
return $this->len; |
|||
} |
|||
|
|||
public function lenChar(): int { |
|||
return (int) ($this->len > 0); |
|||
} |
|||
|
|||
public function eof(): bool { |
|||
return $this->done || $this->len === 0; |
|||
} |
|||
|
|||
/**
 * Generates the remaining characters of the input
 *
 * At most one value is generated: the result of nextChar(), under the key 0
 */
public function chars(): \Generator {
    if ($this->eof()) {
        return;
    }
    yield 0 => $this->nextChar();
}
|||
|
|||
/**
 * Generates the remaining code points of the input
 *
 * At most one value is generated: the result of nextCode(), under the key 0
 */
public function codes(): \Generator {
    if ($this->eof()) {
        return;
    }
    yield 0 => $this->nextCode();
}
|||
|
|||
/**
 * Returns a span of ASCII characters matching a mask
 *
 * This implementation always returns an empty string and does not move the position; both arguments are ignored
 *
 * @param string $mask The set of characters to match (ignored)
 * @param int|null $length The maximum span length (ignored)
 */
public function asciiSpan(string $mask, ?int $length = null): string {
    // the nullable type is spelled out because implicit nullability via a null default is deprecated as of PHP 8.4
    return "";
}
|||
|
|||
/**
 * Returns a span of ASCII characters not matching a mask
 *
 * This implementation always returns an empty string and does not move the position; both arguments are ignored
 *
 * @param string $mask The set of characters to exclude (ignored)
 * @param int|null $length The maximum span length (ignored)
 */
public function asciiSpanNot(string $mask, ?int $length = null): string {
    // the nullable type is spelled out because implicit nullability via a null default is deprecated as of PHP 8.4
    return "";
}
|||
} |
File diff suppressed because one or more lines are too long
@ -1,18 +0,0 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
interface StatefulEncoding extends Encoding { |
|||
|
|||
/** Returns the encoding of the code points in $codePoints as a byte string
 *
 * If a code point is less than 0 or greater than 1114111, an exception is thrown
 *
 * If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
 *
 * @param array $codePoints The code points to encode
 * @param bool $fatal Whether an unencodable code point causes an exception rather than a substitution
 * @return string The encoded byte string
 */ |
|||
public static function encode(array $codePoints, bool $fatal = true): string; |
|||
} |
@ -1,18 +0,0 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
interface StatelessEncoding extends Encoding { |
|||
|
|||
/** Returns the encoding of $codePoint as a byte string
 *
 * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
 *
 * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
 *
 * @param int $codePoint The code point to encode
 * @param bool $fatal Whether an unencodable code point causes an exception rather than a substitution
 * @return string The encoded byte string
 */ |
|||
public static function encode(int $codePoint, bool $fatal = true): string; |
|||
} |
File diff suppressed because one or more lines are too long
@ -0,0 +1,279 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\TestCase\Encoding; |
|||
|
|||
use MensBeam\Intl\Encoding\ISO2022JP; |
|||
use MensBeam\Intl\Encoding\Coder; |
|||
use MensBeam\Intl\Encoding\EncoderException; |
|||
|
|||
/**
 * Tests for the ISO-2022-JP encoder and decoder
 *
 * Most tests delegate to the inherited implementation; this class supplies the sample data
 * and the coverage annotations
 */
class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest {
    protected $testedClass = ISO2022JP::class;
    /*
        Char 0 U+007A (1 byte) Offset 0
        Esc: Katakana (3 bytes) Offset 1
        Char 1 U+FF9C (1 byte) Offset 4
        Char 2 U+FF9F (1 byte) Offset 5
        Esc: Double-byte (3 bytes) Offset 6
        Char 3 U+79FB (2 bytes) Offset 9
        Char 4 U+67B8 (2 bytes) Offset 11
        Char 5 U+9B91 (2 bytes) Offset 13
        Esc: ASCII (3 bytes) Offset 15
        Char 6 U+007E (1 byte) Offset 18
        Esc: Roman (3 bytes) Offset 19
        End of string at char 7, offset 22
    */
    protected $seekString = "7A 1B2849 5C 5F 1B2440 305C 5B4E 723A 1B2842 7E 1B284A";
    protected $seekCodes = [0x7A, 0xFF9C, 0xFF9F, 0x79FB, 0x67B8, 0x9B91, 0x7E];
    protected $seekOffsets = [0, 1, 5, 6, 11, 13, 15, 19];
    /* This string contains an invalid character sequence sandwiched between two null characters */
    protected $brokenChar = "00 FF 00";
    /* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
    protected $spanString = "1B284A 41 5A 1B2849 5C 5F 1B2842 30 39";

    /**
     * Provides encoder samples
     *
     * Each set consists of the fatal-mode flag, the code points to encode, and either the expected
     * output as hexadecimal byte values or the exception expected to be thrown
     */
    public function provideCodePoints() {
        return [
            'U+0020 (HTML)' => [false, [0x20], "20"],
            'U+0020 (fatal)' => [true, [0x20], "20"],
            'U+005C (HTML)' => [false, [0x5C], "5C"],
            'U+005C (fatal)' => [true, [0x5C], "5C"],
            'U+007E (HTML)' => [false, [0x7E], "7E"],
            'U+007E (fatal)' => [true, [0x7E], "7E"],
            'U+00A5 (HTML)' => [false, [0xA5], "1B 28 4A 5C 1B 28 42"],
            'U+00A5 (fatal)' => [true, [0xA5], "1B 28 4A 5C 1B 28 42"],
            'U+203E (HTML)' => [false, [0x203E], "1B 28 4A 7E 1B 28 42"],
            'U+203E (fatal)' => [true, [0x203E], "1B 28 4A 7E 1B 28 42"],
            'U+FF61 (HTML)' => [false, [0xFF61], "1B 24 42 21 23 1B 28 42"],
            'U+FF61 (fatal)' => [true, [0xFF61], "1B 24 42 21 23 1B 28 42"],
            'U+FF9F (HTML)' => [false, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
            'U+FF9F (fatal)' => [true, [0xFF9F], "1B 24 42 21 2C 1B 28 42"],
            'U+2212 (HTML)' => [false, [0x2212], "1B 24 42 21 5D 1B 28 42"],
            'U+2212 (fatal)' => [true, [0x2212], "1B 24 42 21 5D 1B 28 42"],
            'U+2116 (HTML)' => [false, [0x2116], "1B 24 42 2D 62 1B 28 42"],
            'U+2116 (fatal)' => [true, [0x2116], "1B 24 42 2D 62 1B 28 42"],
            'U+FFE2 (HTML)' => [false, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
            'U+FFE2 (fatal)' => [true, [0xFFE2], "1B 24 42 22 4C 1B 28 42"],
            'U+00C6 (HTML)' => [false, [0xC6], "26 23 31 39 38 3B"],
            'U+00C6 (fatal)' => [true, [0xC6], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'U+FFFD (HTML)' => [false, [0xFFFD], "26 23 36 35 35 33 33 3B"],
            'U+FFFD (fatal)' => [true, [0xFFFD], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'Roman (HTML)' => [false, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
            'Roman (fatal)' => [true, [0xA5, 0x20, 0x203E], "1B 28 4A 5C 20 7E 1B 28 42"],
            'Roman to ASCII (HTML)' => [false, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
            'Roman to ASCII (fatal)' => [true, [0xA5, 0x5C], "1B 28 4A 5C 1B 28 42 5C"],
            'Roman to error (HTML)' => [false, [0xA5, 0x80], "1B 28 4A 5C 26 23 31 32 38 3B 1B 28 42"],
            'Roman to error (fatal)' => [true, [0xA5, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'JIS (HTML)' => [false, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
            'JIS (fatal)' => [true, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
            'JIS to Roman (HTML)' => [false, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
            'JIS to Roman (fatal)' => [true, [0x2116, 0xA5], "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
            'JIS to ASCII 1 (HTML)' => [false, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
            'JIS to ASCII 1 (fatal)' => [true, [0x2116, 0x20], "1B 24 42 2D 62 1B 28 42 20"],
            'JIS to ASCII 2 (HTML)' => [false, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
            'JIS to ASCII 2 (fatal)' => [true, [0x2116, 0x5C], "1B 24 42 2D 62 1B 28 42 5C"],
            'JIS to error 1 (HTML)' => [false, [0x2116, 0x80], "1B 24 42 2D 62 1B 28 42 26 23 31 32 38 3B"],
            'JIS to error 1 (fatal)' => [true, [0x2116, 0x80], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'JIS to error 2 (HTML)' => [false, [0x2116, 0x1B], "1B 24 42 2D 62 1B 28 42 26 23 36 35 35 33 33 3B"],
            'JIS to error 2 (fatal)' => [true, [0x2116, 0x1B], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            'Escape characters (HTML)' => [false, [0x1B, 0xE, 0xF], "26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B"],
            'Escape characters (fatal)' => [true, [0x1B, 0xE, 0xF], new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            '-1 (HTML)' => [false, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '-1 (fatal)' => [true, [-1], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (HTML)' => [false, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (fatal)' => [true, [0x110000], new EncoderException("", Coder::E_INVALID_CODE_POINT)],
        ];
    }

    /**
     * Provides decoder samples
     *
     * Each set consists of the input as hexadecimal byte values and the code points expected from decoding it
     */
    public function provideStrings() {
        return [
            'empty string' => ["", []],
            'Implied ASCII mode' => ["00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
            'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0, 48, 92, 126, 33, 95]],
            'Roman mode' => ["1B284A 00 30 5C 7E 21 5F", [0, 48, 165, 8254, 33, 95]],
            'Katakana mode' => ["1B2849 00 30 5C 7E 21 5F", [65533, 65392, 65436, 65533, 65377, 65439]],
            'Double-byte mode 1' => ["1B2440 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
            'Double-byte mode 2' => ["1B2442 00 305C 7E21 5F", [65533, 31227, 65533, 65533]],
            'Multiple modes' => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [92, 65377, 31227, 165, 92]],
            'Double escape' => ["1B2849 1B2842 5C", [65533, 92]],
            'Triple escape' => ["1B2849 1B2842 1B284A 5C", [65533, 65533, 165]],
            'Trailing escape' => ["20 1B284A 30 33 1B2849", [32, 48, 51]],
            'Truncated escape 1' => ["1B", [65533]],
            'Truncated escape 2' => ["1B28", [65533, 40]],
            'Truncated escape 3' => ["1B2820", [65533, 40, 32]],
            'Truncated escape 4' => ["1B2020", [65533, 32, 32]],
            'Invalid escape 1' => ["1B2840", [65533, 40, 64]],
            'Invalid escape 2' => ["1B244A", [65533, 36, 74]],
            'Invalid bytes' => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533, 65533]],
        ];
    }

    /**
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\Encoder
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        return parent::testEncodeCodePoints($fatal, $input, $exp);
    }

    /**
     * Encodes each sample one code point at a time through the static method, carrying the mode between calls
     *
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\ISO2022JP::encode
     */
    public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
        $out = "";
        // the mode is shared between successive calls; it is initialised explicitly rather than
        // being created implicitly by the first call (NOTE(review): assumes encode() takes it by reference)
        $mode = null;
        if ($exp instanceof \Throwable) {
            $this->expectException(get_class($exp));
            $this->expectExceptionCode($exp->getCode());
        } else {
            // normalise the expectation to the format produced by bin2hex()
            $exp = strtolower(str_replace(" ", "", $exp));
        }
        foreach ($input as $char) {
            $out .= ISO2022JP::encode($char, $fatal, $mode);
        }
        // a final call with a null code point ends the output (presumably emitting any closing escape sequence)
        $out .= ISO2022JP::encode(null, $fatal, $mode);
        $this->assertSame($exp, bin2hex($out));
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
     * @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode
     * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
     * @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seek
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::eof
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar
     * @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
     * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::errDec
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
     * @covers MensBeam\Intl\Encoding\ISO2022JP::chars
     * @covers MensBeam\Intl\Encoding\ISO2022JP::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @coversNothing
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
        // the nullable type is spelled out because implicit nullability via a null default is deprecated as of PHP 8.4
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpan
     */
    public function testExtractAsciiSpans() {
        parent::testExtractAsciiSpans();
    }

    /**
     * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        parent::testExtractNegativeAsciiSpans();
    }

    /**
     * Decodes single characters from a list of series, each pairing input sequences with expected code points
     *
     * The list of series is currently empty, so no assertion is performed
     *
     * @group optional
     */
    public function testPedanticallyDecodeSingleCharactersAsCodePoint() {
        $series = [
        ];
        foreach ($series as $test) {
            foreach ($test[0] as $a => $input) {
                $class = $this->testedClass;
                $char = hex2bin($input);
                $exp = $test[1][$a];
                $s = new $class($char);
                $this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp.");
                $this->assertFalse($s->nextCode(), "Sequence $input did not end after one character");
            }
        }
    }
}
@ -0,0 +1,221 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\TestCase\Encoding; |
|||
|
|||
use MensBeam\Intl\Encoding\Replacement; |
|||
use MensBeam\Intl\Encoding\DecoderException; |
|||
|
|||
/**
 * Tests for the replacement decoder, which yields a single U+FFFD for any non-empty input
 */
class TestReplacement extends \MensBeam\Intl\Test\DecoderTest {
    protected $testedClass = Replacement::class;

    /**
     * Provides decoder samples
     *
     * Each set consists of the input as hexadecimal byte values and the code points expected from decoding it
     */
    public function provideStrings() {
        return [
            // control samples
            'empty string' => ["", []],
            'Arbitrary string 1' => ["20", [0xFFFD]],
            'Arbitrary string 2' => ["64 8B 20 00 FF A5", [0xFFFD]],
        ];
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Replaces the inherited test with a trivially passing one
     *
     * @coversNothing
     */
    public function testSeekThroughAString() {
        $this->assertTrue(true);
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     * @covers MensBeam\Intl\Encoding\Replacement::eof
     */
    public function testTraversePastTheEndOfAString() {
        $d = new Replacement("a");
        $this->assertFalse($d->eof());
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $d->seek(1);
        $this->assertTrue($d->eof());
        $this->assertSame(1, $d->posChar());
        $this->assertSame(1, $d->posByte());
        // seeking again at the end of the string must not move the position
        $d->seek(1);
        $this->assertTrue($d->eof());
        $this->assertSame(1, $d->posChar());
        $this->assertSame(1, $d->posByte());
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCharacters() {
        $d = new Replacement("A");
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // only one character is ever available, however many are requested
        $this->assertSame("\u{FFFD}", $d->peekChar(2112));
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $this->assertSame("", $d->peekChar(0));
        $this->assertSame("", $d->peekChar(-2112));
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCodePoints() {
        $d = new Replacement("A");
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // only one code point is ever available, however many are requested
        $this->assertSame([0xFFFD], $d->peekCode(2112));
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        $this->assertSame([], $d->peekCode(0));
        $this->assertSame([], $d->peekCode(-2112));
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::lenChar
     * @covers MensBeam\Intl\Encoding\Replacement::lenByte
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Checks that a fatal-mode decoder throws from each reading method, and that only the
     * consuming methods advance the position and set the error position
     *
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testReplacementModes() {
        $d = new Replacement("VVVVVV", true);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // peeking throws, but leaves the decoder untouched
        try {
            $p = $d->peekCode();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(0, $d->posErr);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // reading throws as well, but consumes the whole string
        try {
            $p = $d->nextCode();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(1, $d->posChar());
        $this->assertSame(6, $d->posByte());
        $d->rewind();
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        // the error position is still expected to be 1 after rewinding
        try {
            $p = $d->peekChar();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(0, $d->posChar());
        $this->assertSame(0, $d->posByte());
        try {
            $p = $d->nextChar();
        } catch (\Exception $e) {
            $p = $e;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $p);
        }
        $this->assertSame(1, $d->posErr);
        $this->assertSame(1, $d->posChar());
        $this->assertSame(6, $d->posByte());
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::chars
     * @covers MensBeam\Intl\Encoding\Replacement::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
        // the nullable type is spelled out because implicit nullability via a null default is deprecated as of PHP 8.4
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * @coversNothing
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpan
     */
    public function testExtractAsciiSpans() {
        $d = new Replacement("VVVVVV");
        // a span is always empty and must not consume the string
        $this->assertSame("", $d->asciiSpan($this->allBytes()));
        $d->nextChar();
        $this->assertTrue($d->eof());
    }

    /**
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        $d = new Replacement("VVVVVV");
        // a span is always empty and must not consume the string
        $this->assertSame("", $d->asciiSpanNot(""));
        $d->nextChar();
        $this->assertTrue($d->eof());
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,92 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\TestCase; |
|||
|
|||
use MensBeam\Intl\Encoding; |
|||
use MensBeam\Intl\Encoding\Encoder; |
|||
use MensBeam\Intl\Encoding\UTF16BE; |
|||
use MensBeam\Intl\Encoding\UTF16LE; |
|||
use MensBeam\Intl\Encoding\UTF8; |
|||
|
|||
/**
 * Tests for the label-matching and decoder/encoder factory methods of the Encoding class
 */
class TestEncoding extends \PHPUnit\Framework\TestCase {
    /**
     * Matches each label as-is, upper-cased, and surrounded by whitespace
     *
     * @dataProvider provideLabelData
     */
    public function testMatchALabelToAnEncoding(string $label, array $exp) {
        $this->assertSame($exp, Encoding::matchLabel($label));
        $this->assertSame($exp, Encoding::matchLabel(strtoupper($label)));
        $this->assertSame($exp, Encoding::matchLabel(" $label\n\n\r\t"));
    }

    public function testFailToMatchALabelToAnEncoding() {
        $this->assertNull(Encoding::matchLabel("Not a label"));
    }

    /**
     * Creates a decoder from each label as-is, upper-cased, and surrounded by whitespace
     *
     * @dataProvider provideLabelData
     */
    public function testCreateADecoderFromALabel(string $label, array $data) {
        $this->assertInstanceOf($data['class'], Encoding::createDecoder($label, ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(strtoupper($label), ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(" $label\n\n\r\t", ""));
    }

    /** @dataProvider provideBOMSniffings */
    public function testCreateADecoderWhileSniffingBOM(string $label, string $string, string $class) {
        $this->assertInstanceOf($class, Encoding::createDecoder($label, $string));
    }

    public function testFailToCreateADecoderFromALabel() {
        $this->assertNull(Encoding::createDecoder("Not a label", ""));
    }

    /**
     * Creates an encoder from each label, expecting null for encodings flagged as having no encoder
     *
     * @dataProvider provideLabelData
     */
    public function testCreateAnEncoderFromALabel(string $label, array $data) {
        if ($data['encoder']) {
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder($label));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(strtoupper($label)));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(" $label\n\n\r\t"));
        } else {
            $this->assertNull(Encoding::createEncoder($label));
            $this->assertNull(Encoding::createEncoder(strtoupper($label)));
            $this->assertNull(Encoding::createEncoder(" $label\n\n\r\t"));
        }
    }

    public function testFailToCreateAnEncoderFromALabel() {
        $this->assertNull(Encoding::createEncoder("Not a label"));
    }

    /**
     * Generates one data set per label declared by the instantiable Decoder classes in lib/Encoding
     *
     * Each set consists of the label and an array holding the label, the encoding's name,
     * its class, and whether an encoder is expected to exist for it
     */
    public function provideLabelData() {
        $ns = "MensBeam\\Intl\\Encoding\\";
        $labels = [];
        $names = [];
        foreach (new \GlobIterator(\MensBeam\Intl\BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
            $file = basename($file, ".php");
            $className = $ns.$file;
            $class = new \ReflectionClass($className);
            if ($class->implementsInterface(\MensBeam\Intl\Encoding\Decoder::class) && $class->isInstantiable()) {
                $name = $class->getConstant("NAME");
                $names[$name] = $className;
                foreach ($class->getConstant("LABELS") as $label) {
                    $labels[$label] = $name;
                }
            }
        }
        foreach ($labels as $label => $name) {
            $class = $names[$name];
            // these encodings are expected to have no encoder; names are compared strictly
            $encoder = !in_array($name, ["UTF-16LE", "UTF-16BE", "replacement"], true);
            // array keys which look like integers are converted by PHP, so the label is cast back to a string
            yield [(string) $label, ['label' => (string) $label, 'name' => $name, 'class' => $class, 'encoder' => $encoder]];
        }
    }

    /**
     * Provides sets of an encoding label, an input string, and the decoder class expected for them
     */
    public function provideBOMSniffings() {
        return [
            'No BOM' => ["UTF-8", "Hello world!", UTF8::class],
            'UTF-8 BOM' => ["Shift_JIS", "\xEF\xBB\xBFA", UTF8::class],
            'UTF-16BE BOM' => ["UTF-8", "\xFE\xFF\x00A", UTF16BE::class],
            'UTF-16LE BOM' => ["UTF-8", "\xFF\xFEA\x00", UTF16LE::class],
            'GB18030 BOM' => ["UTF-8", "\x84\x31\x95\x33A", UTF8::class],
        ];
    }
}
@ -0,0 +1,40 @@ |
|||
<?php |
|||
// this script reads the names and labels from each concrete |
|||
// class in the Encoding set and generates tables mapping labels |
|||
// to names and names to classes |
|||
|
|||
use MensBeam\Intl\Encoding\Decoder; |
|||
|
|||
// locate the project root and load the Composer autoloader
define("BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";

$ns = "\\MensBeam\\Intl\\Encoding\\";
// $labels maps each label to an encoding name; $names maps each encoding name to a class name
$labels = [];
$names = [];
$files = new \GlobIterator(BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME);
foreach ($files as $path) {
    $className = $ns.basename($path, ".php");
    $class = new \ReflectionClass($className);
    // only instantiable classes which implement Decoder contribute to the tables
    if (!$class->implementsInterface(Decoder::class) || !$class->isInstantiable()) {
        continue;
    }
    $name = $class->getConstant("NAME");
    $names[$name] = $className;
    foreach ($class->getConstant("LABELS") as $label) {
        $labels[$label] = $name;
    }
}

// render each table entry as PHP source text
$labelEntries = array_map(function($label, $name) {
    return "'$label'=>\"$name\"";
}, array_keys($labels), $labels);
$nameEntries = array_map(function($name, $className) {
    return "'$name'=>$className::class";
}, array_keys($names), $names);

// print both tables as constant definitions, one per line
echo "const LABEL_MAP = [".implode(",", $labelEntries)."];\n";
echo "const NAME_MAP = [".implode(",", $nameEntries)."];\n";
@ -0,0 +1,57 @@ |
|||
<!DOCTYPE html> |
|||
<meta charset=euc-jp> |
|||
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
|||
<script> |
|||
var sampleStrings = { |
|||
'empty string': "", |
|||
// sanity checks |
|||
'sanity check': "40", |
|||
'former ASCII deviations': "5C 7E", |
|||
'changed multibyte index': "A1DD", |
|||
// JIS X 0201 |
|||
'JIS X 0201 range': "8EA1 8EDF", |
|||
'JIS X 0201 bogus range': "8EA0 8EE0", |
|||
'JIS X 0201 truncated character 1': "8E", |
|||
'JIS X 0201 truncated character 2': "8E 20", |
|||
'JIS X 0201 truncated character 3': "8E FF", |
|||
// JIS X 0212 |
|||
'JIS X 0212 assigned range': "8FA2AF 8FEDE3", |
|||
'JIS X 0212 total range': "8FA1A1 8FFEFE", |
|||
'JIS X 0212 bogus range 1': "8FA0A1 8FFFFE", |
|||
'JIS X 0212 bogus range 2': "8FA1A0 8FFEFF", |
|||
'JIS X 0212 truncated character 1': "8FA2", |
|||
'JIS X 0212 truncated character 2': "8FA2 20", |
|||
'JIS X 0212 truncated character 3': "8FA2 FF", |
|||
// JIS X 0208 |
|||
'JIS X 0208 assigned range': "A1A1 FCFE", |
|||
'JIS X 0208 total range': "A1A1 FEFE", |
|||
'JIS X 0208 bogus range': "A1A0 A0FE", |
|||
'JIS X 0208 truncated character 1': "A1", |
|||
'JIS X 0208 truncated character 2': "A1 20", |
|||
'JIS X 0208 truncated character 3': "A1 FF", |
|||
}; |
|||
var sampleCharacters = { |
|||
'U+0064': 0x64, |
|||
'U+00A5': 0xA5, |
|||
'U+203E': 0x203E, |
|||
'U+3088': 0x3088, |
|||
'U+FF96': 0xFF96, |
|||
'U+2212': 0x2212, |
|||
'U+00E6': 0xE6, |
|||
'U+FFE2': 0xFFE2, |
|||
'U+2116': 0x2116, |
|||
'-1': -1, |
|||
'0x110000': 0x110000, |
|||
}; |
|||
var seekCodePoints = [ |
|||
0x007A, |
|||
0xFF96, |
|||
0x3088, |
|||
0xFF0D, |
|||
0x005C, |
|||
0xFF9B, |
|||
/* This code point is not encodable and must be manually entered as 8FB0EF */ |
|||
0x4F58, |
|||
]; |
|||
</script> |
|||
<script src="test.js"></script> |
@ -0,0 +1,46 @@ |
|||
<!DOCTYPE html> |
|||
<meta charset=iso-2022-jp> |
|||
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
|||
<script> |
|||
var sampleStrings = { |
|||
'empty string': "", |
|||
'Implied ASCII mode': "00 30 5C 7E 21 5F", |
|||
'Explicit ASCII mode': "1B2842 00 30 5C 7E 21 5F", |
|||
'Roman mode': "1B284A 00 30 5C 7E 21 5F", |
|||
'Katakana mode': "1B2849 00 30 5C 7E 21 5F", |
|||
'Double-byte mode 1': "1B2440 00 30 5C 7E 21 5F", |
|||
'Double-byte mode 2': "1B2442 00 30 5C 7E 21 5F", |
|||
'Multiple modes': "5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", |
|||
'Double escape': "1B2849 1B2842 5C", |
|||
'Triple escape': "1B2849 1B2842 1B284A 5C", |
|||
'Trailing escape': "20 1B284A 30 33 1B2849", |
|||
'Invalid bytes': "80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", |
|||
}; |
|||
var sampleCharacters = { |
|||
'U+0020': [0x20], |
|||
'U+005C': [0x5C], |
|||
'U+007E': [0x7E], |
|||
'U+00A5': [0xA5], |
|||
'U+203E': [0x203E], |
|||
'U+FF61': [0xFF61], |
|||
'U+FF9F': [0xFF9F], |
|||
'U+2212': [0x2212], |
|||
'U+2116': [0x2116], |
|||
'U+FFE2': [0xFFE2], |
|||
'U+00C6': [0xC6], |
|||
'U+FFFD': [0xFFFD], |
|||
'Roman': [0xA5, 0x20, 0x203E], |
|||
'Roman to ASCII': [0xA5, 0x5C], |
|||
'Roman to error': [0xA5, 0x80], |
|||
'JIS': [0x2116, 0xFFE2, 0x2212], |
|||
'JIS to Roman': [0x2116, 0xA5], |
|||
'JIS to ASCII 1': [0x2116, 0x20], |
|||
'JIS to ASCII 2': [0x2116, 0x5C], |
|||
'JIS to error 1': [0x2116, 0x80], |
|||
'JIS to error 2': [0x2116, 0x1B], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
|||
'Escape characters': [0x1B, 0xE, 0xF], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
|||
'-1': [-1], |
|||
'0x110000': [0x110000], |
|||
}; |
|||
</script> |
|||
<script src="test.js"></script> |
@ -0,0 +1,42 @@ |
|||
<!DOCTYPE html> |
|||
<meta charset=shift_jis> |
|||
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
|||
<script> |
|||
var sampleStrings = { |
|||
'empty string': "", |
|||
'sanity check': "40", |
|||
'former ASCII deviations': "5C 7E", |
|||
'JIS X 0201 range': "A1 DF", |
|||
'EUDC range': "F040 F9FC", |
|||
'JIS X 0208 assigned range': "8140 FC4B", |
|||
'JIS X 0208 total range': "8140 FCFC", |
|||
'JIS X 0208 truncated character 1': "81", |
|||
'JIS X 0208 truncated character 2': "81 20", |
|||
'JIS X 0208 truncated character 3': "81 FF", |
|||
}; |
|||
var sampleCharacters = { |
|||
'U+0064': 0x64, |
|||
'U+00A5': 0xA5, |
|||
'U+203E': 0x203E, |
|||
'U+3088': 0x3088, |
|||
'U+FF96': 0xFF96, |
|||
'U+2212': 0x2212, |
|||
'U+00E6': 0xE6, |
|||
'U+FFE2': 0xFFE2, |
|||
'U+2116': 0x2116, |
|||
'U+E000': 0xE000, |
|||
'-1': -1, |
|||
'0x110000': 0x110000, |
|||
}; |
|||
var seekCodePoints = [ |
|||
0x007A, |
|||
0xFF96, |
|||
0x3088, |
|||
0xFF0D, |
|||
0x005C, |
|||
0xFF9B, |
|||
/* This code point is not encodable and must be manually entered as F040 */ |
|||
0xE000, |
|||
]; |
|||
</script> |
|||
<script src="test.js"></script> |
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||
{ |
|||
"require": { |
|||
"phpunit/phpunit": "^8.5" |
|||
"phpunit/phpunit": "^8.5 | ^9.0" |
|||
} |
|||
} |
|||
|
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||
{ |
|||
"require": { |
|||
"consolidation/robo": "^1.1" |
|||
"consolidation/robo": "^4.0" |
|||
} |
|||
} |
|||
|
File diff suppressed because it is too large
Loading…
Reference in new issue