Compare commits
119 Commits
100 changed files with 10574 additions and 3395 deletions
File diff suppressed because one or more lines are too long
@ -0,0 +1,250 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
abstract class AbstractEncoding implements Decoder {
    /** Error mode used while seeking backwards: decoding errors are neither recorded nor reported (see errDec) */
    protected const MODE_NULL = 0;
    /** Error mode in which errDec yields U+FFFD REPLACEMENT CHARACTER for invalid input */
    protected const MODE_REPLACE = 1;
    /** Error mode in which errDec throws a DecoderException for invalid input */
    protected const MODE_FATAL = 2;

    /** Every byte value from 0x80 through 0xFF; appended to exclusion masks so that ASCII spans stop at the first non-ASCII byte */
    protected const HIGH_BYTES = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";

    /** @var string $string The string being decoded */
    protected $string;
    /** @var int $posByte The current byte position in the string */
    protected $posByte = 0;
    /** @var int $posChar The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int $lenByte The length of the string, in bytes */
    protected $lenByte = null;
    /** @var int $lenChar The length of the string in characters, if known */
    protected $lenChar = null;
    /** @var array $errStack A list of error data to aid in backwards seeking; the most recent error is kept off the stack */
    protected $errStack = [];
    /** @var int $errMark The byte position marking the most recent error. The one or more bytes previous to this position constitute an invalid character */
    protected $errMark = -1;
    /** @var int $errSync The byte position to which to move to skip over the most recent erroneous character */
    protected $errSync = -2;
    /** @var int $errMode The selected error mode (fatal or replace) */
    protected $errMode = self::MODE_REPLACE;
    /** @var bool $allowSurrogates Whether surrogates in encodings other than UTF-16 should be passed through */
    protected $allowSurrogates = false;
    /** @var bool $selfSynchronizing Whether the concrete class represents a self-synchronizing decoder. Such decoders do not use the error stack */
    protected $selfSynchronizing = false;
    /** @var string[] $stateProps The list of properties which constitute state which must be saved when peeking/seeking; some encodings may add to this list for their own purposes */
    protected $stateProps = ["posChar", "posByte", "posErr"];

    /** @var int $posErr The value of $posChar at the time the most recent decoding error was reported (set by errDec) */
    public $posErr = 0;

    /** Seeks backwards through the string the specified number of characters.
     * If the beginning of the string is reached before the requested number
     * of characters has been skipped over, the number of remaining characters
     * is returned.
     */
    abstract protected function seekBack(int $distance): int;

    /** Constructs a new decoder
     *
     * @param string $string The string to decode
     * @param bool $fatal If true the error mode is set to MODE_FATAL, otherwise to MODE_REPLACE
     * @param bool $allowSurrogates Stored for use by concrete decoders
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL : self::MODE_REPLACE;
        $this->allowSurrogates = $allowSurrogates;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Seeks to the start of the string, also discarding all recorded error positions */
    public function rewind(): void {
        $this->posByte = 0;
        $this->posChar = 0;
        $this->errMark = -1;
        $this->errSync = -2;
        $this->errStack = [];
    }

    /** Retrieves the next character in the string, in UTF-8 encoding
     *
     * The empty string is returned if there is no byte at the current position
     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input
            return "";
        }
        if (ord($b) < 0x80) {
            // an ASCII byte is returned as-is without invoking the decoder proper
            $this->posChar++;
            $this->posByte++;
            return $b;
        }
        // otherwise return the serialization of the code point at the current position
        return UTF8::encode($this->nextCode());
    }

    /** Advances $distance characters through the string
     *
     * Returns the number of requested characters which could not be skipped
     * because the end (or beginning) of the string was reached first
     *
     * @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            do {
                $p = $this->nextCode();
            } while ($p !== false && --$distance);
            return $distance;
        }
        if ($distance === 0) {
            return 0;
        }
        $distance = abs($distance);
        if (!$this->posChar) {
            // already at the start of the string: nothing can be skipped
            return $distance;
        }
        // errors encountered while seeking back must not be recorded or reported
        $mode = $this->errMode;
        $this->errMode = self::MODE_NULL;
        try {
            return $this->seekBack($distance);
        } finally {
            // always reinstate the caller's error mode, even if seeking back fails
            $this->errMode = $mode;
        }
    }

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
     *
     * @param int $num The number of characters to retrieve
     */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Retrieves the next $num code points from the string, without advancing the character pointer
     *
     * @param int $num The number of code points to retrieve
     */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            $this->stateApply($state);
        }
        return $out;
    }

    /** Returns the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

    /** Returns the length of the string in code points
     *
     * The first call decodes to the end of the string; the result is cached.
     * The decoder's position is restored afterwards, including when decoding
     * throws, in which case nothing is cached.
     */
    public function lenChar(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                // restore the position even if nextCode() threw, as peekChar() and peekCode() do
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Returns whether the byte position is at or past the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

    /** Generates an iterator which steps through each character in the string, keyed by character position */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates an iterator which steps through each code point in the string, keyed by character position */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
     *
     * Bytes 0x80-0xFF are removed from the mask before matching
     *
     * @param string $mask The set of ASCII characters to match
     * @param int|null $length The maximum number of characters to advance by
     */
    public function asciiSpan(string $mask, ?int $length = null): string {
        $mask = preg_replace('/[\x80-\xFF]/s', "", $mask);
        if ($length !== null) {
            $len = strspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strspn($this->string, $mask, $this->posByte);
        }
        if (!$len) {
            return "";
        }
        $out = substr($this->string, $this->posByte, $len);
        // only ASCII was consumed, so bytes and characters advance in lock-step
        $this->posByte += $len;
        $this->posChar += $len;
        return $out;
    }

    /** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
     *
     * All bytes 0x80-0xFF are added to the mask, so the span always ends at the first non-ASCII byte
     *
     * @param string $mask The set of ASCII characters to not match
     * @param int|null $length The maximum number of characters to advance by
     */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        $mask .= self::HIGH_BYTES;
        if ($length !== null) {
            $len = strcspn($this->string, $mask, $this->posByte, $length);
        } else {
            $len = strcspn($this->string, $mask, $this->posByte);
        }
        if (!$len) {
            return "";
        }
        $out = substr($this->string, $this->posByte, $len);
        // only ASCII was consumed, so bytes and characters advance in lock-step
        $this->posByte += $len;
        $this->posChar += $len;
        return $out;
    }

    /** Returns a copy of the decoder's state to keep in memory
     *
     * The copy holds the current size of the error stack along with the value of each property listed in $stateProps
     */
    protected function stateSave(): array {
        $out = ['errCount' => count($this->errStack)];
        foreach ($this->stateProps as $prop) {
            $out[$prop] = $this->$prop;
        }
        return $out;
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state A state array previously produced by stateSave()
     */
    protected function stateApply(array $state): void {
        // unwind any errors recorded since the state was saved
        while (count($this->errStack) > $state['errCount']) {
            [$this->errMark, $this->errSync] = array_pop($this->errStack);
        }
        unset($state['errCount']);
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding errors
     *
     * @param int $mode The error mode to apply; in MODE_NULL nothing is recorded and null is returned
     * @param int $charOffset The character offset to report in the exception message
     * @param int $byteOffset The byte offset to report, which is also recorded as the new $errSync
     * @return int|null 0xFFFD in replace mode, or null in null mode
     * @throws DecoderException in fatal mode
     */
    protected function errDec(int $mode, int $charOffset, int $byteOffset): ?int {
        if ($mode !== self::MODE_NULL) {
            // expose the error to the user; this disambiguates a literal replacement character
            $this->posErr = $this->posChar;
            // unless the decoder is self-synchronizing, mark the error so that it can be skipped when seeking back
            if (!$this->selfSynchronizing) {
                $this->errStack[] = [$this->errMark, $this->errSync];
                $this->errMark = $this->posByte;
                $this->errSync = $byteOffset;
            }
            if ($mode === self::MODE_FATAL) {
                throw new DecoderException("Invalid code sequence at character offset $charOffset (byte offset $byteOffset)", self::E_INVALID_BYTE);
            } else {
                return 0xFFFD;
            }
        }
        return null;
    }

    /** Handles encoding errors
     *
     * @param bool $htmlMode If true a decimal HTML character reference built from $data is returned; if false an exception is thrown
     * @param mixed $data The code point which could not be encoded
     * @throws EncoderException when $htmlMode is false
     */
    protected static function errEnc(bool $htmlMode, $data = null): string {
        if ($htmlMode) {
            return "&#".(string) $data.";";
        } else {
            // fatal replacement mode for encoders; not applicable to Unicode transformation formats
            throw new EncoderException("Code point $data not available in target encoding", Coder::E_UNAVAILABLE_CODE_POINT);
        }
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,20 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface Coder {
    /** Exception code: the supplied code point is invalid */
    public const E_INVALID_CODE_POINT = 1;
    /** Exception code: the code point is not available in the target encoding */
    public const E_UNAVAILABLE_CODE_POINT = 3;
    /** Exception code: no encoder is available for the requested label */
    public const E_UNAVAILABLE_ENCODER = 4;

    /** Encodes a single code point, returning the result as a byte string
     *
     * @param int $codePoint The Unicode code point to encode; values below 0 or above 1114111 cause an exception to be thrown
     * @param bool $fatal If true, an exception is thrown when the code point cannot be encoded into a character; if false, an HTML character reference is substituted instead
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
@ -0,0 +1,105 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface Decoder {
    /** Exception code: the input contains an invalid byte sequence */
    public const E_INVALID_BYTE = 2;

    /** Constructs a new decoder
     *
     * @param string $string The string to decode
     * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
     * @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);

    /** Returns the current byte position of the decoder */
    public function posByte(): int;

    /** Returns the current character position of the decoder */
    public function posChar(): int;

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string;

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * @return int|false
     */
    public function nextCode();

    /** Advance $distance characters through the string
     *
     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
     *
     * @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
     */
    public function seek(int $distance): int;

    /** Seeks to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind(): void;

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
     *
     * @param int $num The number of characters to retrieve
     */
    public function peekChar(int $num = 1): string;

    /** Retrieves the next $num code points from the string, without advancing the character pointer
     *
     * @param int $num The number of code points to retrieve
     */
    public function peekCode(int $num = 1): array;

    /** Calculates the length of the string in bytes */
    public function lenByte(): int;

    /** Calculates the length of the string in code points
     *
     * Note that this may involve processing to the end of the string
     */
    public function lenChar(): int;

    /** Returns whether the character pointer is at the end of the string */
    public function eof(): bool;

    /** Generates an iterator which steps through each character in the string */
    public function chars(): \Generator;

    /** Generates an iterator which steps through each code point in the string */
    public function codes(): \Generator;

    /** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
     *
     * The mask must consist only of ASCII characters.
     *
     * Note that if the empty string is returned, this does not necessarily signal the end of the string
     *
     * @param string $mask The set of ASCII characters to match
     * @param int|null $length The maximum number of characters to advance by
     */
    public function asciiSpan(string $mask, ?int $length = null): string;

    /** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
     *
     * The mask must consist only of ASCII characters.
     *
     * Note that if the empty string is returned, this does not necessarily signal the end of the string
     *
     * @param string $mask The set of ASCII characters to not match
     * @param int|null $length The maximum number of characters to advance by
     */
    public function asciiSpanNot(string $mask, ?int $length = null): string;
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,322 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding as Matcher; |
||||
|
|
||||
|
class Encoder {
    /** Map of encoding names to the classes whose static encode() method handles them
     *
     * ISO-2022-JP is deliberately absent: its encoder takes an additional mode argument and is dispatched separately
     */
    protected const ENCODERS = [
        "UTF-8"          => UTF8::class,
        "Big5"           => Big5::class,
        "EUC-JP"         => EUCJP::class,
        "EUC-KR"         => EUCKR::class,
        "gb18030"        => GB18030::class,
        "GBK"            => GBK::class,
        "IBM866"         => IBM866::class,
        "ISO-8859-2"     => ISO88592::class,
        "ISO-8859-3"     => ISO88593::class,
        "ISO-8859-4"     => ISO88594::class,
        "ISO-8859-5"     => ISO88595::class,
        "ISO-8859-6"     => ISO88596::class,
        "ISO-8859-7"     => ISO88597::class,
        "ISO-8859-8"     => ISO88598::class,
        "ISO-8859-8-I"   => ISO88598I::class,
        "ISO-8859-10"    => ISO885910::class,
        "ISO-8859-13"    => ISO885913::class,
        "ISO-8859-14"    => ISO885914::class,
        "ISO-8859-15"    => ISO885915::class,
        "ISO-8859-16"    => ISO885916::class,
        "KOI8-R"         => KOI8R::class,
        "KOI8-U"         => KOI8U::class,
        "macintosh"      => Macintosh::class,
        "Shift_JIS"      => ShiftJIS::class,
        "windows-1250"   => Windows1250::class,
        "windows-1251"   => Windows1251::class,
        "windows-1252"   => Windows1252::class,
        "windows-1253"   => Windows1253::class,
        "windows-1254"   => Windows1254::class,
        "windows-1255"   => Windows1255::class,
        "windows-1256"   => Windows1256::class,
        "windows-1257"   => Windows1257::class,
        "windows-1258"   => Windows1258::class,
        "windows-874"    => Windows874::class,
        "x-mac-cyrillic" => XMacCyrillic::class,
        "x-user-defined" => XUserDefined::class,
    ];

    /** @var string $name The encoding name reported by the label matcher for the label given at construction */
    protected $name;
    /** @var bool $fatal Whether unencodable characters cause an exception rather than a substitution */
    protected $fatal = true;
    /** @var mixed $mode The ISO-2022-JP mode handed to the encoder by encodeChar() and finalize(); presumably updated by reference - confirm against ISO2022JP::encode */
    protected $mode = null;

    /** Constructs a new encoder for the specified $label
     *
     * @param string $label One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS"
     * @param bool $fatal If true (the default) exceptions will be thrown when a character cannot be represented in the target encoding; if false HTML character references will be substituted instead
     * @throws EncoderException if the label is not matched or the matched encoding has no encoder
     *
     * @see https://encoding.spec.whatwg.org#names-and-labels
     */
    public function __construct(string $label, bool $fatal = true) {
        $l = Matcher::matchLabel($label);
        if (!$l || !$l['encoder']) {
            throw new EncoderException("Label '$label' does not have an encoder", Coder::E_UNAVAILABLE_ENCODER);
        }
        $this->name = $l['name'];
        $this->fatal = $fatal;
    }

    /** Encodes a series of code point numbers into a string
     *
     * For ISO-2022-JP the string is terminated automatically using a mode
     * local to this call; the mode used by encodeChar() is not touched
     *
     * @param iterable $codePoints An iterable set of integers representing code points in the Unicode range
     */
    public function encode(iterable $codePoints): string {
        $out = "";
        if ($this->name === "ISO-2022-JP") {
            // start each string from a fresh, explicitly initialized mode
            $mode = null;
            foreach ($codePoints as $codePoint) {
                $out .= ISO2022JP::encode($codePoint, $this->fatal, $mode);
            }
            // a null code point asks the encoder for any terminal bytes
            $out .= ISO2022JP::encode(null, $this->fatal, $mode);
        } elseif (isset(self::ENCODERS[$this->name])) {
            // resolve the encoder class once rather than once per code point
            $class = self::ENCODERS[$this->name];
            foreach ($codePoints as $codePoint) {
                $out .= $class::encode($codePoint, $this->fatal);
            }
        }
        return $out;
    }

    /** Encodes a single character into a string
     *
     * When using this method to encode a string, the finalize() method should be called to terminate the string
     *
     * @param int $codePoint An integer representing the Unicode code point number to encode
     */
    public function encodeChar(int $codePoint): string {
        if ($this->name === "ISO-2022-JP") {
            return ISO2022JP::encode($codePoint, $this->fatal, $this->mode);
        }
        // the constructor only accepts names which have an encoder, so the lookup is expected to succeed
        $class = self::ENCODERS[$this->name];
        return $class::encode($codePoint, $this->fatal);
    }

    /** Finalizes a string, returning any terminal bytes to append to the output
     *
     * For the ISO-2022-JP encoding, this method must be called after the last character is encoded to correctly encode a string; for other encodings this is a no-op
     */
    public function finalize(): string {
        return ISO2022JP::encode(null, $this->fatal, $this->mode);
    }
}
@ -1,78 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
interface Encoding {
    // error-handling modes
    const MODE_NULL = 0;
    const MODE_REPLACE = 1;
    const MODE_HTML = 2;
    const MODE_FATAL_DEC = 3;
    const MODE_FATAL_ENC = 4;

    // exception codes
    const E_INVALID_CODE_POINT = 1;
    const E_INVALID_BYTE = 2;
    const E_INVALID_MODE = 3;
    const E_UNAVAILABLE_CODE_POINT = 4;

    /** Creates a decoder for the given string
     *
     * @param string $string The string to decode
     * @param bool $fatal When true, an exception is thrown whenever an invalid code sequence is encountered; when false, replacement characters are substituted
     */
    public function __construct(string $string, bool $fatal = false);

    /** Reports the decoder's current position, measured in bytes */
    public function posByte(): int;

    /** Reports the decoder's current position, measured in characters */
    public function posChar(): int;

    /** Reads the next character of the string, encoded as UTF-8
     *
     * The result may be a replacement character; the empty string signals that the end of the string has been reached
     */
    public function nextChar(): string;

    /** Decodes the next character of the string into its code point number
     *
     * @return int|bool The code point, or false once the end of the string has been reached
     */
    public function nextCode();

    /** Moves $distance characters through the string
     *
     * A negative $distance performs the operation in reverse
     *
     * @return int The number of requested characters left over if the end (or beginning) of the string was reached first
     */
    public function seek(int $distance): int;

    /** Returns to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind();

    /** Reads the next $num characters (as UTF-8) from the string while leaving the character pointer where it is */
    public function peekChar(int $num = 1): string;

    /** Reads the next $num code points from the string while leaving the character pointer where it is */
    public function peekCode(int $num = 1): array;

    /** Computes the length of the string in code points
     *
     * Note that this may involve processing to the end of the string
     */
    public function len(): int;

    /** Produces an iterator over each character in the string */
    public function chars(): \Generator;

    /** Produces an iterator over each code point in the string */
    public function codes(): \Generator;
}
|
File diff suppressed because one or more lines are too long
@ -1,191 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
trait GenericEncoding {
    /** @var string The string being decoded */
    protected $string;
    /** @var int The current byte position in the string */
    protected $posByte = 0;
    /** @var int The current character (code point) position in the string */
    protected $posChar = 0;
    /** @var int The length of the string, in bytes */
    protected $lenByte = null;
    /** @var int|null The length of the string in characters, once it has been computed by len() */
    protected $lenChar = null;
    /** @var int The error-handling mode; one of the MODE_* constants of the using class */
    protected $errMode = self::MODE_REPLACE;

    /** Constructs a new decoder
     *
     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
     */
    public function __construct(string $string, bool $fatal = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Seeks to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string {
        // get the byte at the current position; reading past the end yields the empty string
        // (null coalescing is used rather than error suppression so that unrelated errors are not hidden)
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } elseif (ord($b) < 0x80) {
            // if the byte is an ASCII character, simply return it
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Advance $distance characters through the string
     *
     * If $distance is negative, the operation will be performed in reverse
     *
     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            // only count a step once a code point has actually been consumed;
            // a read which hits the end of the string must not reduce the remainder
            while ($distance > 0 && $this->nextCode() !== false) {
                $distance--;
            }
            return $distance;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            if (!$this->posByte) {
                // already at the start of the string: nothing can be skipped
                return $distance;
            }
            // errors are silenced while seeking backwards and the previous mode restored afterwards
            $mode = $this->errMode;
            $this->errMode = self::MODE_NULL;
            $out = $this->seekBack($distance);
            $this->errMode = $mode;
            return $out;
        } else {
            return 0;
        }
    }

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            // restore the position even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Retrieves the next $num code points from the string, without advancing the character pointer */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            // restore the position even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Calculates the length of the string in code points
     *
     * Note that this may involve processing to the end of the string; the result is cached for subsequent calls
     */
    public function len(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                // consume the rest of the string; the final character position is the length
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                // as with the peek methods, restore the position even if decoding threw
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Generates an iterator which steps through each character in the string */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            // the key is the position of the character just consumed
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates an iterator which steps through each code point in the string */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            // the key is the position of the code point just consumed
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Returns a copy of the decoder's state to keep in memory */
    protected function stateSave(): array {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
        ];
    }

    /** Sets the decoder's state to the values specified */
    protected function stateApply(array $state) {
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding and encoding errors
     *
     * @param int $mode One of the MODE_* constants, selecting how the error is reported
     * @param mixed $data For MODE_HTML and MODE_FATAL_ENC the offending code point; for MODE_FATAL_DEC an array of character offset and byte offset
     */
    protected static function err(int $mode, $data = null) {
        switch ($mode) {
            case self::MODE_NULL:
                // used internally during backward seeking for some encodings
                return null; // @codeCoverageIgnore
            case self::MODE_REPLACE:
                // standard "replace" mode
                return 0xFFFD;
            case self::MODE_HTML:
                // the "html" replacement mode; not applicable to Unicode transformation formats
                return "&#".(string) $data.";";
            case self::MODE_FATAL_DEC:
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC:
                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
            default:
                // indicative of internal bug; should never be triggered
                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
        }
    }
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,17 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface ModalCoder {
    /** Encodes a single code point into a byte string, carrying encoder state between calls
     *
     * @param int|null $codePoint The Unicode code point to encode, or null to signal end-of-file. An exception is thrown for values below 0 or above 1114111
     * @param bool $fatal If true, a code point which cannot be encoded causes an exception to be thrown; if false, an HTML character reference is substituted instead
     * @param mixed &$mode The current encoder mode, kept by reference. Pass an uninitialized variable on the first invocation and the same variable on every subsequent invocation
     */
    public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string;
}
@ -0,0 +1,133 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
class Replacement implements Decoder {
    public const NAME = "replacement";
    public const LABELS = [
        "csiso2022kr",
        "hz-gb-2312",
        "iso-2022-cn",
        "iso-2022-cn-ext",
        "iso-2022-kr",
        "replacement",
    ];

    /** @var int The length of the input string, in bytes */
    protected $len = 0;
    /** @var bool Whether the single character of output has already been consumed */
    protected $done = false;
    /** @var bool Whether decoding throws an exception rather than producing a replacement character */
    protected $fatal = false;

    /** @var int Set to 1 once the character has been consumed; presumably the character position of the most recent error (TODO confirm against the Decoder contract) */
    public $posErr = 0;

    /** Constructs a new decoder
     *
     * The content of the string is never examined: only its length is retained.
     * $allowSurrogates is accepted for signature compatibility with other decoders and is not used
     *
     * @param string $string The string to decode
     * @param bool $fatal If true, attempting to decode a non-empty string throws an exception
     * @param bool $allowSurrogates Unused
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->len = strlen($string);
        $this->fatal = $fatal;
    }

    /** Returns the current byte position: the full length once the character has been consumed, zero before */
    public function posByte(): int {
        return $this->done ? $this->len : 0;
    }

    /** Returns the current character position: one once the character has been consumed, zero before */
    public function posChar(): int {
        return $this->done ? 1 : 0;
    }

    /** Retrieves the replacement character (in UTF-8 encoding) the first time it is called on a non-empty string, and the empty string thereafter
     *
     * @throws DecoderException in fatal mode
     */
    public function nextChar(): string {
        if (!$this->eof()) {
            try {
                return $this->peekChar();
            } finally {
                // the string counts as consumed even when peeking threw in fatal mode
                $this->done = true;
                $this->posErr = 1;
            }
        }
        return "";
    }

    /** Retrieves the replacement character's code point the first time it is called on a non-empty string, and false thereafter
     *
     * @return int|bool
     * @throws DecoderException in fatal mode
     */
    public function nextCode() {
        if (!$this->eof()) {
            try {
                return $this->peekCode()[0];
            } finally {
                // the string counts as consumed even when peeking threw in fatal mode
                $this->done = true;
                $this->posErr = 1;
            }
        }
        return false;
    }

    /** Advance $distance characters through the string
     *
     * If $distance is negative, the operation will be performed in reverse
     *
     * The number of requested characters which could not be skipped is returned as a non-negative integer, in both directions
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            if (!$this->eof()) {
                $distance--;
                $this->nextCode();
            }
            return $distance;
        } elseif ($distance < 0) {
            // report the remainder as a magnitude, as the other decoders in this library do
            $distance = abs($distance);
            // stepping back is only possible if the character was actually consumed;
            // an empty string is at end-of-file without ever having moved
            if ($this->done) {
                $distance--;
                $this->rewind();
            }
            return $distance;
        }
        return 0;
    }

    /** Seeks to the start of the string */
    public function rewind(): void {
        $this->done = false;
    }

    /** Retrieves the replacement character (in UTF-8 encoding) without consuming it, or the empty string at end-of-file or if $num is not positive
     *
     * @throws DecoderException in fatal mode
     */
    public function peekChar(int $num = 1): string {
        if (!$this->eof() && $num > 0) {
            if ($this->fatal) {
                throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE);
            }
            return "\u{FFFD}";
        }
        return "";
    }

    /** Retrieves the replacement character's code point (in a single-element array) without consuming it, or an empty array at end-of-file or if $num is not positive
     *
     * @throws DecoderException in fatal mode
     */
    public function peekCode(int $num = 1): array {
        if (!$this->eof() && $num > 0) {
            if ($this->fatal) {
                throw new DecoderException("Unable to decode string", self::E_INVALID_BYTE);
            }
            return [0xFFFD];
        }
        return [];
    }

    /** Returns the length of the string in bytes */
    public function lenByte(): int {
        return $this->len;
    }

    /** Returns the length of the string in characters: one for any non-empty string, zero otherwise */
    public function lenChar(): int {
        return (int) ($this->len > 0);
    }

    /** Returns whether the character pointer is at the end of the string; an empty string is always at its end */
    public function eof(): bool {
        return $this->done || $this->len === 0;
    }

    /** Generates an iterator which yields the single character of the string, if it has not yet been consumed */
    public function chars(): \Generator {
        if (!$this->eof()) {
            yield 0 => $this->nextChar();
        }
    }

    /** Generates an iterator which yields the single code point of the string, if it has not yet been consumed */
    public function codes(): \Generator {
        if (!$this->eof()) {
            yield 0 => $this->nextCode();
        }
    }

    /** Always returns the empty string: this decoder never produces ASCII characters */
    public function asciiSpan(string $mask, ?int $length = null): string {
        return "";
    }

    /** Always returns the empty string: this decoder never produces ASCII characters */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        return "";
    }
}
File diff suppressed because one or more lines are too long
@ -1,18 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
interface StatefulEncoding extends Encoding {
    /** Encodes a sequence of code points, returning the result as a byte string
     *
     * An exception is thrown for any code point less than 0 or greater than 1114111
     *
     * When $fatal is true an exception is also thrown for any code point which cannot be encoded into a character; when it is false an HTML character reference is substituted instead
     */
    public static function encode(array $codePoints, bool $fatal = true): string;
}
|
@ -1,18 +0,0 @@ |
|||||
<?php |
|
||||
/** @license MIT |
|
||||
* Copyright 2018 J. King et al. |
|
||||
* See LICENSE and AUTHORS files for details */ |
|
||||
|
|
||||
declare(strict_types=1); |
|
||||
namespace MensBeam\Intl\Encoding; |
|
||||
|
|
||||
interface StatelessEncoding extends Encoding {
    /** Encodes a single code point, returning the result as a byte string
     *
     * An exception is thrown if $codePoint is less than 0 or greater than 1114111
     *
     * When $fatal is true an exception is also thrown if the code point cannot be encoded into a character; when it is false an HTML character reference is substituted instead
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
|
@ -0,0 +1,180 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
abstract class UTF16 extends AbstractEncoding {
    /** @var bool NOTE(review): presumably consulted by AbstractEncoding to choose a seeking strategy; confirm against the parent class */
    protected $selfSynchronizing = true;
    /** @var int The number of bytes (1 to 3) making up a truncated character at the end of the string, or 0 if none has been encountered */
    protected $dirtyEOF = 0;
    /** @var int The size of the string's byte order mark, if any */
    protected $BOM = 0;

    /** Constructs a new decoder, skipping over a leading byte order mark matching the concrete class's byte order
     *
     * The arguments are passed through unchanged to the parent constructor
     */
    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        // register dirtyEOF with the parent's state properties (presumably so that it is saved and restored with the rest of the decoder state)
        $this->stateProps[] = "dirtyEOF";
        parent::__construct($string, $fatal, $allowSurrogates);
        if (substr($string, 0, 2) === (static::BE ? "\xFE\xFF" : "\xFF\xFE")) {
            $this->BOM = 2;
            $this->posByte = 2;
        }
    }

    /** Seeks to the start of the string, just past any byte order mark */
    public function rewind(): void {
        parent::rewind();
        $this->posByte = $this->BOM;
        // we are no longer at a truncated end of string; a stale value would corrupt the next backward seek
        $this->dirtyEOF = 0;
    }

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * Unpaired surrogates are returned as-is if surrogates are allowed and otherwise handled as errors; a character truncated by the end of the string is always handled as an error
     *
     * @return int|bool
     */
    public function nextCode() {
        $lead_b = null; // first byte of the code unit being assembled
        $lead_s = null; // pending lead (high) surrogate
        $this->posChar++;
        while (($b = $this->string[$this->posByte++] ?? "") !== "") {
            $b = ord($b);
            if ($lead_b === null) {
                $lead_b = $b;
                continue;
            }
            // two bytes are available: assemble them according to byte order
            $code = static::BE ? (($lead_b << 8) + $b) : (($b << 8) + $lead_b);
            $lead_b = null;
            if ($lead_s !== null) {
                if ($code >= 0xDC00 && $code <= 0xDFFF) {
                    // a valid surrogate pair
                    return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
                }
                // the lead surrogate is unpaired; un-read the code unit which followed it
                $this->posByte -= 2;
                if ($this->allowSurrogates) {
                    return $lead_s;
                }
                $this->posErr = $this->posChar;
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
            }
            if ($code >= 0xD800 && $code <= 0xDBFF) {
                // a lead surrogate; the next code unit decides what to do with it
                $lead_s = $code;
                continue;
            }
            if ($code >= 0xDC00 && $code <= 0xDFFF && !$this->allowSurrogates) {
                // an unpaired trail surrogate
                $this->posErr = $this->posChar;
                return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - 2);
            }
            return $code;
        }
        // the failed read still advanced the byte pointer; undo that
        $this->posByte--;
        // NOTE: the tests below must be against null, as a dangling lead byte may legitimately be zero
        if ($lead_b === null && $lead_s === null) {
            // clean EOF
            $this->posChar--;
            return false;
        }
        // dirty EOF; note how many bytes the last character had
        // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
        $this->dirtyEOF = ($lead_s !== null ? 2 : 0) + ($lead_b !== null ? 1 : 0);
        return $this->errDec($this->errMode, $this->posChar - 1, $this->posByte - $this->dirtyEOF);
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The empty string is returned if the end of the string has been reached
     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Consumes and returns (as single bytes) the run of ASCII characters at the current position which appear in $mask
     *
     * @param string $mask The set of ASCII characters to accept
     * @param int|null $length The maximum number of characters to consume; null imposes no limit
     */
    public function asciiSpan(string $mask, ?int $length = null): string {
        return $this->asciiRun($mask, $length, false);
    }

    /** Consumes and returns (as single bytes) the run of ASCII characters at the current position which do not appear in $mask
     *
     * @param string $mask The set of ASCII characters to reject
     * @param int|null $length The maximum number of characters to consume; null imposes no limit
     */
    public function asciiSpanNot(string $mask, ?int $length = null): string {
        return $this->asciiRun($mask, $length, true);
    }

    /** Shared implementation of asciiSpan and asciiSpanNot; $exclude selects whether characters in $mask end the run (true) or are the only ones accepted (false) */
    private function asciiRun(string $mask, ?int $length, bool $exclude): string {
        // UTF-16 has no single-byte ASCII characters, so each code unit must be examined in turn
        $out = "";
        $left = $length ?? -1;
        while ($left) {
            $c1 = $this->string[$this->posByte] ?? "";
            $c2 = $this->string[$this->posByte + 1] ?? "";
            if ($c1 === "" || $c2 === "") {
                // fewer than two bytes remain; checking this first also avoids searching the mask for an empty needle
                break;
            }
            [$high, $low] = static::BE ? [$c1, $c2] : [$c2, $c1];
            if ($high !== "\x00" || ord($low) >= 0x80) {
                // not an ASCII character
                break;
            }
            if ((strpos($mask, $low) === false) !== $exclude) {
                // the character's presence in the mask is not what was asked for
                break;
            }
            $out .= $low;
            $this->posByte += 2;
            $this->posChar++;
            $left--;
        }
        return $out;
    }

    /** Implements backward seeking $distance characters */
    protected function seekBack(int $distance): int {
        if ($this->dirtyEOF && $distance) {
            // the last character was truncated; step back over its recorded size
            $distance--;
            $this->posChar--;
            $this->posByte -= $this->dirtyEOF;
            $this->dirtyEOF = 0;
        }
        while ($distance > 0 && $this->posChar > 0) {
            $distance--;
            $this->posChar--;
            if ($this->posByte < 4) {
                // if we're less than four bytes into the string, the previous character is necessarily double-byte
                $this->posByte -= 2;
            } else {
                // otherwise go back four bytes and consume a character
                $start = $this->posByte;
                $this->posByte -= 4;
                // nextCode will increment the character position again
                $this->posChar--;
                $this->nextCode();
                if ($this->posByte == $start) {
                    // if we're back at our starting position the character was four bytes
                    $this->posByte -= 4;
                }
                // otherwise we're already where we need to be
            }
        }
        return $distance;
    }
}
@ -0,0 +1,16 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
class UTF16BE extends UTF16 {
    /** Byte-order flag read by the parent class through static::BE; true selects big-endian */
    protected const BE = true;
    /** The name of the encoding */
    public const NAME = "UTF-16BE";
    /** The labels associated with the encoding */
    public const LABELS = ["unicodefffe", "utf-16be"];
}
@ -0,0 +1,21 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
class UTF16LE extends UTF16 {
    /** Byte-order flag read by the parent class through static::BE; false selects little-endian */
    protected const BE = false;
    /** The name of the encoding */
    public const NAME = "UTF-16LE";
    /** The labels associated with the encoding */
    public const LABELS = ["csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le"];
}
@ -0,0 +1,113 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
class XUserDefined extends AbstractEncoding implements Coder, Decoder {
    public const NAME = "x-user-defined";
    public const LABELS = ["x-user-defined"];

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The empty string is returned if the end of the string has been reached; every byte decodes to some character, so no replacement character is ever produced here
     */
    public function nextChar(): string {
        $code = $this->nextCode();
        if ($code === false) {
            return "";
        }
        // ASCII code points are their own single-byte UTF-8 serialization
        return ($code < 0x80) ? chr($code) : UTF8::encode($code);
    }

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * @return int|bool
     */
    public function nextCode() {
        // get the byte at the current position; every character is one byte, so the character position doubles as the byte offset
        $b = $this->string[$this->posChar] ?? "";
        if ($b === "") {
            // end of input
            return false;
        }
        $this->posChar++;
        $this->posByte++;
        $p = ord($b);
        // ASCII bytes map to themselves; high bytes 0x80-0xFF map to U+F780-U+F7FF
        return ($p < 0x80) ? $p : 0xF700 + $p;
    }

    /** Advance $distance characters through the string
     *
     * If $distance is negative, the operation will be performed in reverse
     *
     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            // every character is exactly one byte, so the pointers can be moved arithmetically
            $step = min($distance, max(0, $this->lenByte - $this->posChar));
            $this->posChar += $step;
            $this->posByte += $step;
            return $distance - $step;
        } elseif ($distance < 0) {
            $distance = abs($distance);
            $step = min($distance, max(0, $this->posChar));
            $this->posChar -= $step;
            $this->posByte -= $step;
            return $distance - $step;
        } else {
            return 0;
        }
    }

    /** Returns the encoding of $codePoint as a byte string
     *
     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
     *
     * Only ASCII and the range U+F780 to U+F7FF can be encoded; the handling of any other code point is delegated to errEnc, which is passed the negation of $fatal
     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
        } elseif ($codePoint < 0x80) {
            return chr($codePoint);
        } elseif ($codePoint >= 0xF780 && $codePoint <= 0xF7FF) {
            // map U+F780-U+F7FF back onto bytes 0x80-0xFF
            return chr($codePoint - 0xF780 + 0x80);
        } else {
            return self::errEnc(!$fatal, $codePoint);
        }
    }

    /** @codeCoverageIgnore */
    protected function seekBack(int $distance): int {
        // stub: not used, as seek is implemented directly above
        return 0;
    }

    /** Calculates the length of the string in code points
     *
     * As each byte is one character, this is simply the length in bytes
     */
    public function lenChar(): int {
        return $this->lenByte;
    }

    /** Returns whether the character pointer is at the end of the string */
    public function eof(): bool {
        return $this->posChar >= $this->lenByte;
    }
}
@ -1,10 +1,14 @@ |
|||||
#! /bin/sh |
#! /bin/sh |
||||
base=`dirname "$0"` |
base=`dirname "$0"` |
||||
roboCommand="$1" |
roboCommand="$1" |
||||
|
if [ $# -eq 0 ]; then |
||||
shift |
"$base/vendor/bin/robo" |
||||
if [ "$1" == "clean" ]; then |
|
||||
"$base/vendor/bin/robo" "$roboCommand" $* |
|
||||
else |
else |
||||
"$base/vendor/bin/robo" "$roboCommand" -- $* |
shift |
||||
fi |
ulimit -n 2048 |
||||
|
if [ "$1" = "clean" ]; then |
||||
|
"$base/vendor/bin/robo" "$roboCommand" "$@" |
||||
|
else |
||||
|
"$base/vendor/bin/robo" "$roboCommand" -- "$@" |
||||
|
fi |
||||
|
fi |
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,279 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\ISO2022JP; |
||||
|
use MensBeam\Intl\Encoding\Coder; |
||||
|
use MensBeam\Intl\Encoding\EncoderException; |
||||
|
|
||||
|
class TestISO2022JP extends \MensBeam\Intl\Test\CoderDecoderTest { |
||||
|
protected $testedClass = ISO2022JP::class; |
||||
|
/* |
||||
|
Char 0 U+007A (1 byte) Offset 0 |
||||
|
Esc: Katakana (3 bytes) Offset 1 |
||||
|
Char 1 U+FF9C (1 byte) Offset 4 |
||||
|
Char 2 U+FF9F (1 byte) Offset 5 |
||||
|
Esc: Double-byte (3 bytes) Offset 6 |
||||
|
Char 3 U+79FB (2 bytes) Offset 9 |
||||
|
Char 4 U+67B8 (2 bytes) Offset 11 |
||||
|
Char 5 U+9B91 (2 bytes) Offset 13 |
||||
|
Esc: ASCII (3 bytes) Offset 15 |
||||
|
Char 6 U+007E (1 byte) Offset 18 |
||||
|
Esc: Roman (3 bytes) Offset 19 |
||||
|
End of string at char 7, offset 22 |
||||
|
*/ |
||||
|
protected $seekString = "7A 1B2849 5C 5F 1B2440 305C 5B4E 723A 1B2842 7E 1B284A"; |
||||
|
protected $seekCodes = [0x7A, 0xFF9C, 0xFF9F, 0x79FB, 0x67B8, 0x9B91, 0x7E]; |
||||
|
protected $seekOffsets = [0, 1, 5, 6, 11, 13, 15, 19]; |
||||
|
/* This string contains an invalid character sequence sandwiched between two null characters */ |
||||
|
protected $brokenChar = "00 FF 00"; |
||||
|
/* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */ |
||||
|
protected $spanString = "1B284A 41 5A 1B2849 5C 5F 1B2842 30 39"; |
||||
|
|
||||
|
/** Supplies encoder test cases
 *
 * Each case is a fatal-mode flag, a list of input code points, and either the expected output as hexadecimal bytes or an exception of the expected class and code
 */
public function provideCodePoints() {
    $unavailable = new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT);
    $invalid = new EncoderException("", Coder::E_INVALID_CODE_POINT);
    return [
        'U+0020 (HTML)'             => [false, [0x20],                   "20"],
        'U+0020 (fatal)'            => [true,  [0x20],                   "20"],
        'U+005C (HTML)'             => [false, [0x5C],                   "5C"],
        'U+005C (fatal)'            => [true,  [0x5C],                   "5C"],
        'U+007E (HTML)'             => [false, [0x7E],                   "7E"],
        'U+007E (fatal)'            => [true,  [0x7E],                   "7E"],
        'U+00A5 (HTML)'             => [false, [0xA5],                   "1B 28 4A 5C 1B 28 42"],
        'U+00A5 (fatal)'            => [true,  [0xA5],                   "1B 28 4A 5C 1B 28 42"],
        'U+203E (HTML)'             => [false, [0x203E],                 "1B 28 4A 7E 1B 28 42"],
        'U+203E (fatal)'            => [true,  [0x203E],                 "1B 28 4A 7E 1B 28 42"],
        'U+FF61 (HTML)'             => [false, [0xFF61],                 "1B 24 42 21 23 1B 28 42"],
        'U+FF61 (fatal)'            => [true,  [0xFF61],                 "1B 24 42 21 23 1B 28 42"],
        'U+FF9F (HTML)'             => [false, [0xFF9F],                 "1B 24 42 21 2C 1B 28 42"],
        'U+FF9F (fatal)'            => [true,  [0xFF9F],                 "1B 24 42 21 2C 1B 28 42"],
        'U+2212 (HTML)'             => [false, [0x2212],                 "1B 24 42 21 5D 1B 28 42"],
        'U+2212 (fatal)'            => [true,  [0x2212],                 "1B 24 42 21 5D 1B 28 42"],
        'U+2116 (HTML)'             => [false, [0x2116],                 "1B 24 42 2D 62 1B 28 42"],
        'U+2116 (fatal)'            => [true,  [0x2116],                 "1B 24 42 2D 62 1B 28 42"],
        'U+FFE2 (HTML)'             => [false, [0xFFE2],                 "1B 24 42 22 4C 1B 28 42"],
        'U+FFE2 (fatal)'            => [true,  [0xFFE2],                 "1B 24 42 22 4C 1B 28 42"],
        'U+00C6 (HTML)'             => [false, [0xC6],                   "26 23 31 39 38 3B"],
        'U+00C6 (fatal)'            => [true,  [0xC6],                   $unavailable],
        'U+FFFD (HTML)'             => [false, [0xFFFD],                 "26 23 36 35 35 33 33 3B"],
        'U+FFFD (fatal)'            => [true,  [0xFFFD],                 $unavailable],
        'Roman (HTML)'              => [false, [0xA5, 0x20, 0x203E],     "1B 28 4A 5C 20 7E 1B 28 42"],
        'Roman (fatal)'             => [true,  [0xA5, 0x20, 0x203E],     "1B 28 4A 5C 20 7E 1B 28 42"],
        'Roman to ASCII (HTML)'     => [false, [0xA5, 0x5C],             "1B 28 4A 5C 1B 28 42 5C"],
        'Roman to ASCII (fatal)'    => [true,  [0xA5, 0x5C],             "1B 28 4A 5C 1B 28 42 5C"],
        'Roman to error (HTML)'     => [false, [0xA5, 0x80],             "1B 28 4A 5C 26 23 31 32 38 3B 1B 28 42"],
        'Roman to error (fatal)'    => [true,  [0xA5, 0x80],             $unavailable],
        'JIS (HTML)'                => [false, [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
        'JIS (fatal)'               => [true,  [0x2116, 0xFFE2, 0x2212], "1B 24 42 2D 62 22 4C 21 5D 1B 28 42"],
        'JIS to Roman (HTML)'       => [false, [0x2116, 0xA5],           "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
        'JIS to Roman (fatal)'      => [true,  [0x2116, 0xA5],           "1B 24 42 2D 62 1B 28 4A 5C 1B 28 42"],
        'JIS to ASCII 1 (HTML)'     => [false, [0x2116, 0x20],           "1B 24 42 2D 62 1B 28 42 20"],
        'JIS to ASCII 1 (fatal)'    => [true,  [0x2116, 0x20],           "1B 24 42 2D 62 1B 28 42 20"],
        'JIS to ASCII 2 (HTML)'     => [false, [0x2116, 0x5C],           "1B 24 42 2D 62 1B 28 42 5C"],
        'JIS to ASCII 2 (fatal)'    => [true,  [0x2116, 0x5C],           "1B 24 42 2D 62 1B 28 42 5C"],
        'JIS to error 1 (HTML)'     => [false, [0x2116, 0x80],           "1B 24 42 2D 62 1B 28 42 26 23 31 32 38 3B"],
        'JIS to error 1 (fatal)'    => [true,  [0x2116, 0x80],           $unavailable],
        'JIS to error 2 (HTML)'     => [false, [0x2116, 0x1B],           "1B 24 42 2D 62 1B 28 42 26 23 36 35 35 33 33 3B"],
        'JIS to error 2 (fatal)'    => [true,  [0x2116, 0x1B],           $unavailable],
        'Escape characters (HTML)'  => [false, [0x1B, 0xE, 0xF],         "26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B 26 23 36 35 35 33 33 3B"],
        'Escape characters (fatal)' => [true,  [0x1B, 0xE, 0xF],         $unavailable],
        '-1 (HTML)'                 => [false, [-1],                     $invalid],
        '-1 (fatal)'                => [true,  [-1],                     $invalid],
        '0x110000 (HTML)'           => [false, [0x110000],               $invalid],
        '0x110000 (fatal)'          => [true,  [0x110000],               $invalid],
    ];
}
||||
|
|
||||
|
/** Supplies decoder test cases
 *
 * Each case is an input string written as hexadecimal bytes, and the list of code points it is expected to decode to
 */
public function provideStrings() {
    return [
        'empty string'        => ["", []],
        'Implied ASCII mode'  => ["00 30 5C 7E 21 5F", [0x00, 0x30, 0x5C, 0x7E, 0x21, 0x5F]],
        'Explicit ASCII mode' => ["1B2842 00 30 5C 7E 21 5F", [0x00, 0x30, 0x5C, 0x7E, 0x21, 0x5F]],
        'Roman mode'          => ["1B284A 00 30 5C 7E 21 5F", [0x00, 0x30, 0xA5, 0x203E, 0x21, 0x5F]],
        'Katakana mode'       => ["1B2849 00 30 5C 7E 21 5F", [0xFFFD, 0xFF70, 0xFF9C, 0xFFFD, 0xFF61, 0xFF9F]],
        'Double-byte mode 1'  => ["1B2440 00 305C 7E21 5F", [0xFFFD, 0x79FB, 0xFFFD, 0xFFFD]],
        'Double-byte mode 2'  => ["1B2442 00 305C 7E21 5F", [0xFFFD, 0x79FB, 0xFFFD, 0xFFFD]],
        'Multiple modes'      => ["5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", [0x5C, 0xFF61, 0x79FB, 0xA5, 0x5C]],
        'Double escape'       => ["1B2849 1B2842 5C", [0xFFFD, 0x5C]],
        'Triple escape'       => ["1B2849 1B2842 1B284A 5C", [0xFFFD, 0xFFFD, 0xA5]],
        'Trailing escape'     => ["20 1B284A 30 33 1B2849", [0x20, 0x30, 0x33]],
        'Truncated escape 1'  => ["1B", [0xFFFD]],
        'Truncated escape 2'  => ["1B28", [0xFFFD, 0x28]],
        'Truncated escape 3'  => ["1B2820", [0xFFFD, 0x28, 0x20]],
        'Truncated escape 4'  => ["1B2020", [0xFFFD, 0x20, 0x20]],
        'Invalid escape 1'    => ["1B2840", [0xFFFD, 0x28, 0x40]],
        'Invalid escape 2'    => ["1B244A", [0xFFFD, 0x24, 0x4A]],
        'Invalid bytes'       => ["80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", [0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD]],
    ];
}
||||
|
|
||||
|
/**
 * Runs the parent class's encoding test against each code point case
 *
 * @dataProvider provideCodePoints
 * @covers MensBeam\Intl\Encoding\Encoder
 */
public function testEncodeCodePoints(bool $fatal, $input, $exp) {
    return parent::testEncodeCodePoints($fatal, $input, $exp);
}
||||
|
|
||||
|
/**
 * Encodes a series of code points with the static ISO2022JP::encode() method
 * and compares the hexadecimal form of the concatenated output with the
 * expectation.
 *
 * When $exp is a Throwable, an exception of the same class and code is
 * expected to be raised instead.
 *
 * @dataProvider provideCodePoints
 * @covers MensBeam\Intl\Encoding\ISO2022JP::encode
 *
 * @param bool $fatal Forwarded to encode() as its second argument
 * @param array $input The code points to encode, in order
 * @param string|\Throwable $exp The expected output as hexadecimal digits (spaces and letter case are ignored), or the expected exception
 */
public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
    $out = "";
    // State variable handed to every encode() call. It used to be created
    // implicitly by the first call; initializing it to null passes the callee
    // the same value while making the variable's origin explicit.
    $mode = null;
    if ($exp instanceof \Throwable) {
        $this->expectException(get_class($exp));
        $this->expectExceptionCode($exp->getCode());
    } else {
        // normalize the expectation to what bin2hex() produces
        $exp = strtolower(str_replace(" ", "", $exp));
    }
    foreach ($input as $char) {
        $out .= ISO2022JP::encode($char, $fatal, $mode);
    }
    // a final call with a null code point
    // NOTE(review): presumably this flushes/resets the encoder state; confirm against ISO2022JP::encode
    $out .= ISO2022JP::encode(null, $fatal, $mode);
    $this->assertSame($exp, bin2hex($out));
}
||||
|
|
||||
|
/**
 * Runs the inherited multi-character code point decoding test with this
 * class's string samples; arguments are forwarded unchanged.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
 * @covers MensBeam\Intl\Encoding\ISO2022JP::nextCode
 * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
 */
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
    return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
}
||||
|
|
||||
|
/**
 * Runs the inherited multi-character string decoding test with this class's
 * string samples; arguments are forwarded unchanged.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\ISO2022JP::__construct
 * @covers MensBeam\Intl\Encoding\ISO2022JP::nextChar
 * @covers MensBeam\Intl\Encoding\ISO2022JP::modeSet
 */
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
    return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
}
||||
|
|
||||
|
/**
 * Runs the inherited step-back test with this class's string samples;
 * arguments are forwarded unchanged.
 *
 * The method name (including its capitalization) must match the parent
 * method it overrides.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
 */
public function testSTepBackThroughAString(string $input, array $exp) {
    return parent::testSTepBackThroughAString($input, $exp);
}
||||
|
|
||||
|
/**
 * Runs the inherited seek test; the override carries the coverage
 * annotations for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::seek
 * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
 * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
 * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
 */
public function testSeekThroughAString() {
    return parent::testSeekThroughAString();
}
||||
|
|
||||
|
/**
 * Runs the inherited end-of-string traversal test; the override carries the
 * coverage annotations for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::posChar
 * @covers MensBeam\Intl\Encoding\ISO2022JP::posByte
 * @covers MensBeam\Intl\Encoding\ISO2022JP::eof
 */
public function testTraversePastTheEndOfAString() {
    return parent::testTraversePastTheEndOfAString();
}
||||
|
|
||||
|
/**
 * Runs the inherited character peeking test; the override carries the
 * coverage annotations for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::peekChar
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
 */
public function testPeekAtCharacters() {
    return parent::testPeekAtCharacters();
}
||||
|
|
||||
|
/**
 * Runs the inherited code point peeking test; the override carries the
 * coverage annotations for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::peekCode
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
 */
public function testPeekAtCodePoints() {
    return parent::testPeekAtCodePoints();
}
||||
|
|
||||
|
/**
 * Runs the inherited string length test with this class's string samples;
 * arguments are forwarded unchanged.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\ISO2022JP::lenChar
 * @covers MensBeam\Intl\Encoding\ISO2022JP::lenByte
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateSave
 * @covers MensBeam\Intl\Encoding\ISO2022JP::stateApply
 */
public function testGetStringLength(string $input, array $points) {
    return parent::testGetStringLength($input, $points);
}
||||
|
|
||||
|
/**
 * Runs the inherited replacement mode test; the override carries the
 * coverage annotation for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::errDec
 */
public function testReplacementModes() {
    return parent::testReplacementModes();
}
||||
|
|
||||
|
/**
 * Runs the inherited iteration test with this class's string samples;
 * arguments are forwarded unchanged.
 *
 * @dataProvider provideStrings
 * @covers MensBeam\Intl\Encoding\ISO2022JP::rewind
 * @covers MensBeam\Intl\Encoding\ISO2022JP::chars
 * @covers MensBeam\Intl\Encoding\ISO2022JP::codes
 */
public function testIterateThroughAString(string $input, array $exp) {
    return parent::testIterateThroughAString($input, $exp);
}
||||
|
|
||||
|
/**
 * Runs the inherited surrogate-permitting iteration test with this class's
 * string samples; arguments are forwarded unchanged. No coverage is
 * attributed to this test.
 *
 * @dataProvider provideStrings
 * @coversNothing
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
    return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
||||
|
|
||||
|
/**
 * Runs the inherited backwards-seek test over random data; the override
 * carries the coverage annotation for this encoding.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::seekBack
 */
public function testSeekBackOverRandomData() {
    return parent::testSeekBackOverRandomData();
}
||||
|
|
||||
|
/**
 * Runs the inherited ASCII span extraction test; the override carries the
 * coverage annotation for this encoding. Nothing is returned.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpan
 */
public function testExtractAsciiSpans() {
    parent::testExtractAsciiSpans();
}
||||
|
|
||||
|
/**
 * Runs the inherited negative ASCII span extraction test; the override
 * carries the coverage annotation for this encoding. Nothing is returned.
 *
 * @covers MensBeam\Intl\Encoding\ISO2022JP::asciiSpanNot
 */
public function testExtractNegativeAsciiSpans() {
    parent::testExtractNegativeAsciiSpans();
}
||||
|
|
||||
|
/**
 * Decodes each listed byte sequence in isolation and checks that it yields
 * exactly one code point, the expected one, followed by end of input.
 *
 * @group optional
 */
public function testPedanticallyDecodeSingleCharactersAsCodePoint() {
    // Each entry is a pair of parallel lists: hexadecimal inputs at index 0
    // and the code point expected for each input at index 1. No series are
    // defined for this encoding at present.
    $series = [
    ];
    // prevent a risky (assertion-less) test while the series list is empty
    $this->assertTrue(true);
    // the decoder class does not change between iterations, so look it up once
    $class = $this->testedClass;
    foreach ($series as $test) {
        foreach ($test[0] as $a => $input) {
            $char = hex2bin($input);
            $exp = $test[1][$a];
            $s = new $class($char);
            $this->assertSame($exp, $s->nextCode(), "Sequence $input did not decode to $exp.");
            $this->assertFalse($s->nextCode(), "Sequence $input did not end after one character");
        }
    }
}
||||
|
} |
@ -0,0 +1,221 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\Replacement; |
||||
|
use MensBeam\Intl\Encoding\DecoderException; |
||||
|
|
||||
|
/**
 * Tests for the Replacement decoder.
 *
 * Inherited tests are re-declared so that data providers and coverage
 * annotations can be attached; several are replaced outright with
 * assertions specific to this decoder.
 */
class TestReplacement extends \MensBeam\Intl\Test\DecoderTest {
    protected $testedClass = Replacement::class;

    /**
     * Supplies decoding samples: input bytes as space-separated hexadecimal
     * groups, followed by the expected code points.
     *
     * Every non-empty sample is expected to decode to a single U+FFFD.
     */
    public function provideStrings() {
        $samples = [];
        // control samples
        $samples['empty string']       = ["", []];
        $samples['Arbitrary string 1'] = ["20", [0xFFFD]];
        $samples['Arbitrary string 2'] = ["64 8B 20 00 FF A5", [0xFFFD]];
        return $samples;
    }

    /**
     * Runs the inherited multi-character code point decoding test;
     * arguments are forwarded unchanged.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * Runs the inherited multi-character string decoding test; arguments
     * are forwarded unchanged.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::__construct
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * Runs the inherited step-back test; arguments are forwarded unchanged.
     * The method name must match the parent method it overrides.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Replaces the inherited seek test with a trivially passing assertion;
     * the parent implementation is deliberately not invoked.
     *
     * @coversNothing
     */
    public function testSeekThroughAString() {
        $this->assertTrue(true);
    }

    /**
     * Checks the reported positions and end-of-file state before seeking,
     * after seeking forward once, and after seeking forward again.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     * @covers MensBeam\Intl\Encoding\Replacement::seek
     * @covers MensBeam\Intl\Encoding\Replacement::eof
     */
    public function testTraversePastTheEndOfAString() {
        $decoder = new Replacement("a");
        // initial state
        $this->assertFalse($decoder->eof());
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());
        // one step forward reaches the end
        $decoder->seek(1);
        $this->assertTrue($decoder->eof());
        $this->assertSame(1, $decoder->posChar());
        $this->assertSame(1, $decoder->posByte());
        // a further step leaves the positions where they were
        $decoder->seek(1);
        $this->assertTrue($decoder->eof());
        $this->assertSame(1, $decoder->posChar());
        $this->assertSame(1, $decoder->posByte());
    }

    /**
     * Checks that peeking at characters returns the expected strings and
     * leaves the reported positions at zero.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCharacters() {
        $decoder = new Replacement("A");
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());
        // a large count still yields a single replacement character
        $this->assertSame("\u{FFFD}", $decoder->peekChar(2112));
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());
        // zero and negative counts yield nothing
        $this->assertSame("", $decoder->peekChar(0));
        $this->assertSame("", $decoder->peekChar(-2112));
    }

    /**
     * Checks that peeking at code points returns the expected lists and
     * leaves the reported positions at zero.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testPeekAtCodePoints() {
        $decoder = new Replacement("A");
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());
        // a large count still yields a single replacement code point
        $this->assertSame([0xFFFD], $decoder->peekCode(2112));
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());
        // zero and negative counts yield nothing
        $this->assertSame([], $decoder->peekCode(0));
        $this->assertSame([], $decoder->peekCode(-2112));
    }

    /**
     * Runs the inherited string length test; arguments are forwarded
     * unchanged.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::lenChar
     * @covers MensBeam\Intl\Encoding\Replacement::lenByte
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Checks that a decoder constructed with a true second argument raises
     * a DecoderException from each peek and read operation, and checks the
     * error and position counters after each attempt.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::nextChar
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     * @covers MensBeam\Intl\Encoding\Replacement::peekChar
     * @covers MensBeam\Intl\Encoding\Replacement::peekCode
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::posChar
     * @covers MensBeam\Intl\Encoding\Replacement::posByte
     */
    public function testReplacementModes() {
        $decoder = new Replacement("VVVVVV", true);
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());

        // peeking at a code point: counters stay at zero
        try {
            $result = $decoder->peekCode();
        } catch (\Exception $caught) {
            $result = $caught;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $result);
        }
        $this->assertSame(0, $decoder->posErr);
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());

        // reading a code point: one error, one character, all six bytes
        try {
            $result = $decoder->nextCode();
        } catch (\Exception $caught) {
            $result = $caught;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $result);
        }
        $this->assertSame(1, $decoder->posErr);
        $this->assertSame(1, $decoder->posChar());
        $this->assertSame(6, $decoder->posByte());

        $decoder->rewind();
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());

        // peeking at a character after rewinding: positions stay at zero
        try {
            $result = $decoder->peekChar();
        } catch (\Exception $caught) {
            $result = $caught;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $result);
        }
        $this->assertSame(1, $decoder->posErr);
        $this->assertSame(0, $decoder->posChar());
        $this->assertSame(0, $decoder->posByte());

        // reading a character: one character, all six bytes
        try {
            $result = $decoder->nextChar();
        } catch (\Exception $caught) {
            $result = $caught;
        } finally {
            $this->assertInstanceOf(DecoderException::class, $result);
        }
        $this->assertSame(1, $decoder->posErr);
        $this->assertSame(1, $decoder->posChar());
        $this->assertSame(6, $decoder->posByte());
    }

    /**
     * Runs the inherited iteration test; arguments are forwarded unchanged.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::rewind
     * @covers MensBeam\Intl\Encoding\Replacement::chars
     * @covers MensBeam\Intl\Encoding\Replacement::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * Runs the inherited surrogate-permitting iteration test; arguments are
     * forwarded unchanged.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\Replacement::nextCode
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * Runs the inherited backwards-seek test over random data without
     * attributing coverage.
     *
     * @coversNothing
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * Checks that an ASCII span request returns an empty string and that a
     * single subsequent read reaches the end of the input.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpan
     */
    public function testExtractAsciiSpans() {
        $decoder = new Replacement("VVVVVV");
        $this->assertSame("", $decoder->asciiSpan($this->allBytes()));
        $decoder->nextChar();
        $this->assertTrue($decoder->eof());
    }

    /**
     * Checks that a negative ASCII span request returns an empty string and
     * that a single subsequent read reaches the end of the input.
     *
     * @covers MensBeam\Intl\Encoding\Replacement::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        $decoder = new Replacement("VVVVVV");
        $this->assertSame("", $decoder->asciiSpanNot(""));
        $decoder->nextChar();
        $this->assertTrue($decoder->eof());
    }
}
File diff suppressed because one or more lines are too long
@ -0,0 +1,49 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
|
||||
|
/**
 * Tests for the big-endian UTF-16 decoder.
 *
 * All test methods are inherited from the little-endian test class; only
 * the tested class and the fixtures differ, with the string samples derived
 * from the parent's by swapping bytes.
 */
class TestUTF16BE extends TestUTF16LE {
    protected $testedClass = UTF16BE::class;
    /*
        Byte Order Mark  (2 bytes) Offset 0
        Char 0  U+007A   (2 bytes) Offset 2
        Char 1  U+00A2   (2 bytes) Offset 4
        Char 2  U+6C34   (2 bytes) Offset 6
        Char 3  U+1D11E  (4 bytes) Offset 8
        Char 4  U+F8FF   (2 bytes) Offset 12
        Char 5  U+10FFFD (4 bytes) Offset 14
        Char 6  U+FFFE   (2 bytes) Offset 18
        End of string at char 7, offset 20
    */
    protected $seekString = "FEFF 007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20];
    /* This string contains an invalid character sequence sandwiched between two null characters */
    protected $brokenChar = "0000 DC00 0000";
    /* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
    protected $spanString = "0041 005A 6C34 D834DD1E 0030 0039";
    protected $lowerA = "\x00a";

    /**
     * Yields the parent's string samples converted to big-endian byte order.
     *
     * Every word of exactly four hexadecimal digits has its two bytes
     * swapped; words of any other length are passed through untouched.
     * Each yielded set always has three members (input, expected code
     * points, alternate code points), the last being null when the parent
     * sample supplies only two.
     */
    public function provideStrings() {
        foreach (parent::provideStrings() as $label => $case) {
            // pad two-member samples so the destructuring below always has a third value
            if (count($case) === 2) {
                $case[] = null;
            }
            list($hex, $strict, $relaxed) = $case;
            $swapped = array_map(function($word) {
                // "AABB" becomes "BBAA"
                return (strlen($word) === 4) ? $word[2].$word[3].$word[0].$word[1] : $word;
            }, explode(" ", $hex));
            yield $label => [implode(" ", $swapped), $strict, $relaxed];
        }
    }
}
@ -0,0 +1,170 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
|
||||
|
/**
 * Tests for the little-endian UTF-16 decoder.
 *
 * Inherited tests are re-declared so that data providers and coverage
 * annotations can be attached; each one forwards to the parent
 * implementation unchanged.
 */
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
    protected $testedClass = UTF16LE::class;
    /*
        Byte Order Mark  (2 bytes) Offset 0
        Char 0  U+007A   (2 bytes) Offset 2
        Char 1  U+00A2   (2 bytes) Offset 4
        Char 2  U+6C34   (2 bytes) Offset 6
        Char 3  U+1D11E  (4 bytes) Offset 8
        Char 4  U+F8FF   (2 bytes) Offset 12
        Char 5  U+10FFFD (4 bytes) Offset 14
        Char 6  U+FFFE   (2 bytes) Offset 18
        End of string at char 7, offset 20
    */
    protected $seekString = "FFFE 7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20];
    /* This string contains an invalid character sequence sandwiched between two null characters */
    protected $brokenChar = "0000 00DC 0000";
    /* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
    protected $spanString = "4100 5A00 346C 34D81EDD 3000 3900";
    protected $lowerA = "a\x00";

    /**
     * Runs the inherited multi-character code point decoding test.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * Runs the inherited multi-character string decoding test.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * Runs the inherited step-back test. The method name must match the
     * parent method it overrides.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::seekBack
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Runs the inherited seek test.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::seek
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * Runs the inherited end-of-string traversal test.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     * @covers MensBeam\Intl\Encoding\UTF16::eof
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * Runs the inherited character peeking test.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekChar
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * Runs the inherited code point peeking test.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekCode
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * Runs the inherited string length test.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::lenChar
     * @covers MensBeam\Intl\Encoding\UTF16::lenByte
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Runs the inherited replacement mode test.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::errDec
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * Runs the inherited iteration test.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     * @covers MensBeam\Intl\Encoding\UTF16::chars
     * @covers MensBeam\Intl\Encoding\UTF16::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * Runs the inherited surrogate-permitting iteration test.
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * Runs the inherited backwards-seek test over random data.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::seekBack
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * Runs the inherited ASCII span extraction test. Nothing is returned.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::asciiSpan
     */
    public function testExtractAsciiSpans() {
        parent::testExtractAsciiSpans();
    }

    /**
     * Runs the inherited negative ASCII span extraction test. Nothing is
     * returned.
     *
     * @covers MensBeam\Intl\Encoding\UTF16::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        parent::testExtractNegativeAsciiSpans();
    }

    /**
     * Supplies little-endian UTF-16 decoding samples.
     *
     * Each sample holds the input bytes as space-separated hexadecimal
     * words and the expected code points, with 65533 (U+FFFD) marking an
     * expected replacement character. Samples involving unpaired surrogates
     * carry a third member listing alternate expected code points in which
     * the surrogate value itself appears in place of the replacement
     * character.
     */
    public function provideStrings() {
        $samples = [];
        // control samples
        $samples['empty string'] = ["", []];
        $samples['sanity check'] = ["6100 6200 6300 3100 3200 3300", [97, 98, 99, 49, 50, 51]];
        $samples['mixed sample'] = ["7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", [122, 162, 27700, 119070, 63743, 1114109, 65534]];
        // unexpected EOF
        $samples['EOF in BMP character']     = ["0000 FF", [0, 65533]];
        $samples['EOF after lead surrogate'] = ["0000 34D8", [0, 65533]];
        $samples['EOF in trail surrogate']   = ["0000 34D8 1E", [0, 65533]];
        // invalid UTF-16 surrogates
        $samples['lead surrogate without trail'] = ["34D8 0000", [65533, 0], [0xD834, 0]];
        $samples['trail surrogate without lead'] = ["1EDD 0000", [65533, 0], [0xDD1E, 0]];
        $samples['double lead surrogate']        = ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]];
        $samples['double trail surrogate']       = ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]];
        return $samples;
    }
}
@ -1,409 +1,244 @@ |
|||||
<?php |
<?php |
||||
/** @license MIT |
/** @license MIT |
||||
* Copyright 2017 J. King, Dustin Wilson et al. |
* Copyright 2018 J. King et al. |
||||
* See LICENSE and AUTHORS files for details */ |
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
declare(strict_types=1); |
declare(strict_types=1); |
||||
namespace MensBeam\Intl\TestCase\Encoding; |
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
use MensBeam\Intl\Encoding\UTF8; |
use MensBeam\Intl\Encoding\UTF8; |
||||
|
use MensBeam\Intl\Encoding\Coder; |
||||
use MensBeam\Intl\Encoding\EncoderException; |
use MensBeam\Intl\Encoding\EncoderException; |
||||
use MensBeam\Intl\Encoding\DecoderException; |
|
||||
|
|
||||
class TestUTF8 extends \PHPUnit\Framework\TestCase { |
class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { |
||||
|
protected $testedClass = UTF8::class; |
||||
|
/* |
||||
|
Byte Order Mark (3 bytes) Offset 0 |
||||
|
Char 0 U+007A (1 byte) Offset 3 |
||||
|
Char 1 U+00A2 (2 bytes) Offset 4 |
||||
|
Char 2 U+6C34 (3 bytes) Offset 6 |
||||
|
Char 3 U+1D11E (4 bytes) Offset 9 |
||||
|
Char 4 U+F8FF (3 bytes) Offset 13 |
||||
|
Char 5 U+10FFFD (4 bytes) Offset 16 |
||||
|
Char 6 U+FFFE (3 bytes) Offset 20 |
||||
|
End of string at char 7, offset 23 |
||||
|
*/ |
||||
|
protected $seekString = "EFBBBF 7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE"; |
||||
|
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]; |
||||
|
protected $seekOffsets = [3, 4, 6, 9, 13, 16, 20, 23]; |
||||
|
/* This string contains an invalid character sequence sandwiched between two null characters */ |
||||
|
protected $brokenChar = "00 FF 00"; |
||||
|
/* This string conatins the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */ |
||||
|
protected $spanString = "41 5A E6B0B4 F09D849E 30 39"; |
||||
|
|
||||
|
public function provideCodePoints() { |
||||
|
return [ |
||||
|
'U+007A (HTML)' => [false, 0x7A, "7A"], |
||||
|
'U+007A (fatal)' => [true, 0x7A, "7A"], |
||||
|
'U+00A2 (HTML)' => [false, 0xA2, "C2 A2"], |
||||
|
'U+00A2 (fatal)' => [true, 0xA2, "C2 A2"], |
||||
|
'U+6C34 (HTML)' => [false, 0x6C34, "E6 B0 B4"], |
||||
|
'U+6C34 (fatal)' => [true, 0x6C34, "E6 B0 B4"], |
||||
|
'U+1D11E (HTML)' => [false, 0x1D11E, "F0 9D 84 9E"], |
||||
|
'U+1D11E (fatal)' => [true, 0x1D11E, "F0 9D 84 9E"], |
||||
|
'U+F8FF (HTML)' => [false, 0xF8FF, "EF A3 BF"], |
||||
|
'U+F8FF (fatal)' => [true, 0xF8FF, "EF A3 BF"], |
||||
|
'U+10FFFD (HTML)' => [false, 0x10FFFD, "F4 8F BF BD"], |
||||
|
'U+10FFFD (fatal)' => [true, 0x10FFFD, "F4 8F BF BD"], |
||||
|
'U+FFFE (HTML)' => [false, 0xFFFE, "EF BF BE"], |
||||
|
'U+FFFE (fatal)' => [true, 0xFFFE, "EF BF BE"], |
||||
|
'-1 (HTML)' => [false, -1, new EncoderException("", Coder::E_INVALID_CODE_POINT)], |
||||
|
'-1 (fatal)' => [true, -1, new EncoderException("", Coder::E_INVALID_CODE_POINT)], |
||||
|
'0x110000 (HTML)' => [false, 0x110000, new EncoderException("", Coder::E_INVALID_CODE_POINT)], |
||||
|
'0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Coder::E_INVALID_CODE_POINT)], |
||||
|
]; |
||||
|
} |
||||
|
|
||||
|
public function provideStrings() { |
||||
|
return [ |
||||
|
// control samples |
||||
|
'empty string' => ["", []], |
||||
|
'sanity check' => ["61 62 63 31 32 33", [97, 98, 99, 49, 50, 51]], |
||||
|
'multibyte control' => ["E5 8F A4 E6 B1 A0 E3 82 84 E8 9B 99 E9 A3 9B E3 81 B3 E8 BE BC E3 82 80 E6 B0 B4 E3 81 AE E9 9F B3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]], |
||||
|
'mixed sample' => ["7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE", [122, 162, 27700, 119070, 63743, 1114109, 65534]], |
||||
|
// various invalid sequences |
||||
|
'invalid code' => ["FF", [65533]], |
||||
|
'ends early' => ["C0", [65533]], |
||||
|
'ends early 2' => ["E0", [65533]], |
||||
|
'invalid trail' => ["C0 00", [65533, 0]], |
||||
|
'invalid trail 2' => ["C0 C0", [65533, 65533]], |
||||
|
'invalid trail 3' => ["E0 00", [65533, 0]], |
||||
|
'invalid trail 4' => ["E0 C0", [65533, 65533]], |
||||
|
'invalid trail 5' => ["E0 80 00", [65533, 65533, 0]], |
||||
|
'invalid trail 6' => ["E0 80 C0", [65533, 65533, 65533]], |
||||
|
'> 0x10FFFF' => ["FC 80 80 80 80 80", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'obsolete lead byte' => ["FE 80 80 80 80 80", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+0000 - 2 bytes' => ["C0 80", [65533, 65533]], |
||||
|
'overlong U+0000 - 3 bytes' => ["E0 80 80", [65533, 65533, 65533]], |
||||
|
'overlong U+0000 - 4 bytes' => ["F0 80 80 80", [65533, 65533, 65533, 65533]], |
||||
|
'overlong U+0000 - 5 bytes' => ["F8 80 80 80 80", [65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+0000 - 6 bytes' => ["FC 80 80 80 80 80", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+007F - 2 bytes' => ["C1 BF", [65533, 65533]], |
||||
|
'overlong U+007F - 3 bytes' => ["E0 81 BF", [65533, 65533, 65533]], |
||||
|
'overlong U+007F - 4 bytes' => ["F0 80 81 BF", [65533, 65533, 65533, 65533]], |
||||
|
'overlong U+007F - 5 bytes' => ["F8 80 80 81 BF", [65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+007F - 6 bytes' => ["FC 80 80 80 81 BF", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+07FF - 3 bytes' => ["E0 9F BF", [65533, 65533, 65533]], |
||||
|
'overlong U+07FF - 4 bytes' => ["F0 80 9F BF", [65533, 65533, 65533, 65533]], |
||||
|
'overlong U+07FF - 5 bytes' => ["F8 80 80 9F BF", [65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+07FF - 6 bytes' => ["FC 80 80 80 9F BF", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+FFFF - 4 bytes' => ["F0 8F BF BF", [65533, 65533, 65533, 65533]], |
||||
|
'overlong U+FFFF - 5 bytes' => ["F8 80 8F BF BF", [65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+FFFF - 6 bytes' => ["FC 80 80 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]], |
||||
|
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]], |
||||
|
// UTF-16 surrogates |
||||
|
// surrogates have alternate outputs for when surrogates are being allowed |
||||
|
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]], |
||||
|
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]], |
||||
|
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]], |
||||
|
// self-sync edge cases |
||||
|
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]], |
||||
|
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]], |
||||
|
]; |
||||
|
} |
||||
|
|
||||
/** |
/** |
||||
* @dataProvider provideCodePoints |
* @dataProvider provideCodePoints |
||||
|
* @covers MensBeam\Intl\Encoding\Encoder |
||||
* @covers MensBeam\Intl\Encoding\UTF8::encode |
* @covers MensBeam\Intl\Encoding\UTF8::encode |
||||
*/ |
* @covers MensBeam\Intl\Encoding\UTF8::errEnc |
||||
public function testEncodeCodePoints(int $input, $exp) { |
*/ |
||||
if ($exp instanceof \Throwable) { |
public function testEncodeCodePoints(bool $fatal, $input, $exp) { |
||||
$this->expectException(get_class($exp)); |
return parent::testEncodeCodePoints($fatal, $input, $exp); |
||||
$this->expectExceptionCode($exp->getCode()); |
} |
||||
} |
|
||||
$out = UTF8::encode($input); |
/** |
||||
$this->assertSame(bin2hex($exp), bin2hex($out)); |
* @dataProvider provideCodePoints |
||||
|
* @covers MensBeam\Intl\Encoding\UTF8::encode |
||||
|
* @covers MensBeam\Intl\Encoding\UTF8::errEnc |
||||
|
*/ |
||||
|
public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) { |
||||
|
return parent::testEncodeCodePointsStatically($fatal, $input, $exp); |
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @dataProvider provideStrings |
* @dataProvider provideStrings |
||||
* @covers MensBeam\Intl\Encoding\UTF8::__construct |
* @covers MensBeam\Intl\Encoding\UTF8::__construct |
||||
* @covers MensBeam\Intl\Encoding\UTF8::nextCode |
* @covers MensBeam\Intl\Encoding\UTF8::nextCode |
||||
*/ |
*/ |
||||
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { |
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { |
||||
$s = new UTF8($input); |
return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp); |
||||
$out = []; |
|
||||
while (($p = $s->nextCode()) !== false) { |
|
||||
$out[] = $p; |
|
||||
} |
|
||||
$this->assertEquals($exp, $out); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @dataProvider provideStrings |
* @dataProvider provideStrings |
||||
* @covers MensBeam\Intl\Encoding\UTF8::__construct |
* @covers MensBeam\Intl\Encoding\UTF8::__construct |
||||
* @covers MensBeam\Intl\Encoding\UTF8::nextChar |
* @covers MensBeam\Intl\Encoding\UTF8::nextChar |
||||
*/ |
*/ |
||||
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { |
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { |
||||
$out = []; |
return parent::testDecodeMultipleCharactersAsStrings($input, $exp); |
||||
$exp = array_map(function($v) { |
|
||||
return \IntlChar::chr($v); |
|
||||
}, $exp); |
|
||||
$s = new UTF8($input); |
|
||||
while (($c = $s->nextChar()) !== "") { |
|
||||
$out[] = $c; |
|
||||
} |
|
||||
$this->assertEquals($exp, $out); |
|
||||
} |
|
||||
|
|
||||
/** |
|
||||
* @dataProvider provideStrings |
|
||||
* @covers MensBeam\Intl\Encoding\UTF8::rewind |
|
||||
* @covers MensBeam\Intl\Encoding\UTF8::chars |
|
||||
* @covers MensBeam\Intl\Encoding\UTF8::codes |
|
||||
*/ |
|
||||
public function testIterateThroughAString(string $input, array $exp) { |
|
||||
$out = []; |
|
||||
$s = new UTF8($input); |
|
||||
$a = 0; |
|
||||
$this->assertTrue(true); // prevent risky test of empty string |
|
||||
foreach ($s->codes() as $index => $p) { |
|
||||
$this->assertSame($a, $index, "Character key at index $a reported incorrectly"); |
|
||||
$this->assertSame($exp[$a], $p, "Character at index $a decoded incorrectly"); |
|
||||
$a++; |
|
||||
} |
|
||||
$a = 0; |
|
||||
foreach ($s->codes() as $p) { |
|
||||
$a++; |
|
||||
} |
|
||||
$this->assertSame(0, $a); |
|
||||
$s->rewind(); |
|
||||
foreach ($s->codes() as $p) { |
|
||||
$a++; |
|
||||
} |
|
||||
$this->assertSame(sizeof($exp), $a); |
|
||||
|
|
||||
$exp = array_map(function($v) { |
|
||||
return \IntlChar::chr($v); |
|
||||
}, $exp); |
|
||||
|
|
||||
foreach ($s->chars() as $index => $p) { |
|
||||
$this->assertSame($a, $index, "Character key at index $a reported incorrectly"); |
|
||||
$this->assertSame(bin2hex($exp[$a]), bin2hex($p), "Character at index $a decoded incorrectly"); |
|
||||
$a++; |
|
||||
} |
|
||||
$a = 0; |
|
||||
foreach ($s->chars() as $p) { |
|
||||
$a++; |
|
||||
} |
|
||||
$this->assertSame(0, $a); |
|
||||
$s->rewind(); |
|
||||
foreach ($s->chars() as $p) { |
|
||||
$a++; |
|
||||
} |
|
||||
$this->assertSame(sizeof($exp), $a); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @dataProvider provideStrings |
* @dataProvider provideStrings |
||||
* @covers MensBeam\Intl\Encoding\UTF8::sync |
* @covers MensBeam\Intl\Encoding\UTF8::seekBack |
||||
*/ |
*/ |
||||
public function testSTepBackThroughAString(string $input, array $points) { |
public function testSTepBackThroughAString(string $input, array $exp) { |
||||
$s = new UTF8($input); |
return parent::testSTepBackThroughAString($input, $exp); |
||||
$a = 0; |
|
||||
$this->assertTrue(true); // prevent risky test of empty string |
|
||||
while (($p1 = $s->nextCode()) !== false) { |
|
||||
$this->assertSame(0, $s->seek(-1)); |
|
||||
$p2 = $s->nextCode(); |
|
||||
$this->assertSame($p1, $p2, "Mismatch at character position $a"); |
|
||||
$this->assertSame(++$a, $s->posChar(), "Character position should be $a"); |
|
||||
} |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @covers MensBeam\Intl\Encoding\UTF8::seek |
* @covers MensBeam\Intl\Encoding\UTF8::seek |
||||
* @covers MensBeam\Intl\Encoding\UTF8::posChar |
* @covers MensBeam\Intl\Encoding\UTF8::posChar |
||||
* @covers MensBeam\Intl\Encoding\UTF8::posByte |
* @covers MensBeam\Intl\Encoding\UTF8::posByte |
||||
*/ |
* @covers MensBeam\Intl\Encoding\UTF8::rewind |
||||
|
*/ |
||||
public function testSeekThroughAString() { |
public function testSeekThroughAString() { |
||||
/* |
return parent::testSeekThroughAString(); |
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
$input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE"; |
|
||||
$s = new UTF8($input); |
|
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(0, $s->seek(0)); |
|
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(1, $s->seek(-1)); |
|
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(0, $s->seek(1)); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
$this->assertSame(1, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(0, $s->seek(2)); |
|
||||
$this->assertSame(3, $s->posChar()); |
|
||||
$this->assertSame(6, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(0, $s->seek(4)); |
|
||||
$this->assertSame(7, $s->posChar()); |
|
||||
$this->assertSame(20, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(1, $s->seek(1)); |
|
||||
$this->assertSame(7, $s->posChar()); |
|
||||
$this->assertSame(20, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(0, $s->seek(-3)); |
|
||||
$this->assertSame(4, $s->posChar()); |
|
||||
$this->assertSame(10, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(6, $s->seek(-10)); |
|
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @covers MensBeam\Intl\Encoding\UTF8::posChar |
* @covers MensBeam\Intl\Encoding\UTF8::posChar |
||||
* @covers MensBeam\Intl\Encoding\UTF8::posByte |
* @covers MensBeam\Intl\Encoding\UTF8::posByte |
||||
*/ |
* @covers MensBeam\Intl\Encoding\UTF8::eof |
||||
|
*/ |
||||
public function testTraversePastTheEndOfAString() { |
public function testTraversePastTheEndOfAString() { |
||||
$s = new UTF8("a"); |
return parent::testTraversePastTheEndOfAString(); |
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame("a", $s->nextChar()); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
$this->assertSame(1, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame("", $s->nextChar()); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
$this->assertSame(1, $s->posByte()); |
|
||||
|
|
||||
$s = new UTF8("a"); |
|
||||
$this->assertSame(0, $s->posChar()); |
|
||||
$this->assertSame(0, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(ord("a"), $s->nextCode()); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
$this->assertSame(1, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(false, $s->nextCode()); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
$this->assertSame(1, $s->posByte()); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @covers MensBeam\Intl\Encoding\UTF8::peekChar |
* @covers MensBeam\Intl\Encoding\UTF8::peekChar |
||||
*/ |
* @covers MensBeam\Intl\Encoding\UTF8::stateSave |
||||
|
* @covers MensBeam\Intl\Encoding\UTF8::stateApply |
||||
|
*/ |
||||
public function testPeekAtCharacters() { |
public function testPeekAtCharacters() { |
||||
/* |
return parent::testPeekAtCharacters(); |
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
$input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE"; |
|
||||
$s = new UTF8($input); |
|
||||
$s->seek(2); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(bin2hex("\u{6C34}"), bin2hex($s->peekChar())); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(bin2hex("\u{6C34}\u{1D11E}"), bin2hex($s->peekChar(2))); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$s->seek(3); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame(bin2hex("\u{10FFFD}\u{FFFE}"), bin2hex($s->peekChar(3))); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame("", $s->peekChar(-5)); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @covers MensBeam\Intl\Encoding\UTF8::peekCode |
* @covers MensBeam\Intl\Encoding\UTF8::peekCode |
||||
*/ |
* @covers MensBeam\Intl\Encoding\UTF8::stateSave |
||||
|
* @covers MensBeam\Intl\Encoding\UTF8::stateApply |
||||
|
*/ |
||||
public function testPeekAtCodePoints() { |
public function testPeekAtCodePoints() { |
||||
/* |
return parent::testPeekAtCodePoints(); |
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
$input = "\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE"; |
|
||||
$s = new UTF8($input); |
|
||||
$s->seek(2); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame([0x6C34], $s->peekCode()); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame([0x6C34, 0x1D11E], $s->peekCode(2)); |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(3, $s->posByte()); |
|
||||
|
|
||||
$s->seek(3); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame([0x10FFFD, 0xFFFE], $s->peekCode(3)); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
|
|
||||
$this->assertSame([], $s->peekCode(-5)); |
|
||||
$this->assertSame(5, $s->posChar()); |
|
||||
$this->assertSame(13, $s->posByte()); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @dataProvider provideStrings |
* @dataProvider provideStrings |
||||
* @covers MensBeam\Intl\Encoding\UTF8::len |
* @covers MensBeam\Intl\Encoding\UTF8::lenChar |
||||
|
* @covers MensBeam\Intl\Encoding\UTF8::lenByte |
||||
* @covers MensBeam\Intl\Encoding\UTF8::stateSave |
* @covers MensBeam\Intl\Encoding\UTF8::stateSave |
||||
* @covers MensBeam\Intl\Encoding\UTF8::stateApply |
* @covers MensBeam\Intl\Encoding\UTF8::stateApply |
||||
*/ |
*/ |
||||
public function testGetStringLength(string $input, array $points) { |
public function testGetStringLength(string $input, array $points) { |
||||
$s = new UTF8($input); |
return parent::testGetStringLength($input, $points); |
||||
$s->seek(1); |
|
||||
$posChar = $s->posChar(); |
|
||||
$posByte = $s->posByte(); |
|
||||
|
|
||||
$this->assertSame(sizeof($points), $s->len()); |
|
||||
$this->assertSame($posChar, $s->posChar()); |
|
||||
$this->assertSame($posByte, $s->posByte()); |
|
||||
} |
} |
||||
|
|
||||
/** |
/** |
||||
* @covers MensBeam\Intl\Encoding\UTF8::err |
* @covers MensBeam\Intl\Encoding\UTF8::errDec |
||||
*/ |
*/ |
||||
public function testReplacementModes() { |
public function testReplacementModes() { |
||||
$input = "\x30\xFF\x30"; |
return parent::testReplacementModes(); |
||||
// officially test replacement characters and null replacement (already effectively tested by other tests) |
|
||||
$s = new UTF8($input, false); |
|
||||
$s->seek(1); |
|
||||
$this->assertSame(0xFFFD, $s->nextCode()); |
|
||||
$s->seek(-2); |
|
||||
// test fatal mode |
|
||||
$s = new UTF8($input, true); |
|
||||
$s->seek(1); |
|
||||
try { |
|
||||
$p = $s->nextCode(); |
|
||||
} catch (DecoderException $e) { |
|
||||
$p = $e; |
|
||||
} finally { |
|
||||
$this->assertInstanceOf(DecoderException::class, $p); |
|
||||
} |
|
||||
$this->assertSame(2, $s->posChar()); |
|
||||
$this->assertSame(0x30, $s->nextCode()); |
|
||||
$s->seek(-2); |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
try { |
|
||||
$p = $s->peekCode(); |
|
||||
} catch (DecoderException $e) { |
|
||||
$p = $e; |
|
||||
} finally { |
|
||||
$this->assertInstanceOf(DecoderException::class, $p); |
|
||||
} |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
try { |
|
||||
$p = $s->peekChar(); |
|
||||
} catch (DecoderException $e) { |
|
||||
$p = $e; |
|
||||
} finally { |
|
||||
$this->assertInstanceOf(DecoderException::class, $p); |
|
||||
} |
|
||||
$this->assertSame(1, $s->posChar()); |
|
||||
} |
} |
||||
|
|
||||
public function provideCodePoints() { |
/** |
||||
return [ |
* @dataProvider provideStrings |
||||
"122" => [122, "\x7A"], |
* @covers MensBeam\Intl\Encoding\UTF8::rewind |
||||
"162" => [162, "\xC2\xA2"], |
* @covers MensBeam\Intl\Encoding\UTF8::chars |
||||
"27700" => [27700, "\xE6\xB0\xB4"], |
* @covers MensBeam\Intl\Encoding\UTF8::codes |
||||
"119070" => [119070, "\xF0\x9D\x84\x9E"], |
*/ |
||||
"63743" => [63743, "\xEF\xA3\xBF"], |
public function testIterateThroughAString(string $input, array $exp) { |
||||
"1114109" => [1114109, "\xF4\x8F\xBF\xBD"], |
return parent::testIterateThroughAString($input, $exp); |
||||
"65534" => [65534, "\xEF\xBF\xBE"], |
|
||||
"-1" => [-1, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], |
|
||||
"1114112" => [1114112, new EncoderException("", UTF8::E_INVALID_CODE_POINT)], |
|
||||
]; |
|
||||
} |
} |
||||
|
|
||||
public function provideStrings() { |
/** |
||||
return [ |
* @dataProvider provideStrings |
||||
// control samples |
* @covers MensBeam\Intl\Encoding\UTF8::nextCode |
||||
'empty string' => ["", []], |
*/ |
||||
'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]], |
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) { |
||||
'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]], |
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp); |
||||
'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]], |
} |
||||
// various invalid sequences |
|
||||
'invalid code' => ["\xFF", [65533]], |
/** |
||||
'ends early' => ["\xC0", [65533]], |
* @covers MensBeam\Intl\Encoding\UTF8::seekBack |
||||
'ends early 2' => ["\xE0", [65533]], |
*/ |
||||
'invalid trail' => ["\xC0\x00", [65533, 0]], |
public function testSeekBackOverRandomData() { |
||||
'invalid trail 2' => ["\xC0\xC0", [65533, 65533]], |
return parent::testSeekBackOverRandomData(); |
||||
'invalid trail 3' => ["\xE0\x00", [65533, 0]], |
} |
||||
'invalid trail 4' => ["\xE0\xC0", [65533, 65533]], |
|
||||
'invalid trail 5' => ["\xE0\x80\x00", [65533, 65533, 0]], |
/** |
||||
'invalid trail 6' => ["\xE0\x80\xC0", [65533, 65533, 65533]], |
* @covers MensBeam\Intl\Encoding\UTF8::asciiSpan |
||||
'> 0x10FFFF' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], |
*/ |
||||
'obsolete lead byte' => ["\xFE\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], |
public function testExtractAsciiSpans() { |
||||
'overlong U+0000 - 2 bytes' => ["\xC0\x80", [65533, 65533]], |
parent::testExtractAsciiSpans(); |
||||
'overlong U+0000 - 3 bytes' => ["\xE0\x80\x80", [65533, 65533, 65533]], |
} |
||||
'overlong U+0000 - 4 bytes' => ["\xF0\x80\x80\x80", [65533, 65533, 65533, 65533]], |
|
||||
'overlong U+0000 - 5 bytes' => ["\xF8\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533]], |
/** |
||||
'overlong U+0000 - 6 bytes' => ["\xFC\x80\x80\x80\x80\x80", [65533, 65533, 65533, 65533, 65533, 65533]], |
* @covers MensBeam\Intl\Encoding\UTF8::asciiSpanNot |
||||
'overlong U+007F - 2 bytes' => ["\xC1\xBF", [65533, 65533]], |
*/ |
||||
'overlong U+007F - 3 bytes' => ["\xE0\x81\xBF", [65533, 65533, 65533]], |
public function testExtractNegativeAsciiSpans() { |
||||
'overlong U+007F - 4 bytes' => ["\xF0\x80\x81\xBF", [65533, 65533, 65533, 65533]], |
parent::testExtractNegativeAsciiSpans(); |
||||
'overlong U+007F - 5 bytes' => ["\xF8\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+007F - 6 bytes' => ["\xFC\x80\x80\x80\x81\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+07FF - 3 bytes' => ["\xE0\x9F\xBF", [65533, 65533, 65533]], |
|
||||
'overlong U+07FF - 4 bytes' => ["\xF0\x80\x9F\xBF", [65533, 65533, 65533, 65533]], |
|
||||
'overlong U+07FF - 5 bytes' => ["\xF8\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+07FF - 6 bytes' => ["\xFC\x80\x80\x80\x9F\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+FFFF - 4 bytes' => ["\xF0\x8F\xBF\xBF", [65533, 65533, 65533, 65533]], |
|
||||
'overlong U+FFFF - 5 bytes' => ["\xF8\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]], |
|
||||
'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]], |
|
||||
// UTF-16 surrogates |
|
||||
'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]], |
|
||||
'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]], |
|
||||
'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]], |
|
||||
// self-sync edge cases |
|
||||
'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]], |
|
||||
'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]], |
|
||||
]; |
|
||||
} |
} |
||||
} |
} |
||||
|
@ -0,0 +1,202 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\XUserDefined; |
||||
|
use MensBeam\Intl\Encoding\Coder; |
||||
|
use MensBeam\Intl\Encoding\EncoderException; |
||||
|
|
||||
|
/**
 * Tests for the x-user-defined coder, driven by the shared CoderDecoderTest harness.
 *
 * Every test method delegates straight to the parent implementation; each
 * override carries the data-provider and coverage annotations which apply
 * to the XUserDefined class.
 */
class TestXUserDefined extends \MensBeam\Intl\Test\CoderDecoderTest {
    /** @var string The fully-qualified name of the class under test */
    protected $testedClass = XUserDefined::class;
    /* X-user-defined doesn't have complex seeking, so this string is generic */
    protected $seekString = "30 31 32 33 34 35 36";
    protected $seekCodes = [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36];
    protected $seekOffsets = [0, 1, 2, 3, 4, 5, 6, 7];
    /* This string is supposed to contain an invalid character sequence sandwiched between two null characters, but x-user-defined has no invalid characters */
    protected $brokenChar = "";
    /* This string contains the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */
    protected $spanString = "41 5A 80 FF 30 39";

    /**
     * Supplies data sets for the encoder tests.
     *
     * Each set is [fatal flag, input code point, expectation], where the
     * expectation is either a string of hexadecimal bytes or an exception
     * whose class and code are expected to be thrown.
     *
     * @return array
     */
    public function provideCodePoints() {
        return [
            'U+0064 (HTML)'    => [false, 0x64, "64"],
            'U+0064 (fatal)'   => [true, 0x64, "64"],
            'U+F780 (HTML)'    => [false, 0xF780, "80"],
            'U+F780 (fatal)'   => [true, 0xF780, "80"],
            'U+F7FF (HTML)'    => [false, 0xF7FF, "FF"],
            'U+F7FF (fatal)'   => [true, 0xF7FF, "FF"],
            // U+00CA cannot be represented in x-user-defined; in HTML mode the
            // expected output is the decimal numeric character reference
            'U+00CA (HTML)'    => [false, 0xCA, bin2hex("&#202;")],
            'U+00CA (fatal)'   => [true, 0xCA, new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)],
            '-1 (HTML)'        => [false, -1, new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '-1 (fatal)'       => [true, -1, new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (HTML)'  => [false, 0x110000, new EncoderException("", Coder::E_INVALID_CODE_POINT)],
            '0x110000 (fatal)' => [true, 0x110000, new EncoderException("", Coder::E_INVALID_CODE_POINT)],
        ];
    }

    /**
     * Supplies data sets for the decoder tests.
     *
     * Each set is [space-separated upper-case hexadecimal bytes, expected
     * code points]. Bytes 0x00-0x7F are paired with the identical code
     * points, and bytes 0x80-0xFF with U+F780 through U+F7FF.
     *
     * @return array
     */
    public function provideStrings() {
        $a_bytes = [];
        $a_codes = [];
        for ($a = 0; $a < 0x80; $a++) {
            $a_bytes[] = strtoupper(bin2hex(chr($a)));
            $a_codes[] = $a;
        }
        $p_bytes = [];
        $p_codes = [];
        for ($a = 0; $a < 0x80; $a++) {
            $p_bytes[] = strtoupper(bin2hex(chr(0x80 + $a)));
            $p_codes[] = 0xF780 + $a;
        }
        $a_bytes = implode(" ", $a_bytes);
        $p_bytes = implode(" ", $p_bytes);
        return [
            'empty string'      => ["", []],
            'ASCI bytes'        => [$a_bytes, $a_codes],
            'private-use bytes' => [$p_bytes, $p_codes],
        ];
    }

    /**
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\Encoder
     * @covers MensBeam\Intl\Encoding\XUserDefined::encode
     * @covers MensBeam\Intl\Encoding\XUserDefined::errEnc
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        return parent::testEncodeCodePoints($fatal, $input, $exp);
    }

    /**
     * @dataProvider provideCodePoints
     * @covers MensBeam\Intl\Encoding\XUserDefined::encode
     * @covers MensBeam\Intl\Encoding\XUserDefined::errEnc
     */
    public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
        return parent::testEncodeCodePointsStatically($fatal, $input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\XUserDefined::__construct
     * @covers MensBeam\Intl\Encoding\XUserDefined::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\XUserDefined::__construct
     * @covers MensBeam\Intl\Encoding\XUserDefined::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * @dataProvider provideStrings
     * @coversNothing
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        // this test has no meaning for x-user-defined
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::seek
     * @covers MensBeam\Intl\Encoding\XUserDefined::posChar
     * @covers MensBeam\Intl\Encoding\XUserDefined::posByte
     * @covers MensBeam\Intl\Encoding\XUserDefined::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::posChar
     * @covers MensBeam\Intl\Encoding\XUserDefined::posByte
     * @covers MensBeam\Intl\Encoding\XUserDefined::eof
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::peekChar
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateSave
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::peekCode
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateSave
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\XUserDefined::lenChar
     * @covers MensBeam\Intl\Encoding\XUserDefined::lenByte
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateSave
     * @covers MensBeam\Intl\Encoding\XUserDefined::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::errDec
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\XUserDefined::rewind
     * @covers MensBeam\Intl\Encoding\XUserDefined::chars
     * @covers MensBeam\Intl\Encoding\XUserDefined::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /**
     * The third parameter is declared explicitly nullable: relying on a null
     * default to make the type nullable is deprecated as of PHP 8.4, and the
     * explicit form is the same signature.
     *
     * @dataProvider provideStrings
     * @coversNothing
     */
    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, ?array $relaxedExp = null) {
        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
    }

    /**
     * @coversNothing
     */
    public function testSeekBackOverRandomData() {
        return parent::testSeekBackOverRandomData();
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::asciiSpan
     */
    public function testExtractAsciiSpans() {
        parent::testExtractAsciiSpans();
    }

    /**
     * @covers MensBeam\Intl\Encoding\XUserDefined::asciiSpanNot
     */
    public function testExtractNegativeAsciiSpans() {
        parent::testExtractNegativeAsciiSpans();
    }
}
@ -0,0 +1,92 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding; |
||||
|
use MensBeam\Intl\Encoding\Encoder; |
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
use MensBeam\Intl\Encoding\UTF8; |
||||
|
|
||||
|
/**
 * Tests for the static helpers of the Encoding class: matching labels to
 * encodings and creating decoders and encoders from labels.
 */
class TestEncoding extends \PHPUnit\Framework\TestCase {
    /**
     * A label must match as supplied, in upper case, and when surrounded by whitespace.
     *
     * @dataProvider provideLabelData
     */
    public function testMatchALabelToAnEncoding(string $label, array $exp) {
        $this->assertSame($exp, Encoding::matchLabel($label));
        $this->assertSame($exp, Encoding::matchLabel(strtoupper($label)));
        $this->assertSame($exp, Encoding::matchLabel(" $label\n\n\r\t"));
    }

    /** An unknown label must yield null rather than a match */
    public function testFailToMatchALabelToAnEncoding() {
        $this->assertNull(Encoding::matchLabel("Not a label"));
    }

    /**
     * A decoder of the class listed in the data set must be created for every variant of the label.
     *
     * @dataProvider provideLabelData
     */
    public function testCreateADecoderFromALabel(string $label, array $data) {
        $this->assertInstanceOf($data['class'], Encoding::createDecoder($label, ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(strtoupper($label), ""));
        $this->assertInstanceOf($data['class'], Encoding::createDecoder(" $label\n\n\r\t", ""));
    }

    /**
     * The class of the created decoder must be the one listed for the given label and input string.
     *
     * @dataProvider provideBOMSniffings
     */
    public function testCreateADecoderWhileSniffingBOM(string $label, string $string, string $class) {
        $this->assertInstanceOf($class, Encoding::createDecoder($label, $string));
    }

    /** An unknown label must yield null rather than a decoder */
    public function testFailToCreateADecoderFromALabel() {
        $this->assertNull(Encoding::createDecoder("Not a label", ""));
    }

    /**
     * An Encoder must be created for every variant of the label when the data
     * set's 'encoder' flag is set, and null must be returned otherwise.
     *
     * @dataProvider provideLabelData
     */
    public function testCreateAnEncoderFromALabel(string $label, array $data) {
        if ($data['encoder']) {
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder($label));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(strtoupper($label)));
            $this->assertInstanceOf(Encoder::class, Encoding::createEncoder(" $label\n\n\r\t"));
        } else {
            $this->assertNull(Encoding::createEncoder($label));
            $this->assertNull(Encoding::createEncoder(strtoupper($label)));
            $this->assertNull(Encoding::createEncoder(" $label\n\n\r\t"));
        }
    }

    /** An unknown label must yield null rather than an encoder */
    public function testFailToCreateAnEncoderFromALabel() {
        $this->assertNull(Encoding::createEncoder("Not a label"));
    }

    /**
     * Yields one data set per label declared by the instantiable Decoder
     * implementations found in the library's Encoding directory.
     *
     * Each set is [label, ['label' => ..., 'name' => ..., 'class' => ..., 'encoder' => ...]],
     * where 'encoder' is false for UTF-16LE, UTF-16BE and replacement.
     *
     * @return \Generator
     */
    public function provideLabelData() {
        $ns = "MensBeam\\Intl\\Encoding\\";
        $labels = [];
        $names = [];
        foreach (new \GlobIterator(\MensBeam\Intl\BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
            $file = basename($file, ".php");
            $className = $ns.$file;
            $class = new \ReflectionClass($className);
            if ($class->implementsInterface(\MensBeam\Intl\Encoding\Decoder::class) && $class->isInstantiable()) {
                $name = $class->getConstant("NAME");
                $names[$name] = $className;
                foreach ($class->getConstant("LABELS") as $label) {
                    $labels[$label] = $name;
                }
            }
        }
        foreach ($labels as $label => $name) {
            $class = $names[$name];
            // strict comparison: only these exact names lack an encoder
            $encoder = !in_array($name, ["UTF-16LE", "UTF-16BE", "replacement"], true);
            // array keys which look like integers are converted to integers by PHP, hence the casts
            yield [(string) $label, ['label' => (string) $label, 'name' => $name, 'class' => $class, 'encoder' => $encoder]];
        }
    }

    /**
     * Supplies [label, input string, expected decoder class] sets for the BOM-sniffing test.
     *
     * @return array
     */
    public function provideBOMSniffings() {
        return [
            'No BOM'       => ["UTF-8", "Hello world!", UTF8::class],
            'UTF-8 BOM'    => ["Shift_JIS", "\xEF\xBB\xBFA", UTF8::class],
            'UTF-16BE BOM' => ["UTF-8", "\xFE\xFF\x00A", UTF16BE::class],
            'UTF-16LE BOM' => ["UTF-8", "\xFF\xFEA\x00", UTF16LE::class],
            'GB18030 BOM'  => ["UTF-8", "\x84\x31\x95\x33A", UTF8::class],
        ];
    }
}
@ -0,0 +1,48 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Test; |
||||
|
|
||||
|
use \MensBeam\Intl\Encoding\Encoder; |
||||
|
|
||||
|
/**
 * Shared encoder tests for encodings which can be encoded as well as decoded.
 *
 * The class under test is read from $this->testedClass. Data sets take the
 * form [fatal flag, code point(s), expectation], where the expectation is
 * either a string of hexadecimal bytes (spaces and letter case are ignored)
 * or an exception whose class and code are expected to be thrown.
 */
abstract class CoderDecoderTest extends DecoderTest {
    /**
     * Encodes the input with an Encoder instance twice over: once in a single
     * call, and once code point by code point followed by finalization. Both
     * results are compared against the expectation.
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        $coder = $this->testedClass;
        $encoder = new Encoder($coder::NAME, $fatal);
        $codePoints = (array) $input;
        if (!$exp instanceof \Throwable) {
            // bring "AB CD" style hexadecimal into the form bin2hex() produces
            $exp = strtolower(str_replace(" ", "", $exp));
        } else {
            $this->expectException(get_class($exp));
            $this->expectExceptionCode($exp->getCode());
        }
        // all code points in one call
        $this->assertSame($exp, bin2hex($encoder->encode($codePoints)));
        // one code point at a time; the closure keeps the call subject to this file's strict typing
        $pieces = array_map(function($codePoint) use ($encoder) {
            return $encoder->encodeChar($codePoint);
        }, $codePoints);
        $this->assertSame($exp, bin2hex(implode("", $pieces).$encoder->finalize()));
    }

    /**
     * Encodes the input through the tested class's static encode() method,
     * if it has one; otherwise the test passes trivially.
     */
    public function testEncodeCodePointsStatically(bool $fatal, $input, $exp) {
        $coder = $this->testedClass;
        if (method_exists($coder, "encode")) {
            if (!$exp instanceof \Throwable) {
                $exp = strtolower(str_replace(" ", "", $exp));
            } else {
                $this->expectException(get_class($exp));
                $this->expectExceptionCode($exp->getCode());
            }
            $this->assertSame($exp, bin2hex($coder::encode($input, $fatal)));
        } else {
            // nothing to exercise; record an assertion so the test is not flagged as risky
            $this->assertTrue(true);
        }
    }
}
@ -0,0 +1,404 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Test; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\DecoderException; |
||||
|
use MensBeam\Intl\Encoding\ISO2022JP; |
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
use MensBeam\Intl\Encoding\UTF8; |
||||
|
|
||||
|
abstract class DecoderTest extends \PHPUnit\Framework\TestCase { |
||||
|
protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjGrZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k="; |
||||
|
protected $lowerA = "a"; |
||||
|
|
||||
|
/**
 * Decodes the input one code point at a time, checking the decoded sequence and the position counters
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $exp The code points expected to be decoded, in order
 */
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
    $decoderClass = $this->testedClass;
    $bytes = $this->prepString($input);
    $decoder = new $decoderClass($bytes);
    $decoded = [];
    // the character position starts at zero and must advance by exactly one per decoded code point
    $this->assertSame(0, $decoder->posChar());
    for ($count = 1; ($code = $decoder->nextCode()) !== false; $count++) {
        $this->assertSame($count, $decoder->posChar());
        $decoded[] = $code;
    }
    $this->assertSame($exp, $decoded);
    // once nextCode() has returned false the byte position must equal the input length
    $this->assertSame(strlen($bytes), $decoder->posByte());
}
||||
|
|
||||
|
/**
 * Decodes the input one character at a time and compares the result to the expected code points rendered by IntlChar::chr()
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $exp The code points expected to be decoded, in order
 */
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
    $decoderClass = $this->testedClass;
    // convert the expected code points to character strings, keeping their keys
    $expectedChars = [];
    foreach ($exp as $key => $code) {
        $expectedChars[$key] = \IntlChar::chr($code);
    }
    $bytes = $this->prepString($input);
    $decoder = new $decoderClass($bytes);
    $decoded = [];
    // nextChar() signals the end of the string with an empty string
    for ($char = $decoder->nextChar(); $char !== ""; $char = $decoder->nextChar()) {
        $decoded[] = $char;
    }
    $this->assertSame($expectedChars, $decoded);
    $this->assertSame(strlen($bytes), $decoder->posByte());
}
||||
|
|
||||
|
/**
 * Reads to the end of the input, then walks backwards one character at a time, re-decoding each character on the way
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $exp The code points expected to be decoded, in forward order
 */
public function testSTepBackThroughAString(string $input, array $exp) {
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($input));
    $reversed = array_reverse($exp);
    $seen = [];
    // forward pass: only the character counter is checked
    $pos = 0;
    while ($decoder->nextCode() !== false) {
        $pos++;
        $this->assertSame($pos, $decoder->posChar());
    }
    $this->assertSame(sizeof($reversed), $pos);
    // backward pass: step back, decode the character just stepped over, then step back over it again
    while ($decoder->posChar()) {
        $target = $pos - 1;
        $this->assertSame(0, $decoder->seek(-1), "Error stepping back to position ".$target);
        $this->assertSame($target, $decoder->posChar());
        $pos = $target;
        $seen[] = $decoder->nextCode();
        $decoder->seek(-1);
    }
    $this->assertEquals($reversed, $seen);
}
||||
|
|
||||
|
/**
 * Seeks forwards and backwards through the class's seek string, checking the return value of
 * seek() as well as the character and byte positions after every move
 *
 * The test is skipped when the test class defines no seek string.
 */
public function testSeekThroughAString() {
    if (!$this->seekString) {
        $this->markTestSkipped();
        return;
    }
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($this->seekString));
    $offsets = $this->seekOffsets;
    // byte position expected at character position zero for the listed classes; zero for all others
    $startByClass = [
        UTF8::class    => 3,
        UTF16BE::class => 2,
        UTF16LE::class => 2,
    ];
    $start = $startByClass[$decoderClass] ?? 0;
    // checks the character position and then the byte position
    $assertAt = function($char, $byte) use ($decoder) {
        $this->assertSame($char, $decoder->posChar());
        $this->assertSame($byte, $decoder->posByte());
    };

    $assertAt(0, $start);

    // seeking by zero is a no-op
    $this->assertSame(0, $decoder->seek(0));
    $assertAt(0, $start);

    // seeking back from the start moves nowhere and reports one character not traversed
    $this->assertSame(1, $decoder->seek(-1));
    $assertAt(0, $start);

    $this->assertSame(0, $decoder->seek(1));
    $assertAt(1, $offsets[1]);

    $this->assertSame(0, $decoder->seek(2));
    $assertAt(3, $offsets[3]);

    $this->assertSame(0, $decoder->seek(4));
    $assertAt(7, $offsets[7]);

    // seeking forward from character 7 reports one character not traversed
    $this->assertSame(1, $decoder->seek(1));
    $this->assertSame(7, $decoder->posChar());
    // NOTE(review): ISO-2022-JP is expected to end three bytes further along; presumably trailing bytes that yield no character - confirm
    $endByte = ($decoderClass !== ISO2022JP::class) ? $offsets[7] : $offsets[7] + 3;
    $this->assertSame($endByte, $decoder->posByte());

    $this->assertSame(0, $decoder->seek(-3));
    $assertAt(4, $offsets[4]);

    // seeking back ten characters from character 4 stops at the start with six left over
    $this->assertSame(6, $decoder->seek(-10));
    $assertAt(0, $start);

    $this->assertSame(0, $decoder->seek(5));
    $assertAt(5, $offsets[5]);

    $decoder->rewind();
    $assertAt(0, $start);
}
||||
|
|
||||
|
/**
 * Reads past the end of a one-character string, first with nextChar() and then with nextCode(),
 * checking that the positions and the end-of-file flag stop changing once the end is reached
 */
public function testTraversePastTheEndOfAString() {
    $decoderClass = $this->testedClass;
    $length = strlen($this->lowerA);
    // checks character position, byte position and end-of-file state, in that order
    $assertState = function($decoder, $char, $byte, $atEnd) {
        $this->assertSame($char, $decoder->posChar());
        $this->assertSame($byte, $decoder->posByte());
        if ($atEnd) {
            $this->assertTrue($decoder->eof());
        } else {
            $this->assertFalse($decoder->eof());
        }
    };

    // character-string interface: the end is signalled by an empty string
    $decoder = new $decoderClass($this->lowerA);
    $assertState($decoder, 0, 0, false);
    $this->assertSame("a", $decoder->nextChar());
    $assertState($decoder, 1, $length, true);
    $this->assertSame("", $decoder->nextChar());
    $assertState($decoder, 1, $length, true);

    // code-point interface: the end is signalled by false
    $decoder = new $decoderClass($this->lowerA);
    $assertState($decoder, 0, 0, false);
    $this->assertSame(ord("a"), $decoder->nextCode());
    $assertState($decoder, 1, $length, true);
    $this->assertSame(false, $decoder->nextCode());
    $assertState($decoder, 1, $length, true);
}
||||
|
|
||||
|
/**
 * Peeks at upcoming characters from several positions in the class's seek string and
 * checks that peeking never moves the decoder
 *
 * The test is skipped when the test class defines no seek string.
 */
public function testPeekAtCharacters() {
    if (!$this->seekString) {
        $this->markTestSkipped();
        return;
    }
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($this->seekString));
    $offsets = $this->seekOffsets;
    $points = $this->seekCodes;
    // checks that the decoder sits at the given character position and its matching byte offset
    $assertAt = function($char) use ($decoder, $offsets) {
        $this->assertSame($char, $decoder->posChar());
        $this->assertSame($offsets[$char], $decoder->posByte());
    };

    $decoder->seek(2);
    $assertAt(2);

    // peeking without an argument yields a single character
    $this->assertSame(bin2hex(\IntlChar::chr($points[2])), bin2hex($decoder->peekChar()));
    $assertAt(2);

    $this->assertSame(bin2hex(\IntlChar::chr($points[2]).\IntlChar::chr($points[3])), bin2hex($decoder->peekChar(2)));
    $assertAt(2);

    $decoder->seek(3);
    $assertAt(5);

    // three characters are requested from position 5, but only two are expected back
    $this->assertSame(bin2hex(\IntlChar::chr($points[5]).\IntlChar::chr($points[6])), bin2hex($decoder->peekChar(3)));
    $assertAt(5);

    // a negative count yields an empty string
    $this->assertSame("", $decoder->peekChar(-5));
    $assertAt(5);
}
||||
|
|
||||
|
/**
 * Peeks at upcoming code points from several positions in the class's seek string and
 * checks that peeking never moves the decoder
 *
 * The test is skipped when the test class defines no seek string.
 */
public function testPeekAtCodePoints() {
    if (!$this->seekString) {
        $this->markTestSkipped();
        return;
    }
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($this->seekString));
    $offsets = $this->seekOffsets;
    $points = $this->seekCodes;
    // checks that the decoder sits at the given character position and its matching byte offset
    $assertAt = function($char) use ($decoder, $offsets) {
        $this->assertSame($char, $decoder->posChar());
        $this->assertSame($offsets[$char], $decoder->posByte());
    };

    $decoder->seek(2);
    $assertAt(2);

    // peeking without an argument yields an array holding a single code point
    $this->assertSame([$points[2]], $decoder->peekCode());
    $assertAt(2);

    $this->assertSame([$points[2], $points[3]], $decoder->peekCode(2));
    $assertAt(2);

    $decoder->seek(3);
    $assertAt(5);

    // three code points are requested from position 5, but only two are expected back
    $this->assertSame([$points[5], $points[6]], $decoder->peekCode(3));
    $assertAt(5);

    // a negative count yields an empty array
    $this->assertSame([], $decoder->peekCode(-5));
    $assertAt(5);
}
||||
|
|
||||
|
/**
 * Checks the reported character and byte lengths, and that querying either length leaves the current position alone
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $points The code points the input is expected to decode to
 */
public function testGetStringLength(string $input, array $points) {
    $decoderClass = $this->testedClass;
    $bytes = $this->prepString($input);
    $decoder = new $decoderClass($bytes);
    // move off the start so that an unintended rewind would be noticed
    $decoder->seek(1);
    $charBefore = $decoder->posChar();
    $byteBefore = $decoder->posByte();

    $this->assertSame(sizeof($points), $decoder->lenChar());
    $this->assertSame($charBefore, $decoder->posChar());
    $this->assertSame($byteBefore, $decoder->posByte());

    $this->assertSame(strlen($bytes), $decoder->lenByte());
    $this->assertSame($charBefore, $decoder->posChar());
    $this->assertSame($byteBefore, $decoder->posByte());
}
||||
|
|
||||
|
/**
 * Exercises both error modes against the class's broken-character input: replacement with U+FFFD,
 * and fatal mode, in which decoding and peeking are expected to throw a DecoderException
 */
public function testReplacementModes() {
    if (!$this->brokenChar) {
        // decoder for this encoding never produces errors
        $this->assertTrue(true);
        return;
    }
    $decoderClass = $this->testedClass;
    $bytes = $this->prepString($this->brokenChar);

    // replacement mode: the second character is expected to decode as U+FFFD
    $lenient = new $decoderClass($bytes, false);
    $lenient->seek(1);
    $this->assertSame(0xFFFD, $lenient->nextCode());
    $lenient->seek(-2);

    // fatal mode: decoding the same character must throw
    $strict = new $decoderClass($bytes, true);
    $strict->seek(1);
    try {
        $result = $strict->nextCode();
    } catch (DecoderException $caught) {
        $result = $caught;
    } finally {
        $this->assertInstanceOf(DecoderException::class, $result);
    }
    // the failed character still counts as consumed, and decoding continues after it
    $this->assertSame(2, $strict->posChar());
    $this->assertSame(0x00, $strict->nextCode());
    $this->assertSame(3, $strict->posChar());
    // seeking backwards over the invalid character is expected to succeed
    $this->assertSame(0, $strict->seek(-2));
    $this->assertSame(1, $strict->posChar());

    // peeking at the invalid character must also throw, without moving the decoder
    try {
        $result = $strict->peekCode();
    } catch (DecoderException $caught) {
        $result = $caught;
    } finally {
        $this->assertInstanceOf(DecoderException::class, $result);
    }
    $this->assertSame(1, $strict->posChar());
    try {
        $result = $strict->peekChar();
    } catch (DecoderException $caught) {
        $result = $caught;
    } finally {
        $this->assertInstanceOf(DecoderException::class, $result);
    }
    $this->assertSame(1, $strict->posChar());
}
||||
|
|
||||
|
/**
 * Iterates through the input with surrogates disallowed
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $exp The code points expected to be decoded, in order
 */
public function testIterateThroughAString(string $input, array $exp) {
    $allowSurrogates = false;
    $this->iterateThroughAString($input, $exp, $allowSurrogates);
}
||||
|
|
||||
|
/**
 * Iterates through the input with surrogates allowed
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $strictExp The code points expected when surrogates are disallowed; used when no relaxed expectation is given
 * @param array|null $relaxedExp The code points expected when surrogates are allowed, if these differ
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
    $expected = ($relaxedExp !== null) ? $relaxedExp : $strictExp;
    $this->iterateThroughAString($input, $expected, true);
}
||||
|
|
||||
|
/**
 * Decodes the class's fixed block of arbitrary bytes forwards, recording each byte position and
 * decoded code point, then seeks backwards one character at a time and checks that the same
 * positions and code points are found again
 */
public function testSeekBackOverRandomData() {
    $decoderClass = $this->testedClass;
    $bytes = base64_decode($this->random);
    $total = strlen($bytes);
    $decoder = new $decoderClass($bytes);
    // forward pass; each entry is [byte position before decoding, decoded code point]
    $trail = [];
    do {
        $trail[] = [$decoder->posByte(), $decoder->nextCode()];
    } while ($decoder->posByte() < $total);
    // backward pass, newest entry first
    foreach (array_reverse($trail) as list($expPos, $expCode)) {
        $this->assertSame(0, $decoder->seek(-1), "Start of string reached prematureley");
        $this->assertSame($expPos, $decoder->posByte(), "Position desynchronized");
        $this->assertSame($expCode, $decoder->peekCode(1)[0], "Incorrect character decoded at byte position $expPos");
    }
}
||||
|
|
||||
|
/**
 * Iterates over a decoder with codes() and chars(), checking keys, values, and that an
 * exhausted decoder yields nothing more until it is rewound
 *
 * @param string $input The encoded input as hexadecimal digits, optionally separated by spaces
 * @param array $exp The code points expected to be decoded, in order
 * @param bool $allowSurrogates Passed to the decoder's constructor as its third argument
 */
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($input), false, $allowSurrogates);
    $total = sizeof($exp);
    // counts how many items an iterator yields
    $countItems = function($items) {
        $n = 0;
        foreach ($items as $item) {
            $n++;
        }
        return $n;
    };
    $this->assertTrue(true); // prevent risky test of empty string

    // code points: keys must count up from zero and values must match the expectation
    $position = 0;
    foreach ($decoder->codes() as $index => $code) {
        $this->assertSame($position, $index, "Character key at index $position reported incorrectly");
        $this->assertSame($exp[$position], $code, "Character at index $position decoded incorrectly");
        $position++;
    }
    // iterating again without rewinding must yield nothing
    $this->assertSame(0, $countItems($decoder->codes()));
    // after rewinding, the whole string must be yielded again
    $decoder->rewind();
    $this->assertSame($total, $countItems($decoder->codes()));

    // convert the expected code points to character strings, keeping their keys
    $expChars = [];
    foreach ($exp as $key => $code) {
        $expChars[$key] = \IntlChar::chr($code);
    }

    // NOTE(review): the decoder is not rewound here and the counter carries over from the
    // preceding full pass, so this loop body presumably never runs and the keys and values
    // of chars() go unchecked; confirm whether a rewind and counter reset were intended
    $position = $total;
    foreach ($decoder->chars() as $index => $char) {
        $this->assertSame($position, $index, "Character key at index $position reported incorrectly");
        $this->assertSame(bin2hex($expChars[$position]), bin2hex($char), "Character at index $position decoded incorrectly");
        $position++;
    }
    // iterating again without rewinding must yield nothing
    $this->assertSame(0, $countItems($decoder->chars()));
    // after rewinding, the whole string must be yielded again
    $decoder->rewind();
    $this->assertSame($total, $countItems($decoder->chars()));
}
||||
|
|
||||
|
/**
 * Extracts runs of characters matching a mask from the class's span string with asciiSpan()
 *
 * NOTE(review): the expectations imply the span string starts with "A" and "Z", followed by
 * two characters no mask matches, followed by "09"; confirm against the test classes.
 */
public function testExtractAsciiSpans() {
    $everyByte = $this->allBytes();
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($this->spanString));
    // a mask matching nothing at the current position yields an empty span
    $this->assertSame("", $decoder->asciiSpan("az"));
    // the second argument limits the span length
    $this->assertSame("A", $decoder->asciiSpan("AZ", 1));
    $this->assertSame("Z", $decoder->asciiSpan("AZ"));
    // even a mask of every byte value yields nothing until the next two characters are skipped
    $this->assertSame("", $decoder->asciiSpan($everyByte));
    $decoder->nextChar();
    $this->assertSame("", $decoder->asciiSpan($everyByte));
    $decoder->nextChar();
    $this->assertSame("09", $decoder->asciiSpan($everyByte));
}
||||
|
|
||||
|
/**
 * Extracts runs of characters NOT matching a mask from the class's span string with asciiSpanNot()
 *
 * This mirrors the asciiSpan() test with the masks inverted; an empty mask excludes nothing.
 */
public function testExtractNegativeAsciiSpans() {
    $decoderClass = $this->testedClass;
    $decoder = new $decoderClass($this->prepString($this->spanString));
    // a mask excluding the character at the current position yields an empty span
    $this->assertSame("", $decoder->asciiSpanNot("AZ"));
    // the second argument limits the span length
    $this->assertSame("A", $decoder->asciiSpanNot("az", 1));
    $this->assertSame("Z", $decoder->asciiSpanNot("az"));
    // even an empty mask yields nothing until the next two characters are skipped
    $this->assertSame("", $decoder->asciiSpanNot(""));
    $decoder->nextChar();
    $this->assertSame("", $decoder->asciiSpanNot(""));
    $decoder->nextChar();
    $this->assertSame("09", $decoder->asciiSpanNot(""));
}
||||
|
|
||||
|
/**
 * Converts a string of hexadecimal digits into the bytes it denotes; spaces are removed first
 *
 * @param string $str Hexadecimal digits, optionally separated by spaces
 */
protected function prepString(string $str): string {
    $digits = str_replace(" ", "", $str);
    return hex2bin($digits);
}
||||
|
|
||||
|
/** Returns a 256-byte string containing every byte value from 0x00 to 0xFF in ascending order */
protected function allBytes(): string {
    $bytes = "";
    foreach (range(0x00, 0xFF) as $value) {
        $bytes .= chr($value);
    }
    return $bytes;
}
||||
|
} |
@ -1,45 +0,0 @@ |
|||||
<?php |
|
||||
// This script prints the GB18030 lookup tables as PHP constant declarations,
// built from the two index files published by WHATWG
$label = "gb18030";

// two-byte sequences: retrieve the index file and keep only the code point of each data line
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt");
if (!$data) {
    die("index file for '$label' could not be retrieved from network.");
}
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $rows, \PREG_SET_ORDER);
$gbk = [];
foreach ($rows as $row) {
    $gbk[] = hexdec($row[2]);
}

// four-byte sequences: retrieve the range index file
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label-ranges.txt");
if (!$data) {
    die("range index file for '$label' could not be retrieved from network.");
}
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $rows, \PREG_SET_ORDER);
$rangeEnds = [];
$rangeStarts = [];
foreach ($rows as $row) {
    // the range starts are gathered in one array; they will actually be used as range ends
    $rangeEnds[] = (int) $row[1];
    // the starting code points are gathered in another array
    $rangeStarts[] = hexdec($row[2]);
}
// fudge the top of the ranges
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1
// we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding
$last = array_pop($rangeEnds);
$rangeEnds = array_merge($rangeEnds, [39420, $last, 1237576]);
array_splice($rangeStarts, -1, 0, "null");
$rangeStarts[] = 0x110000;

// output
echo " const TABLE_GBK = [".implode(",", $gbk)."];\n";
echo " const TABLE_RANGES = [".implode(",", $rangeEnds)."];\n";
echo " const TABLE_OFFSETS = [".implode(",", $rangeStarts)."];\n";
|
@ -1,54 +1,256 @@ |
|||||
<?php |
<?php |
||||
// retrieve the relevant index file |
declare(strict_types=1); |
||||
|
// This script produces the index lookup tables |
||||
|
// for a given encoding from the source data at WHATWG |
||||
|
|
||||
|
$labels = [ |
||||
|
'big5' => "big5", |
||||
|
'euc-jp' => "eucjp", |
||||
|
'euc-kr' => "euckr", |
||||
|
'gb18030' => "gb18030", |
||||
|
'ibm866' => "single_byte", |
||||
|
'iso-2022-jp' => "iso2022jp", |
||||
|
'iso-8859-10' => "single_byte", |
||||
|
'iso-8859-13' => "single_byte", |
||||
|
'iso-8859-14' => "single_byte", |
||||
|
'iso-8859-15' => "single_byte", |
||||
|
'iso-8859-16' => "single_byte", |
||||
|
'iso-8859-2' => "single_byte", |
||||
|
'iso-8859-3' => "single_byte", |
||||
|
'iso-8859-4' => "single_byte", |
||||
|
'iso-8859-5' => "single_byte", |
||||
|
'iso-8859-6' => "single_byte", |
||||
|
'iso-8859-7' => "single_byte", |
||||
|
'iso-8859-8' => "single_byte", |
||||
|
'koi8-r' => "single_byte", |
||||
|
'koi8-u' => "single_byte", |
||||
|
'macintosh' => "single_byte", |
||||
|
'shift-jis' => "shiftjis", |
||||
|
'windows-1250' => "single_byte", |
||||
|
'windows-1251' => "single_byte", |
||||
|
'windows-1252' => "single_byte", |
||||
|
'windows-1253' => "single_byte", |
||||
|
'windows-1254' => "single_byte", |
||||
|
'windows-1255' => "single_byte", |
||||
|
'windows-1256' => "single_byte", |
||||
|
'windows-1257' => "single_byte", |
||||
|
'windows-1258' => "single_byte", |
||||
|
'windows-874' => "single_byte", |
||||
|
'x-mac-cyrillic' => "single_byte", |
||||
|
]; |
||||
$label = $argv[1] ?? ""; |
$label = $argv[1] ?? ""; |
||||
$label = trim(strtolower($label)); |
$label = trim(strtolower($label)); |
||||
$data = file_get_contents("https://encoding.spec.whatwg.org/index-$label.txt") or die("index file for '$label' could not be retrieved from network."); |
if (!isset($labels[$label])) { |
||||
// find lines that contain data |
die("Invalid label specified. Must be one of: ".json_encode(array_keys($labels))."\n"); |
||||
preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $matches, \PREG_SET_ORDER); |
} |
||||
// set up |
($labels[$label])($label); |
||||
$dec_char = []; |
|
||||
$dec_code = []; |
// encoding-specific output generators |
||||
$enc = []; |
|
||||
$i = 0; |
function single_byte(string $label) { |
||||
// loop through each line |
$table = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"); |
||||
foreach ($matches as $match) { |
$dec_char = serialize_char_array($table); |
||||
// index is the byte value minus 128 |
$dec_code = serialize_point_array($table); |
||||
$index = (int) $match[1]; |
$enc = serialize_single_byte_array($table); |
||||
// byte is a reconstruction of the hexadecimal value of the byte value, padded to two nybbles
echo "const TABLE_DEC_CHAR = $dec_char;\n"; |
||||
$byte = strtoupper(str_pad(dechex($index + 128), 2, "0", \STR_PAD_LEFT)); |
echo "const TABLE_DEC_CODE = $dec_code;\n"; |
||||
// code is the Unicode code point
echo "const TABLE_ENC = $enc;\n"; |
||||
$code = hexdec($match[2]); |
} |
||||
// hex is the code point in hexadecimal |
|
||||
$hex = dechex($code); |
function gb18030(string $label) { |
||||
// missing indexes necessitate specifying keys explicitly |
$gbk = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"); |
||||
if ($index == $i) { |
$dec_gbk = serialize_point_array($gbk); |
||||
$key = ""; |
$enc_gbk = serialize_point_array(make_override_array($gbk)); |
||||
} else { |
$ranges = read_index($label, "https://encoding.spec.whatwg.org/index-$label-ranges.txt"); |
||||
$key = "$index=>"; |
$dec_max = []; |
||||
$i = $index; |
$dec_off = []; |
||||
} |
foreach ($ranges as $pointer => $code) { |
||||
$dec_code[] = $key."$code"; |
// gather the range starts in one array; they will actually be used as range ends |
||||
$dec_char[] = $key."\"\\u{".$hex."}\""; |
$dec_max[] = $pointer; |
||||
// the encoder table will be reprocessed later |
// gather the starting code points in another array |
||||
$enc[$code] = "\"\\x$byte\""; |
$dec_off[] = $code; |
||||
$i++; |
} |
||||
} |
// fudge the top of the ranges |
||||
// sort the encoder table by keys to order it correctly |
// see https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point Step 1 |
||||
ksort($enc); |
// we also add 0x110000 (one beyond the top of the Unicode range) to the offsets for encoding |
||||
$i = 0; |
$penult = array_pop($dec_max); |
||||
foreach ($enc as $index => $value) { |
$dec_max = array_merge($dec_max, [39420, $penult, 1237576]); |
||||
if ($index == $i) { |
array_splice($dec_off, -1, 0, "null"); |
||||
$key = ""; |
$dec_off[] = 0x110000; |
||||
} else { |
$dec_max = "[".implode(",", $dec_max)."]"; |
||||
$key = "$index=>"; |
$dec_off = "[".implode(",", $dec_off)."]"; |
||||
$i = $index; |
echo "const TABLE_CODES = $dec_gbk;\n"; |
||||
} |
echo "const TABLE_POINTERS = $enc_gbk;\n"; |
||||
$enc[$index] = "$key$value"; |
echo "const TABLE_RANGES = $dec_max;\n"; |
||||
$i++; |
echo "const TABLE_OFFSETS = $dec_off;\n"; |
||||
} |
} |
||||
$dec_char = implode(",", $dec_char); |
|
||||
$dec_code = implode(",", $dec_code); |
function big5(string $label) { |
||||
$enc = implode(",", $enc); |
// Big5 has unusually complex encoding requirements |
||||
echo " const TABLE_DEC_CHAR = [$dec_char];\n"; |
// see https://encoding.spec.whatwg.org/#index-big5-pointer for particulars |
||||
echo " const TABLE_DEC_CODE = [$dec_code];\n"; |
$table = read_index($label, "https://encoding.spec.whatwg.org/index-$label.txt"); |
||||
echo " const TABLE_ENC = [$enc];\n"; |
$specials = <<<ARRAY_LITERAL |
||||
|
[ |
||||
|
1133 => [0x00CA, 0x0304], |
||||
|
1135 => [0x00CA, 0x030C], |
||||
|
1164 => [0x00EA, 0x0304], |
||||
|
1166 => [0x00EA, 0x030C], |
||||
|
] |
||||
|
ARRAY_LITERAL; |
||||
|
// split Hong Kong Supplement code points from the rest of Big5 |
||||
|
$stop = (0xA1 - 0x81) * 157; |
||||
|
$hk = []; |
||||
|
$nhk = []; |
||||
|
foreach ($table as $pointer => $code) { |
||||
|
if ($pointer < $stop) { |
||||
|
$hk[$pointer] = $code; |
||||
|
} else { |
||||
|
$nhk[$pointer] = $code; |
||||
|
} |
||||
|
} |
||||
|
// search the Big5 rump for duplicates |
||||
|
$dupes = make_override_array($nhk); |
||||
|
// remove those duplicates which should use the last code point |
||||
|
foreach ([0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345] as $code) { |
||||
|
unset($dupes[$code]); |
||||
|
} |
||||
|
// serialize and print; Hong Kong characters are kept separate as they are not used in encoding |
||||
|
$codes_tw = serialize_point_array($nhk); |
||||
|
$codes_hk = serialize_point_array($hk); |
||||
|
$enc = serialize_point_array($dupes); |
||||
|
echo "const TABLE_DOUBLES = $specials;\n"; |
||||
|
echo "const TABLE_CODES_TW = $codes_tw;\n"; |
||||
|
echo "const TABLE_CODES_HK = $codes_hk;\n"; |
||||
|
echo "const TABLE_POINTERS = $enc;\n"; |
||||
|
} |
||||
|
|
||||
|
/**
 * Prints the code point table for EUC-KR as a PHP constant declaration
 *
 * @param string $label The encoding label, used to form the index file's URL
 */
function euckr(string $label) {
    $url = "https://encoding.spec.whatwg.org/index-$label.txt";
    $table = read_index($label, $url);
    echo "const TABLE_CODES = ".serialize_point_array($table).";\n";
}
||||
|
|
||||
|
/**
 * Prints the tables for EUC-JP as PHP constant declarations: the JIS X 0208 and JIS X 0212
 * code point tables, and the pointers to use for code points which occur more than once in JIS X 0208
 *
 * @param string $label The encoding label; not used, as the index files are fixed
 */
function eucjp(string $label) {
    $jis0212 = read_index("jis0212", "https://encoding.spec.whatwg.org/index-jis0212.txt");
    $jis0208 = read_index("jis0208", "https://encoding.spec.whatwg.org/index-jis0208.txt");
    $overrides = make_override_array($jis0208);
    echo "const TABLE_JIS0208 = ".serialize_point_array($jis0208).";\n";
    echo "const TABLE_JIS0212 = ".serialize_point_array($jis0212).";\n";
    echo "const TABLE_POINTERS = ".serialize_point_array($overrides).";\n";
}
||||
|
|
||||
|
/**
 * Prints the tables for ISO-2022-JP as PHP constant declarations: the JIS X 0208 code point
 * table, the katakana table, and the pointers to use for code points which occur more than
 * once in JIS X 0208
 *
 * @param string $label The encoding label; not used, as the index files are fixed
 */
function iso2022jp(string $label) {
    // the label passed to read_index() only appears in its failure message, so it must name the file actually being fetched
    $kana = serialize_point_array(read_index("iso-2022-jp-katakana", "https://encoding.spec.whatwg.org/index-iso-2022-jp-katakana.txt"));
    $table = read_index("jis0208", "https://encoding.spec.whatwg.org/index-jis0208.txt");
    $dupes = serialize_point_array(make_override_array($table));
    $jis0208 = serialize_point_array($table);
    echo "const TABLE_JIS0208 = $jis0208;\n";
    echo "const TABLE_KATAKANA = $kana;\n";
    echo "const TABLE_POINTERS = $dupes;\n";
}
||||
|
|
||||
|
/**
 * Prints the tables for Shift_JIS as PHP constant declarations
 *
 * Pointers 8272 through 8835 of the JIS X 0208 index are only used when decoding, so they
 * are printed as a separate table and are ignored when looking for duplicate code points.
 *
 * @param string $label The encoding label, only used when reporting a failed download
 */
function shiftjis(string $label) {
    $index = read_index($label, "https://encoding.spec.whatwg.org/index-jis0208.txt");
    $decodeOnly = [];
    $common = [];
    foreach ($index as $pointer => $codePoint) {
        if ($pointer >= 8272 && $pointer <= 8835) {
            // this range is excluded from encoding consideration
            $decodeOnly[$pointer] = $codePoint;
            continue;
        }
        $common[$pointer] = $codePoint;
    }
    // only the set used for encoding is searched for duplicates
    $overrides = make_override_array($common);
    echo "const TABLE_CODES = ".serialize_point_array($common).";\n";
    echo "const TABLE_CODES_EXTRA = ".serialize_point_array($decodeOnly).";\n";
    echo "const TABLE_POINTERS = ".serialize_point_array($overrides).";\n";
}
||||
|
|
||||
|
// generic helper functions |
||||
|
|
||||
|
/**
 * Retrieves an index file and parses it into an array mapping pointers to code points
 *
 * The script is terminated if the file cannot be read or is empty.
 *
 * @param string $label A name for the index, only used in the failure message
 * @param string $url The location of the index file
 * @return array Integer code points keyed by integer pointer, in file order
 */
function read_index(string $label, string $url): array {
    $data = file_get_contents($url);
    if (!$data) {
        die("index file for '$label' could not be retrieved from network.");
    }
    // data lines consist of a decimal pointer followed by a hexadecimal code point; all other lines are ignored
    preg_match_all("/^\s*(\d+)\s+0x([0-9A-Z]+)/m", $data, $rows, \PREG_SET_ORDER);
    $index = [];
    foreach ($rows as $row) {
        $pointer = (int) $row[1];
        $index[$pointer] = (int) hexdec($row[2]);
    }
    return $index;
}
||||
|
|
||||
|
/**
 * Serializes a table of code points (or pointers) as a compact PHP array literal
 *
 * Keys are only written out where they do not follow on from the previous key,
 * because PHP numbers unkeyed elements sequentially.
 *
 * @param array $table The table to serialize
 * @return string The array literal, without a trailing semicolon
 */
function serialize_point_array(array $table): string {
    $parts = [];
    $next = 0;
    foreach ($table as $pointer => $value) {
        // non-sequential indices must be printed, but others can be omitted
        $prefix = ($pointer === $next) ? "" : "$pointer=>";
        $parts[] = $prefix.$value;
        $next = $pointer;
        $next++;
    }
    return "[".implode(",", $parts)."]";
}
||||
|
|
||||
|
/**
 * Serializes a table of code points as a compact PHP array literal of one-character
 * strings, each written as a double-quoted "\u{...}" escape
 *
 * Keys are only written out where they do not follow on from the previous key,
 * because PHP numbers unkeyed elements sequentially.
 *
 * @param array $table Integer code points keyed by pointer
 * @return string The array literal, without a trailing semicolon
 */
function serialize_char_array(array $table): string {
    $out = [];
    $i = 0;
    foreach ($table as $index => $code) {
        // non-sequential indices must be printed, but others can be omitted
        if ($index === $i) {
            $key = "";
        } else {
            $key = "$index=>";
            $i = $index;
        }
        // the \u{} escape takes a hexadecimal code point, so the integer must be converted rather than printed in decimal
        $out[] = $key."\"\\u{".dechex($code)."}\"";
        $i++;
    }
    return "[".implode(",", $out)."]";
}
||||
|
|
||||
|
// this is only used for single-byte encoders; other encoders instead flip their decoder arrays with overrides for duplicates or special cases |
||||
|
/**
 * Serializes the encoder table of a single-byte encoding as a compact PHP array literal
 * mapping code points to one-byte strings written as "\xHH" escapes
 *
 * The byte for each entry is its pointer plus 128. Entries are ordered by code point, and
 * keys are only written out where they do not follow on from the previous key.
 *
 * @param array $table Integer code points keyed by pointer
 * @return string The array literal, without a trailing semicolon
 */
function serialize_single_byte_array(array $table): string {
    // flip the table so that it is keyed by code point
    $byCode = [];
    foreach ($table as $pointer => $code) {
        $byCode[$code] = "\"\\x".sprintf("%02X", $pointer + 128)."\"";
    }
    ksort($byCode);
    $parts = [];
    $next = 0;
    foreach ($byCode as $code => $literal) {
        $prefix = ($code == $next) ? "" : "$code=>";
        $parts[] = $prefix.$literal;
        $next = $code;
        $next++;
    }
    return "[".implode(",", $parts)."]";
}
||||
|
|
||||
|
// indexes with duplicate code points by default need to match the lowest pointer when encoding |
||||
|
// PHP's array_flip() function retains the last duplicate rather than the first, so we have to find duplicates |
||||
|
/**
 * Finds the code points which occur more than once in an index and maps each to its first pointer
 *
 * Flipping an index with array_flip() keeps the last pointer of a duplicated code point,
 * whereas encoding needs the first, so the result is meant to override the flipped entries.
 *
 * @param array $table Code points keyed by pointer
 * @return array First pointers keyed by duplicated code point, ordered by code point
 */
function make_override_array(array $table): array {
    $overrides = [];
    foreach (array_count_values($table) as $code_point => $occurrences) {
        if ($occurrences > 1) {
            // array_search() returns the first matching key
            $overrides[$code_point] = array_search($code_point, $table);
        }
    }
    ksort($overrides);
    return $overrides;
}
||||
|
@ -0,0 +1,40 @@ |
|||||
|
<?php
// This script reads the NAME and LABELS constants of each concrete
// decoder class in the Encoding set and prints two tables: one mapping
// labels to names, and one mapping names to classes

use MensBeam\Intl\Encoding\Decoder;

define("BASE", dirname(__DIR__).DIRECTORY_SEPARATOR);
require_once BASE."vendor".DIRECTORY_SEPARATOR."autoload.php";

$ns = "\\MensBeam\\Intl\\Encoding\\";
$labels = [];
$names = [];
$sources = new \GlobIterator(BASE."/lib/Encoding/*.php", \FilesystemIterator::CURRENT_AS_PATHNAME);
foreach ($sources as $path) {
    // the class is looked up under the name of the file which was found
    $className = $ns.basename($path, ".php");
    $class = new \ReflectionClass($className);
    // interfaces, abstract classes, and anything which is not a decoder are of no interest
    if (!$class->implementsInterface(Decoder::class) || !$class->isInstantiable()) {
        continue;
    }
    $name = $class->getConstant("NAME");
    $names[$name] = $className;
    foreach ($class->getConstant("LABELS") as $label) {
        $labels[$label] = $name;
    }
}

// format each map as the source text of a constant definition
$labelEntries = array_map(function($label, $name) {
    return "'$label'=>\"$name\"";
}, array_keys($labels), $labels);
$nameEntries = array_map(function($name, $className) {
    return "'$name'=>$className::class";
}, array_keys($names), $names);

echo "const LABEL_MAP = [".implode(",", $labelEntries)."];\n";
echo "const NAME_MAP = [".implode(",", $nameEntries)."];\n";
@ -0,0 +1,72 @@ |
|||||
|
<?php
declare(strict_types=1);
// This script generates a test series from the Web Platform test suite which exercises the index tables of multi-byte encodings with single characters
// These are pedantic sets of tests, and so the test suite itself only uses this series in optional tests

// the Web Platform test suite does not have tests for gb18030, but a pull request was made in 2016 with a set of tests; these are read from that commit
$gb18030Base = "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/";
// every other encoding is read from the master branch
$masterBase = "https://raw.githubusercontent.com/web-platform-tests/wpt/master/encoding/";

$tests = [
    'gb18030' => [
        'two-byte GBK'            => $gb18030Base."gb18030_chars.html",
        'four-byte Han'           => $gb18030Base."gb18030_extra_han_chars.html",
        'four-byte Hangul'        => $gb18030Base."gb18030_extra_hangul_chars.html",
        'four-byte miscellaneous' => $gb18030Base."gb18030_extra_misc_chars.html",
        'four-byte private use'   => $gb18030Base."gb18030_extra_pua_chars.html",
    ],
    'big5' => [
        'standard characters' => $masterBase."legacy-mb-tchinese/big5/big5_chars.html",
        'extended characters' => $masterBase."legacy-mb-tchinese/big5/big5_chars_extra.html",
    ],
    'euc-kr' => [
        'characters' => $masterBase."legacy-mb-korean/euc-kr/euckr_chars.html",
    ],
    'euc-jp' => [
        'characters' => $masterBase."legacy-mb-japanese/euc-jp/eucjp_chars.html",
    ],
    'iso-2022-jp' => [
        'characters' => $masterBase."legacy-mb-japanese/iso-2022-jp/iso2022jp_chars.html",
    ],
    'shiftjis' => [
        'characters' => $masterBase."legacy-mb-japanese/shift_jis/sjis_chars.html",
    ],
];

// the encoding to generate tests for is the first command-line argument, matched case-insensitively
$label = trim(strtolower($argv[1] ?? ""));
if (!array_key_exists($label, $tests)) {
    exit("Invalid label specified. Must be one of: ".json_encode(array_keys($tests)));
}

// print one line of PHP array source per test document
foreach ($tests[$label] as $name => $url) {
    [$in, $out] = make_test($label, $url);
    echo "'$name' => [[$in], [$out]],\n";
}
||||
|
|
||||
|
/**
 * Retrieves a Web Platform test document and extracts its byte sequences and code points
 *
 * Every <span data-cp="..." data-bytes="..."> element in the document yields
 * one pair. A few code points known to be wrong or out of date in the
 * documents are replaced with corrected values, depending on the encoding
 *
 * The script is terminated if the document cannot be read or is empty
 *
 * @param string $label The encoding being tested; this selects which corrections apply
 * @param string $url The location of the test document
 * @return array Two strings: a comma-separated list of double-quoted hexadecimal byte sequences, and a comma-separated list of decimal code points
 */
function make_test(string $label, string $url): array {
    $document = file_get_contents($url);
    if (!$document) {
        die("Could not retrieve $label test $url");
    }
    // corrections keyed by byte sequence, for the requested encoding only
    $corrections = [
        // this test is incorrect or out of date; both Vivaldi and Firefox yield code point 7743
        'gb18030'  => ["A8BC" => 7743],
        // three tests are out of date
        'euc-jp'   => ["5C" => 92, "7E" => 126, "A1DD" => 65293],
        // three tests are incorrect
        'shiftjis' => ["5C" => 92, "7E" => 126, "817C" => 0xFF0D],
    ][$label] ?? [];
    preg_match_all('/<span data-cp="([^"]+)" data-bytes="([^"]+)">/s', $document, $spans, \PREG_SET_ORDER);
    $byteList = [];
    $codeList = [];
    foreach ($spans as $span) {
        // the bytes are listed in hexadecimal, separated by spaces
        $bytes = str_replace(" ", "", $span[2]);
        // the code point is listed in hexadecimal, but is output in decimal
        $code = hexdec($span[1]);
        $codeList[] = $corrections[$bytes] ?? $code;
        // the bytes are output as a double-quoted hexadecimal string
        $byteList[] = '"'.$bytes.'"';
    }
    return [implode(",", $byteList), implode(",", $codeList)];
}
@ -1,120 +0,0 @@ |
|||||
<!DOCTYPE html> |
|
||||
<meta charset=gb18030> |
|
||||
<!-- Correct results are provided by Firefox --> |
|
||||
<pre style="font-family: 'Consolas', monospace;"></pre> |
|
||||
<script> |
|
||||
var data = [ |
|
||||
{ encoding: 'gb18030', input: [], name: 'empty string' }, |
|
||||
{ encoding: 'gb18030', input: [0x40], name: 'sanity check' }, |
|
||||
{ encoding: 'gb18030', input: [0x80], name: 'special case for 0x80' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x37], name: 'four-byte special case' }, |
|
||||
{ encoding: 'gb18030', input: [0xA8, 0x4E], name: 'two-byte character' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x31, 0xA2, 0x37], name: 'four-byte character' }, |
|
||||
{ encoding: 'gb18030', input: [0x82], name: 'EOF after first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x30], name: 'EOF after second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x82, 0x30, 0x81], name: 'EOF after third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37], name: 'bad first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37], name: 'bad second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37], name: 'bad third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF], name: 'bad fourth byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37], name: 'control first byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37], name: 'control second byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37], name: 'control third byte' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00], name: 'control fourth byte' }, |
|
||||
{ encoding: 'gb18030', input: [0xFF, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad first byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0xFF, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad second byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xFF, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'bad third byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0xFF, 0x00, 0x00, 0x00, 0x00], name: 'bad fourth byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x35, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control first byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x00, 0xF4, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control second byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0x00, 0x37, 0x00, 0x00, 0x00, 0x00], name: 'control third byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x35, 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'control fourth byte (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x84, 0x32, 0xA4, 0x39], name: 'void sequence' }, |
|
||||
{ encoding: 'gb18030', input: [0xFE, 0x39, 0xFE, 0x39], name: 'void sequence 2' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x30], name: 'seek test 1' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x80], name: 'seek test 2' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x00], name: 'seek test 3' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x81, 0x81, 0x00], name: 'seek test 4' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x30, 0x30, 0x30], name: 'seek test 5' }, |
|
||||
{ encoding: 'gb18030', input: [0x81, 0x30, 0x81, 0x81], name: 'seek test 6' }, |
|
||||
{ encoding: 'gb18030', input: [0x30, 0x30, 0x81, 0x81], name: 'seek test 7' }, |
|
||||
{ encoding: 'gb18030', input: [0xF8, 0x83, 0xFE, 0x80], name: 'seek test 8' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 1 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 2 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 3 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00], name: 'seek test 4 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x30, 0x30, 0x00, 0x00, 0x00, 0x00], name: 'seek test 5 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x81, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 6 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0x30, 0x30, 0x81, 0x81, 0x00, 0x00, 0x00, 0x00], name: 'seek test 7 (padded)' }, |
|
||||
{ encoding: 'gb18030', input: [0x00, 0x00, 0x00, 0x00, 0xF8, 0x83, 0xFE, 0x80, 0x00, 0x00, 0x00, 0x00], name: 'seek test 8 (padded)' }, |
|
||||
]; |
|
||||
data.forEach(function(data) { |
|
||||
var bytes = []; |
|
||||
data.input.forEach((p) => { |
|
||||
bytes.push(p.toString(16).padStart(2, "0").toUpperCase()); |
|
||||
}); |
|
||||
var codes = []; |
|
||||
var text = new TextDecoder(data.encoding).decode(new Uint8Array(data.input)); |
|
||||
var b = 0; |
|
||||
for (let a = 0; a < text.length; a++) { |
|
||||
let point = text.codePointAt(a); |
|
||||
if (point >= 55296 && point <= 57343) { |
|
||||
// non-BMP characters have trailing low surrogates in JavaScript strings |
|
||||
continue; |
|
||||
} |
|
||||
codes[b++] = point; |
|
||||
} |
|
||||
bytes = bytes.join(" "); |
|
||||
codes = codes.join(", "); |
|
||||
var line = "'" + data.name + "' => [" + '"' + bytes + '", [' + codes + "]],\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
|
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n")); |
|
||||
|
|
||||
[0x64, 0x20AC, 0x2164, 0x3A74, 0xE7C7, 0x1D11E].forEach(function(code) { |
|
||||
var l = document.createElement("a"); |
|
||||
l.href = "http://example.com/?" + String.fromCodePoint(code); |
|
||||
var url = l.search.substr(1); |
|
||||
var bytes = []; |
|
||||
for (let a = 0; a < url.length; a++) { |
|
||||
if (url.charAt(a) == "%") { |
|
||||
bytes.push(url.charAt(a + 1) + url.charAt(a + 2)); |
|
||||
a = a + 2; |
|
||||
} else { |
|
||||
bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0")); |
|
||||
} |
|
||||
} |
|
||||
var line = "0x" + code.toString(16).toUpperCase() + ", " + bytes.join(" ").toUpperCase() + "\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
|
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode("\n\n\n")); |
|
||||
/* |
|
||||
Char 0 U+007A (1 byte) Offset 0 |
|
||||
Char 1 U+00A2 (2 bytes) Offset 1 |
|
||||
Char 2 U+6C34 (3 bytes) Offset 3 |
|
||||
Char 3 U+1D11E (4 bytes) Offset 6 |
|
||||
Char 4 U+F8FF (3 bytes) Offset 10 |
|
||||
Char 5 U+10FFFD (4 bytes) Offset 13 |
|
||||
Char 6 U+FFFE (3 bytes) Offset 17 |
|
||||
End of string at char 7, offset 20 |
|
||||
*/ |
|
||||
|
|
||||
[0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE].forEach(function(code) { |
|
||||
var l = document.createElement("a"); |
|
||||
l.href = "http://example.com/?" + String.fromCodePoint(code); |
|
||||
var url = l.search.substr(1); |
|
||||
var bytes = []; |
|
||||
for (let a = 0; a < url.length; a++) { |
|
||||
if (url.charAt(a) == "%") { |
|
||||
bytes.push(url.charAt(a + 1) + url.charAt(a + 2)); |
|
||||
a = a + 2; |
|
||||
} else { |
|
||||
bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0")); |
|
||||
} |
|
||||
} |
|
||||
var line = bytes.join(" ").toUpperCase() + "\n"; |
|
||||
document.getElementsByTagName("pre")[0].appendChild(document.createTextNode(line)); |
|
||||
}) |
|
||||
</script> |
|
@ -1,41 +0,0 @@ |
|||||
<?php
// The Web Platform test suite does not have tests for gb18030, but a pull request was made in 2016 with a partial set of tests
// This script generates a test series from those tests which exercises the index and range tables with single characters
// It is a pedantic set of tests, and so the test suite itself only uses this series in an optional test

// all of the test documents are read from the commit of that pull request
$base = "https://raw.githubusercontent.com/web-platform-tests/wpt/5847108cb16dc0047331da3f746652f35b3e9c90/encoding/legacy-mb-schinese/gb18030/";
$standard_tests = [
    'two-byte GBK'            => $base."gb18030_chars.html",
    'four-byte Han'           => $base."gb18030_extra_han_chars.html",
    'four-byte Hangul'        => $base."gb18030_extra_hangul_chars.html",
    'four-byte miscellaneous' => $base."gb18030_extra_misc_chars.html",
    'four-byte private use'   => $base."gb18030_extra_pua_chars.html",
];
// print one line of PHP array source per test document
foreach ($standard_tests as $name => $url) {
    list($in, $out) = make_standard_test($url);
    echo "'$name' => [[$in], [$out]],\n";
}
|
||||
|
|
||||
/**
 * Retrieves a gb18030 test document and extracts its byte sequences and code points
 *
 * Every <span data-cp="..." data-bytes="..."> element in the document yields
 * one pair. The script is terminated if the document cannot be read or is empty
 *
 * @param string $url The location of the test document
 * @return array Two strings: a comma-separated list of double-quoted hexadecimal byte sequences, and a comma-separated list of decimal code points
 */
function make_standard_test(string $url): array {
    $document = file_get_contents($url);
    if (!$document) {
        die("Could not retrieve test $url");
    }
    preg_match_all('/<span data-cp="([^"]+)" data-bytes="([^"]+)">/s', $document, $spans, \PREG_SET_ORDER);
    $quotedBytes = [];
    $codePoints = [];
    foreach ($spans as $span) {
        // the bytes are listed in hexadecimal, separated by spaces
        $bytes = str_replace(" ", "", $span[2]);
        // the code point is listed in hexadecimal, but is output in decimal
        $code = hexdec($span[1]);
        // the test for A8BC is incorrect or out of date; both Vivaldi and Firefox yield code point 7743
        $codePoints[] = ($bytes == "A8BC") ? 7743 : $code;
        // the bytes are output as a double-quoted hexadecimal string
        $quotedBytes[] = '"'.$bytes.'"';
    }
    return [implode(",", $quotedBytes), implode(",", $codePoints)];
}
|
@ -0,0 +1,57 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=big5> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// valid single characters |
||||
|
'sanity check': "40", |
||||
|
'two-byte character': "D7 D7", |
||||
|
// invalid sequences |
||||
|
'EOF after first byte': "D7", |
||||
|
'low byte after first byte': "D7 39", |
||||
|
'0x80 as first byte': "80 D7 00", |
||||
|
'0xFF as first byte': "FF D7 00", |
||||
|
'invalid high byte as first byte': "81 D7 00", |
||||
|
'0x7F after first byte': "D7 7F", |
||||
|
'0xFF after first byte': "D7 FF", |
||||
|
'invalid high byte after first byte': "D7 81", |
||||
|
'broken string': "00 FF 00", |
||||
|
// double sequences |
||||
|
'double-characters low': "88 62 88 64", |
||||
|
'double-characters high': "88 A3 88 A5", |
||||
|
// mixed string |
||||
|
'mixed string': "7A D7 AA A4 F4 88 62 88 A5", |
||||
|
'mixed string 2': "62 D7 D7 D7 D7 62", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00CA': 0xCA, |
||||
|
'U+3007': 0x3007, |
||||
|
'U+5341': 0x5341, |
||||
|
'U+2561': 0x2561, |
||||
|
'U+256D': 0x256D, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
/* |
||||
|
Char 0 U+007A (1 byte) Offset 0 |
||||
|
Char 1 U+86CC (2 bytes) Offset 1 |
||||
|
Char 2 U+6C34 (2 bytes) Offset 3 |
||||
|
Char 3 U+00CA (0 bytes) Offset 5 |
||||
|
Char 4 U+0304 (2 bytes) Offset 5 |
||||
|
Char 5 U+00EA (0 bytes) Offset 7 |
||||
|
Char 6 U+030C (2 bytes) Offset 7 |
||||
|
End of string at char 7, offset 9 |
||||
|
*/ |
||||
|
0x007A, |
||||
|
0x86CC, |
||||
|
0x6C34, |
||||
|
// these four should be replaced with bytes 8862 88A5, which together produce four characters |
||||
|
0x00CA, |
||||
|
0x0304, |
||||
|
0x00EA, |
||||
|
0x030C, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,57 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=euc-jp> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// sanity checks |
||||
|
'sanity check': "40", |
||||
|
'former ASCII deviations': "5C 7E", |
||||
|
'changed multibyte index': "A1DD", |
||||
|
// JIS X 0201 |
||||
|
'JIS X 0201 range': "8EA1 8EDF", |
||||
|
'JIS X 0201 bogus range': "8EA0 8EE0", |
||||
|
'JIS X 0201 truncated character 1': "8E", |
||||
|
'JIS X 0201 truncated character 2': "8E 20", |
||||
|
'JIS X 0201 truncated character 3': "8E FF", |
||||
|
// JIS X 0212 |
||||
|
'JIS X 0212 assigned range': "8FA2AF 8FEDE3", |
||||
|
'JIS X 0212 total range': "8FA1A1 8FFEFE", |
||||
|
'JIS X 0212 bogus range 1': "8FA0A1 8FFFFE", |
||||
|
'JIS X 0212 bogus range 2': "8FA1A0 8FFEFF", |
||||
|
'JIS X 0212 truncated character 1': "8FA2", |
||||
|
'JIS X 0212 truncated character 2': "8FA2 20", |
||||
|
'JIS X 0212 truncated character 3': "8FA2 FF", |
||||
|
// JIS X 0208 |
||||
|
'JIS X 0208 assigned range': "A1A1 FCFE", |
||||
|
'JIS X 0208 total range': "A1A1 FEFE", |
||||
|
'JIS X 0208 bogus range': "A1A0 A0FE", |
||||
|
'JIS X 0208 truncated character 1': "A1", |
||||
|
'JIS X 0208 truncated character 2': "A1 20", |
||||
|
'JIS X 0208 truncated character 3': "A1 FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00A5': 0xA5, |
||||
|
'U+203E': 0x203E, |
||||
|
'U+3088': 0x3088, |
||||
|
'U+FF96': 0xFF96, |
||||
|
'U+2212': 0x2212, |
||||
|
'U+00E6': 0xE6, |
||||
|
'U+FFE2': 0xFFE2, |
||||
|
'U+2116': 0x2116, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0xFF96, |
||||
|
0x3088, |
||||
|
0xFF0D, |
||||
|
0x005C, |
||||
|
0xFF9B, |
||||
|
/* This code point is not encodable and must be entered manually as 8FB0EF */ |
||||
|
0x4F58, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,38 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=euc-kr> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// valid single characters |
||||
|
'sanity check': "40", |
||||
|
'two-byte character': "D7 D7", |
||||
|
// invalid sequences |
||||
|
'EOF after first byte': "D7", |
||||
|
'low byte after first byte': "D7 39", |
||||
|
'0x80 as first byte': "80 D7 00", |
||||
|
'0xFF as first byte': "FF D7 00", |
||||
|
'0x7F after first byte': "D7 7F", |
||||
|
'0xFF after first byte': "D7 FF", |
||||
|
'non-character': "A5 DC", |
||||
|
// mixed string |
||||
|
'mixed string': "7A D7 AA A4 F4 88 62 88 A5", |
||||
|
'mixed string 2': "62 D7 D7 D7 D7 62", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00CA': 0x00CA, |
||||
|
'U+ACF2': 0xACF2, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0xACF2, |
||||
|
0x0020, |
||||
|
0x6C34, |
||||
|
0x0391, |
||||
|
0x03C9, |
||||
|
0x002A, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,77 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=gb18030> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
// valid single characters |
||||
|
'sanity check': "40", |
||||
|
'special case for 0x80': "80", |
||||
|
'four-byte special case': "81 35 F4 37", |
||||
|
'two-byte character': "A8 4E", |
||||
|
'four-byte character': "82 31 A2 37", |
||||
|
// cut sequences |
||||
|
'EOF after first byte': "82", |
||||
|
'EOF after second byte': "82 30", |
||||
|
'EOF after third byte': "82 30 81", |
||||
|
// invalid sequences |
||||
|
'bad first byte': "FF 35 F4 37", |
||||
|
'bad second byte': "81 FF F4 37", |
||||
|
'bad third byte': "81 35 FF 37", |
||||
|
'bad fourth byte': "81 35 F4 FF", |
||||
|
'control first byte': "00 35 F4 37", |
||||
|
'control second byte': "81 00 F4 37", |
||||
|
'control third byte': "81 35 00 37", |
||||
|
'control fourth byte': "81 35 F4 00", |
||||
|
// invalid sequences with clean EOF |
||||
|
'bad first byte (padded)': "FF 35 F4 37 00 00 00 00", |
||||
|
'bad second byte (padded)': "81 FF F4 37 00 00 00 00", |
||||
|
'bad third byte (padded)': "81 35 FF 37 00 00 00 00", |
||||
|
'bad fourth byte (padded)': "81 35 F4 FF 00 00 00 00", |
||||
|
'control first byte (padded)': "00 35 F4 37 00 00 00 00", |
||||
|
'control second byte (padded)': "81 00 F4 37 00 00 00 00", |
||||
|
'control third byte (padded)': "81 35 00 37 00 00 00 00", |
||||
|
'control fourth byte (padded)': "81 35 F4 00 00 00 00 00", |
||||
|
// out-of-range sequences |
||||
|
'void sequence': "84 32 A4 39", |
||||
|
'void sequence 2': "FE 39 FE 39", |
||||
|
// backward seeking tests |
||||
|
'seek test 1': "81 81 81 30", |
||||
|
'seek test 2': "81 81 80", |
||||
|
'seek test 3': "81 81 00", |
||||
|
'seek test 4': "81 81 81 00", |
||||
|
'seek test 5': "81 30 30 30", |
||||
|
'seek test 6': "81 30 81 81", |
||||
|
'seek test 7': "30 30 81 81", |
||||
|
'seek test 8': "F8 83 FE 80", |
||||
|
'seek test 1 (padded)': "00 00 00 00 81 81 81 30 00 00 00 00", |
||||
|
'seek test 2 (padded)': "00 00 00 00 81 81 80 00 00 00 00", |
||||
|
'seek test 3 (padded)': "00 00 00 00 81 81 00 00 00 00 00", |
||||
|
'seek test 4 (padded)': "00 00 00 00 81 81 81 00 00 00 00 00", |
||||
|
'seek test 5 (padded)': "00 00 00 00 81 30 30 30 00 00 00 00", |
||||
|
'seek test 6 (padded)': "00 00 00 00 81 30 81 81 00 00 00 00", |
||||
|
'seek test 7 (padded)': "00 00 00 00 30 30 81 81 00 00 00 00", |
||||
|
'seek test 8 (padded)': "00 00 00 00 F8 83 FE 80 00 00 00 00", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+20AC': 0x20AC, |
||||
|
'U+2164': 0x2164, |
||||
|
'U+3A74': 0x3A74, |
||||
|
'U+E7C7': 0xE7C7, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+E5E5': 0xE5E5, |
||||
|
'U+3000': 0x3000, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0x00A2, |
||||
|
0x6C34, |
||||
|
0x1D11E, |
||||
|
0xF8FF, |
||||
|
0x10FFFD, |
||||
|
0xFFFE, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,17 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=gbk> |
||||
|
<script> |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+20AC': 0x20AC, |
||||
|
'U+2164': 0x2164, |
||||
|
'U+3A74': 0x3A74, |
||||
|
'U+E7C7': 0xE7C7, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+E5E5': 0xE5E5, |
||||
|
'U+3000': 0x3000, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,46 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=iso-2022-jp> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
'Implied ASCII mode': "00 30 5C 7E 21 5F", |
||||
|
'Explicit ASCII mode': "1B2842 00 30 5C 7E 21 5F", |
||||
|
'Roman mode': "1B284A 00 30 5C 7E 21 5F", |
||||
|
'Katakana mode': "1B2849 00 30 5C 7E 21 5F", |
||||
|
'Double-byte mode 1': "1B2440 00 30 5C 7E 21 5F", |
||||
|
'Double-byte mode 2': "1B2442 00 30 5C 7E 21 5F", |
||||
|
'Multiple modes': "5C 1B2849 21 1B2440 305C 1B284A 5C 1B2842 5C", |
||||
|
'Double escape': "1B2849 1B2842 5C", |
||||
|
'Triple escape': "1B2849 1B2842 1B284A 5C", |
||||
|
'Trailing escape': "20 1B284A 30 33 1B2849", |
||||
|
'Invalid bytes': "80 FF 1B2849 00 20 7F 1B2442 00 2100 FF FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0020': [0x20], |
||||
|
'U+005C': [0x5C], |
||||
|
'U+007E': [0x7E], |
||||
|
'U+00A5': [0xA5], |
||||
|
'U+203E': [0x203E], |
||||
|
'U+FF61': [0xFF61], |
||||
|
'U+FF9F': [0xFF9F], |
||||
|
'U+2212': [0x2212], |
||||
|
'U+2116': [0x2116], |
||||
|
'U+FFE2': [0xFFE2], |
||||
|
'U+00C6': [0xC6], |
||||
|
'U+FFFD': [0xFFFD], |
||||
|
'Roman': [0xA5, 0x20, 0x203E], |
||||
|
'Roman to ASCII': [0xA5, 0x5C], |
||||
|
'Roman to error': [0xA5, 0x80], |
||||
|
'JIS': [0x2116, 0xFFE2, 0x2212], |
||||
|
'JIS to Roman': [0x2116, 0xA5], |
||||
|
'JIS to ASCII 1': [0x2116, 0x20], |
||||
|
'JIS to ASCII 2': [0x2116, 0x5C], |
||||
|
'JIS to error 1': [0x2116, 0x80], |
||||
|
'JIS to error 2': [0x2116, 0x1B], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
||||
|
'Escape characters': [0x1B, 0xE, 0xF], // Even Firefox is wrong here; see https://github.com/web-platform-tests/wpt/pull/26158 |
||||
|
'-1': [-1], |
||||
|
'0x110000': [0x110000], |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,42 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=shift_jis> |
||||
|
<!-- Chromium does NOT produce correct results as of this writing; use Firefox to generate test data --> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
'empty string': "", |
||||
|
'sanity check': "40", |
||||
|
'former ASCII deviations': "5C 7E", |
||||
|
'JIS X 0201 range': "A1 DF", |
||||
|
'EUDC range': "F040 F9FC", |
||||
|
'JIS X 0208 assigned range': "8140 FC4B", |
||||
|
'JIS X 0208 total range': "8140 FCFC", |
||||
|
'JIS X 0208 truncated character 1': "81", |
||||
|
'JIS X 0208 truncated character 2': "81 20", |
||||
|
'JIS X 0208 truncated character 3': "81 FF", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+0064': 0x64, |
||||
|
'U+00A5': 0xA5, |
||||
|
'U+203E': 0x203E, |
||||
|
'U+3088': 0x3088, |
||||
|
'U+FF96': 0xFF96, |
||||
|
'U+2212': 0x2212, |
||||
|
'U+00E6': 0xE6, |
||||
|
'U+FFE2': 0xFFE2, |
||||
|
'U+2116': 0x2116, |
||||
|
'U+E000': 0xE000, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0xFF96, |
||||
|
0x3088, |
||||
|
0xFF0D, |
||||
|
0x005C, |
||||
|
0xFF9B, |
||||
|
/* This code point is not encodable and must be entered manually as F040 */ |
||||
|
0xE000, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,20 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=utf-16> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
// control samples |
||||
|
'empty string': "", |
||||
|
'sanity check': "6100 6200 6300 3100 3200 3300", |
||||
|
'mixed sample': "7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", |
||||
|
// unexpected EOF |
||||
|
'EOF in BMP character': "0000 FF", |
||||
|
'EOF after lead surrogate': "0000 34D8", |
||||
|
'EOF in trail surrogate': "0000 34D8 1E", |
||||
|
// invalid UTF-16 surrogates |
||||
|
'lead surrogate without trail': "34D8 0000", |
||||
|
'trail surrogate without lead': "1EDD 0000", |
||||
|
'double lead surrogate': "34D8 34D8 1EDD", |
||||
|
'double trail surrogate': "34D8 1EDD 1EDD", |
||||
|
}; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,70 @@ |
|||||
|
<!DOCTYPE html> |
||||
|
<meta charset=utf-8> |
||||
|
<script> |
||||
|
var sampleStrings = { |
||||
|
// control samples |
||||
|
'empty string': "", |
||||
|
'sanity check': "61 62 63 31 32 33", |
||||
|
'multibyte control': "E5 8F A4 E6 B1 A0 E3 82 84 E8 9B 99 E9 A3 9B E3 81 B3 E8 BE BC E3 82 80 E6 B0 B4 E3 81 AE E9 9F B3", |
||||
|
'mixed sample': "7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE", |
||||
|
// various invalid sequences |
||||
|
'invalid code': "FF", |
||||
|
'ends early': "C0", |
||||
|
'ends early 2': "E0", |
||||
|
'invalid trail': "C0 00", |
||||
|
'invalid trail 2': "C0 C0", |
||||
|
'invalid trail 3': "E0 00", |
||||
|
'invalid trail 4': "E0 C0", |
||||
|
'invalid trail 5': "E0 80 00", |
||||
|
'invalid trail 6': "E0 80 C0", |
||||
|
'> 0x10FFFF': "FC 80 80 80 80 80", |
||||
|
'obsolete lead byte': "FE 80 80 80 80 80", |
||||
|
'overlong U+0000 - 2 bytes': "C0 80", |
||||
|
'overlong U+0000 - 3 bytes': "E0 80 80", |
||||
|
'overlong U+0000 - 4 bytes': "F0 80 80 80", |
||||
|
'overlong U+0000 - 5 bytes': "F8 80 80 80 80", |
||||
|
'overlong U+0000 - 6 bytes': "FC 80 80 80 80 80", |
||||
|
'overlong U+007F - 2 bytes': "C1 BF", |
||||
|
'overlong U+007F - 3 bytes': "E0 81 BF", |
||||
|
'overlong U+007F - 4 bytes': "F0 80 81 BF", |
||||
|
'overlong U+007F - 5 bytes': "F8 80 80 81 BF", |
||||
|
'overlong U+007F - 6 bytes': "FC 80 80 80 81 BF", |
||||
|
'overlong U+07FF - 3 bytes': "E0 9F BF", |
||||
|
'overlong U+07FF - 4 bytes': "F0 80 9F BF", |
||||
|
'overlong U+07FF - 5 bytes': "F8 80 80 9F BF", |
||||
|
'overlong U+07FF - 6 bytes': "FC 80 80 80 9F BF", |
||||
|
'overlong U+FFFF - 4 bytes': "F0 8F BF BF", |
||||
|
'overlong U+FFFF - 5 bytes': "F8 80 8F BF BF", |
||||
|
'overlong U+FFFF - 6 bytes': "FC 80 80 8F BF BF", |
||||
|
'overlong U+10FFFF - 5 bytes': "F8 84 8F BF BF", |
||||
|
'overlong U+10FFFF - 6 bytes': "FC 80 84 8F BF BF", |
||||
|
// UTF-16 surrogates |
||||
|
'lead surrogate': "ED A0 80", |
||||
|
'trail surrogate': "ED B0 80", |
||||
|
'surrogate pair': "ED A0 80 ED B0 80", |
||||
|
// self-sync edge cases |
||||
|
'trailing continuation': "0A 80 80", |
||||
|
'trailing continuation 2': "E5 8F A4 80", |
||||
|
}; |
||||
|
var sampleCharacters = { |
||||
|
'U+007A': 0x007A, |
||||
|
'U+00A2': 0x00A2, |
||||
|
'U+6C34': 0x6C34, |
||||
|
'U+1D11E': 0x1D11E, |
||||
|
'U+F8FF': 0xF8FF, |
||||
|
'U+10FFFD': 0x10FFFD, |
||||
|
'U+FFFE': 0xFFFE, |
||||
|
'-1': -1, |
||||
|
'0x110000': 0x110000, |
||||
|
}; |
||||
|
var seekCodePoints = [ |
||||
|
0x007A, |
||||
|
0x00A2, |
||||
|
0x6C34, |
||||
|
0x1D11E, |
||||
|
0xF8FF, |
||||
|
0x10FFFD, |
||||
|
0xFFFE, |
||||
|
]; |
||||
|
</script> |
||||
|
<script src="test.js"></script> |
@ -0,0 +1,177 @@ |
|||||
|
"use strict"; |
||||
|
// set up the output pre-formatted text element
|
||||
|
window.out = document.createElement("pre"); |
||||
|
document.documentElement.appendChild(out); |
||||
|
|
||||
|
var encoding = document.getElementsByTagName("meta")[0].getAttribute("charset"); |
||||
|
|
||||
|
/**
 * Encodes a single code point by placing it in the query string of an anchor
 * element's URL and reading back what the browser serialized.
 *
 * NOTE(review): this presumably relies on the browser encoding URL queries
 * using the document's character encoding -- confirm against the URL Standard.
 *
 * @param {number} code - The code point to encode
 * @param {boolean} fatal - Whether an unencodable code point should be reported as an exception rather than as an HTML character reference
 * @returns {string[]|string} An array of two-digit uppercase hexadecimal strings (one per byte) on success.
 *   Otherwise a string: PHP source constructing an EncoderException (code point out of range, or unencodable
 *   in fatal mode), or the URI-decoded query (e.g. "&#27700;") when unencodable in non-fatal mode
 */
function encodeCodePoint(code, fatal) {
    if (code < 0 || code > 0x10FFFF) {
        return 'new EncoderException("", Coder::E_INVALID_CODE_POINT)';
    }
    var l = document.createElement("a");
    l.href = "http://example.com/?" + String.fromCodePoint(code) + "#";
    var bytes = [];
    let url = l.search.substr(1);
    for (let a = 0; a < url.length; a++) {
        let escape = url.substr(a + 1, 2);
        if (url.substr(a, 6) == "%26%23" || url.substr(a, 2) == "&#") {
            // a numeric character reference (escaped or raw) was substituted: the character cannot be encoded;
            // requiring the "#" keeps a literal ampersand (U+0026) from being mistaken for one
            if (fatal) {
                return 'new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)';
            } else {
                return decodeURIComponent(url);
            }
        } else if (url.charAt(a) == "%" && /^[0-9A-Fa-f]{2}$/.test(escape)) {
            // a percent escape represents exactly one byte
            bytes.push(escape.toUpperCase());
            a = a + 2;
        } else {
            // any other character (including a literal percent sign) stands for itself
            bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0").toUpperCase());
        }
    }
    return bytes;
}
||||
|
|
||||
|
/**
 * Encodes a sequence of code points as a single string by placing it in the
 * query string of an anchor element's URL and reading back what the browser
 * serialized.
 *
 * The serialized query is parsed byte-wise: each percent escape is one byte,
 * and every other character stands for itself. Decoding the query with
 * decodeURIComponent() instead would interpret the escapes as UTF-8, yielding
 * code units rather than bytes, or throwing on byte sequences which are not
 * valid UTF-8.
 *
 * @param {number[]} codes - The code points to encode
 * @param {boolean} fatal - Whether an unencodable code point should be reported as an exception; otherwise the bytes of the substituted "&#...;" character reference are included in the output
 * @returns {string[]|string} An array of two-digit uppercase hexadecimal strings (one per byte) on success,
 *   or PHP source constructing an EncoderException if any code point is out of range or (in fatal mode) unencodable
 */
function encodeCodePoints(codes, fatal) {
    for (let a = 0; a < codes.length; a++) {
        if (codes[a] < 0 || codes[a] > 0x10FFFF) {
            return 'new EncoderException("", Coder::E_INVALID_CODE_POINT)';
        }
    }
    var l = document.createElement("a");
    l.href = "http://example.com/?" + String.fromCodePoint(...codes) + "#";
    let url = l.search.substr(1);
    // a numeric character reference (escaped or raw) marks a character which could not be encoded
    if (fatal && (url.indexOf("%26%23") > -1 || url.indexOf("&#") > -1)) {
        return 'new EncoderException("", Coder::E_UNAVAILABLE_CODE_POINT)';
    }
    var bytes = [];
    for (let a = 0; a < url.length; a++) {
        let escape = url.substr(a + 1, 2);
        if (url.charAt(a) == "%" && /^[0-9A-Fa-f]{2}$/.test(escape)) {
            // a percent escape represents exactly one byte
            bytes.push(escape.toUpperCase());
            a = a + 2;
        } else {
            // any other character (including a literal percent sign) stands for itself
            bytes.push(url.charCodeAt(a).toString(16).padStart(2, "0").toUpperCase());
        }
    }
    return bytes;
}
||||
|
|
||||
|
/**
 * Encodes a code point (or an array of code points) and formats the result as
 * a PHP expression suitable for pasting into a test data provider.
 *
 * @param {number|number[]} code - A single code point, or a sequence of code points
 * @param {boolean} fatal - Passed through to the encoding function
 * @returns {string} A double-quoted, uppercase, space-separated hex string when bytes were produced;
 *   a bin2hex("...") call when the encoder returned text starting with "&"; otherwise the encoder's
 *   string result unchanged
 */
function wrapCodePoint(code, fatal) {
    // numbers are encoded singly; anything else is treated as a sequence
    const encoded = (typeof code === "number")
        ? encodeCodePoint(code, fatal)
        : encodeCodePoints(code, fatal);
    if (Array.isArray(encoded)) {
        return ('"' + encoded.join(" ") + '"').toUpperCase();
    }
    return (encoded.charAt(0) == "&") ? 'bin2hex("' + encoded + '")' : encoded;
}
||||
|
|
||||
|
// For every sample byte string, decode it with the browser's decoder for the
// encoding under test and print a PHP array entry of the form
//   'name' => ["HEX BYTES", [code points]],
if (typeof sampleStrings != 'undefined') {
    var decoder = new TextDecoder(encoding);
    for (let name in sampleStrings) {
        // strip whitespace, then read the hex digits two at a time, one pair per byte
        let hex = sampleStrings[name].replace(/\s/g, "");
        let octets = (hex.match(/.{1,2}/g) || []).map(function(pair) {
            return parseInt(pair, 16);
        });
        let text = decoder.decode(new Uint8Array(octets));
        // walk the decoded text one code point at a time; any value in the
        // surrogate range (0xD800 through 0xDFFF) is left out of the list
        let codes = Array.from(text, function(ch) {
            return ch.codePointAt(0);
        }).filter(function(point) {
            return point < 0xD800 || point > 0xDFFF;
        }).join(", ");
        // the original (spaced) hex string is echoed back verbatim
        let line = "'" + name + "' => [" + '"' + sampleStrings[name] + '", [' + codes + "]],\n";
        out.appendChild(document.createTextNode(line));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
||||
|
|
||||
|
// For every sample code point (or array of code points), print two PHP array
// entries: one for non-fatal ("HTML") mode and one for fatal mode, each of the form
//   'name (mode)' => [fatal, code point(s), expected output],
if (typeof sampleCharacters != 'undefined') {
    // the loop variable is declared explicitly: left undeclared it would resolve
    // to (and overwrite) the global "name" property
    for (let name in sampleCharacters) {
        let code = sampleCharacters[name];
        let displayCode;
        if (typeof code == "number" && code > -1 && code % 1 == 0) {
            // non-negative integers are displayed in hexadecimal
            displayCode = "0x" + code.toString(16).toUpperCase();
        } else if (typeof code !== "number") {
            // a sequence of code points: format each member, working on a copy
            displayCode = [...code];
            for (let a = 0; a < displayCode.length; a++) {
                if (displayCode[a] > -1 && displayCode[a] % 1 == 0) {
                    displayCode[a] = "0x" + displayCode[a].toString(16).toUpperCase();
                }
            }
            displayCode = "[" + displayCode.join(", ") + "]";
        } else {
            // negative or fractional numbers are displayed as-is
            displayCode = code;
        }
        let line1 = "'" + name + " (HTML)' => [false, " + displayCode + ", " + wrapCodePoint(code, false) + "],\n";
        let line2 = "'" + name + " (fatal)' => [true, " + displayCode + ", " + wrapCodePoint(code, true) + "],\n";
        out.appendChild(document.createTextNode(line1));
        out.appendChild(document.createTextNode(line2));
    }
    out.appendChild(document.createTextNode("\n\n"));
}
||||
|
|
||||
|
// Encode the seek-test code points and print, as PHP source: a comment
// summarizing each character's length and offset, the encoded string, the
// list of code points, and the list of byte offsets
if (typeof seekCodePoints != 'undefined') {
    // record each code point's encoded bytes, byte length, and starting byte offset
    var stats = [];
    var a = 0;
    var offset = 0;
    for (const code of seekCodePoints) {
        const encoded = encodeCodePoint(code, true);
        const encodable = Array.isArray(encoded);
        // a code point which could not be encoded is recorded as the
        // placeholder "()" and counted as a single byte
        const size = encodable ? encoded.length : 1;
        stats[a] = {
            'code': code,
            'offset': offset,
            'length': size,
            'bytes': encodable ? encoded.join("").toUpperCase() : "()",
        };
        offset += size;
        a++;
    }
    // character count and byte offset of the end of the string
    var end = [a, offset];

    // one summary line per character, wrapped in a block comment
    var comment = "/*\n" + stats.map(function(entry, index) {
        const size = (entry.length == 1) ? "(1 byte) " : "(" + entry.length + " bytes)";
        const label = entry.code.toString(16).padStart(4, "0").padEnd(6, " ").toUpperCase();
        return " Char " + index + " U+" + label + " " + size + " Offset " + entry.offset + "\n";
    }).join("");
    comment = comment + " End of string at char " + end[0] + ", offset " + end[1] + "\n";
    comment = comment + "*/\n";

    // the encoded string, with each character's bytes separated by a space
    var bytes = 'protected $seekString = "' + stats.map(function(entry) {
        return entry.bytes;
    }).join(" ") + '";' + "\n";

    // the code points, in hexadecimal
    var codes = 'protected $seekCodes = [' + stats.map(function(entry) {
        return "0x" + entry.code.toString(16).toUpperCase();
    }).join(", ") + "];\n";

    // the starting offset of each character, followed by the end-of-string offset
    var offs = 'protected $seekOffsets = [' + stats.map(function(entry) {
        return entry.offset;
    }).concat([end[1]]).join(", ") + "];\n";

    // output the results in order
    [comment, bytes, codes, offs].forEach(function(text) {
        out.appendChild(document.createTextNode(text));
    });
}
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||||
{ |
{ |
||||
"require": { |
"require": { |
||||
"phpunit/phpunit": "^6.5" |
"phpunit/phpunit": "^8.5 | ^9.0" |
||||
} |
} |
||||
} |
} |
||||
|
File diff suppressed because it is too large
@ -1,5 +1,5 @@ |
|||||
{ |
{ |
||||
"require": { |
"require": { |
||||
"consolidation/robo": "^1.1" |
"consolidation/robo": "^4.0" |
||||
} |
} |
||||
} |
} |
||||
|
File diff suppressed because it is too large
Loading…
Reference in new issue