Optionally allow surrogates
Also removed unnecessary docblocks
This commit is contained in:
parent
2e47fde774
commit
200a310f72
18 changed files with 133 additions and 141 deletions
|
@ -1,3 +1,10 @@
|
|||
Version 0.6.0 (2019-12-18)
|
||||
==========================
|
||||
|
||||
New features:
|
||||
- Added $allowSurrogates parameter to Encoding constructor
|
||||
- Added posErr public instance property to Encoding
|
||||
|
||||
Version 0.5.0 (2019-12-13)
|
||||
==========================
|
||||
|
||||
|
|
|
@ -24,12 +24,6 @@ class Big5 implements StatelessEncoding {
|
|||
protected $bufferedCode = 0;
|
||||
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
$this->posChar++;
|
||||
if ($this->bufferedCode > 0) {
|
||||
|
@ -70,8 +64,10 @@ class Big5 implements StatelessEncoding {
|
|||
return $code;
|
||||
} else {
|
||||
if ($b < 0x80) {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
|
||||
} else {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
|
||||
}
|
||||
}
|
||||
|
@ -85,16 +81,11 @@ class Big5 implements StatelessEncoding {
|
|||
} else {
|
||||
// dirty EOF
|
||||
$this->dirtyEOF = 1;
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the encoding of $codePoint as a byte string
|
||||
*
|
||||
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
|
||||
*/
|
||||
public static function encode(int $codePoint, bool $fatal = true): string {
|
||||
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
|
||||
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
|
||||
|
|
|
@ -27,12 +27,6 @@ class EUCKR implements StatelessEncoding {
|
|||
protected $dirtyEOF = 0;
|
||||
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
$this->posChar++;
|
||||
$lead = 0x00;
|
||||
|
@ -42,6 +36,7 @@ class EUCKR implements StatelessEncoding {
|
|||
if ($b < 0x80) {
|
||||
return $b;
|
||||
} elseif ($b == 0x80 || $b == 0xFF) {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
|
||||
} else {
|
||||
$lead = $b;
|
||||
|
@ -57,8 +52,10 @@ class EUCKR implements StatelessEncoding {
|
|||
return $code;
|
||||
} else {
|
||||
if ($b < 0x80) {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
|
||||
} else {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
|
||||
}
|
||||
}
|
||||
|
@ -72,16 +69,11 @@ class EUCKR implements StatelessEncoding {
|
|||
} else {
|
||||
// dirty EOF
|
||||
$this->dirtyEOF = 1;
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the encoding of $codePoint as a byte string
|
||||
*
|
||||
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
|
||||
*/
|
||||
public static function encode(int $codePoint, bool $fatal = true): string {
|
||||
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
|
||||
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
|
||||
|
|
|
@ -19,10 +19,10 @@ interface Encoding {
|
|||
const E_UNAVAILABLE_CODE_POINT = 4;
|
||||
|
||||
/** Constructs a new decoder
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
|
||||
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
|
||||
* @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
|
||||
*/
|
||||
public function __construct(string $string, bool $fatal = false);
|
||||
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
|
||||
|
||||
/** Returns the current byte position of the decoder */
|
||||
public function posByte(): int;
|
||||
|
@ -40,15 +40,15 @@ interface Encoding {
|
|||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
* @return int|false
|
||||
*/
|
||||
public function nextCode();
|
||||
|
||||
/** Advance $distance characters through the string
|
||||
*
|
||||
* If $distance is negative, the operation will be performed in reverse
|
||||
*
|
||||
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
|
||||
*
|
||||
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
|
||||
*/
|
||||
public function seek(int $distance): int;
|
||||
|
||||
|
@ -58,10 +58,16 @@ interface Encoding {
|
|||
*/
|
||||
public function rewind();
|
||||
|
||||
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
|
||||
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
|
||||
*
|
||||
* @param int $num The number of characters to retrieve
|
||||
*/
|
||||
public function peekChar(int $num = 1): string;
|
||||
|
||||
/** Retrieves the next $num code points from the string, without advancing the character pointer */
|
||||
/** Retrieves the next $num code points from the string, without advancing the character pointer
|
||||
*
|
||||
* @param int $num The number of code points to retrieve
|
||||
*/
|
||||
public function peekCode(int $num = 1): array;
|
||||
|
||||
/** Calculates the length of the string in bytes */
|
||||
|
|
|
@ -15,12 +15,6 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
|
||||
protected $dirtyEOF = 0;
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
$first = 0;
|
||||
$second = 0;
|
||||
|
@ -37,6 +31,7 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
$first = $b;
|
||||
continue;
|
||||
} else {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
|
||||
}
|
||||
} elseif ($second === 0) {
|
||||
|
@ -49,8 +44,10 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
$pointer = ($first - 0x81) * 190 + ($b - $offset);
|
||||
return self::TABLE_GBK[$pointer];
|
||||
} elseif ($b < 0x80) {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
|
||||
} else {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
|
||||
}
|
||||
}
|
||||
|
@ -60,6 +57,7 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
continue;
|
||||
} else {
|
||||
$this->posByte -= 2;
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
|
||||
}
|
||||
} else {
|
||||
|
@ -79,10 +77,12 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
if (isset($codePointOffset)) {
|
||||
return $codePointOffset + $pointer - $offset;
|
||||
} else {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
|
||||
}
|
||||
} else {
|
||||
$this->posByte -= 3;
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
|
||||
}
|
||||
}
|
||||
|
@ -95,16 +95,11 @@ abstract class GBCommon implements StatelessEncoding {
|
|||
} else {
|
||||
// dirty EOF; note how many bytes the last character had
|
||||
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the encoding of $codePoint as a byte string
|
||||
*
|
||||
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
|
||||
*/
|
||||
public static function encode(int $codePoint, bool $fatal = true): string {
|
||||
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
|
||||
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
|
||||
|
|
|
@ -13,40 +13,30 @@ trait GenericEncoding {
|
|||
protected $lenByte = null;
|
||||
protected $lenChar = null;
|
||||
protected $errMode = self::MODE_REPLACE;
|
||||
protected $allowSurrogates = false;
|
||||
|
||||
/** Constructs a new decoder
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
|
||||
*/
|
||||
public function __construct(string $string, bool $fatal = false) {
|
||||
public $posErr = 0;
|
||||
|
||||
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
|
||||
$this->string = $string;
|
||||
$this->lenByte = strlen($string);
|
||||
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
|
||||
$this->allowSurrogates = $allowSurrogates;
|
||||
}
|
||||
|
||||
/** Returns the current byte position of the decoder */
|
||||
public function posByte(): int {
|
||||
return $this->posByte;
|
||||
}
|
||||
|
||||
/** Returns the current character position of the decoder */
|
||||
public function posChar(): int {
|
||||
return $this->posChar;
|
||||
}
|
||||
|
||||
/** Seeks to the start of the string
|
||||
*
|
||||
* This is usually faster than using the seek method for the same purpose
|
||||
*/
|
||||
public function rewind() {
|
||||
$this->posByte = 0;
|
||||
$this->posChar = 0;
|
||||
}
|
||||
|
||||
/** Retrieve the next character in the string, in UTF-8 encoding
|
||||
*
|
||||
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
|
||||
*/
|
||||
public function nextChar(): string {
|
||||
// get the byte at the current position
|
||||
$b = @$this->string[$this->posByte];
|
||||
|
@ -64,12 +54,6 @@ trait GenericEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Advance $distance characters through the string
|
||||
*
|
||||
* If $distance is negative, the operation will be performed in reverse
|
||||
*
|
||||
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
|
||||
*/
|
||||
public function seek(int $distance): int {
|
||||
if ($distance > 0) {
|
||||
if ($this->posByte == strlen($this->string)) {
|
||||
|
@ -94,7 +78,6 @@ trait GenericEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
|
||||
public function peekChar(int $num = 1): string {
|
||||
$out = "";
|
||||
$state = $this->stateSave();
|
||||
|
@ -108,7 +91,6 @@ trait GenericEncoding {
|
|||
return $out;
|
||||
}
|
||||
|
||||
/** Retrieves the next $num code points from the string, without advancing the character pointer */
|
||||
public function peekCode(int $num = 1): array {
|
||||
$out = [];
|
||||
$state = $this->stateSave();
|
||||
|
@ -122,15 +104,10 @@ trait GenericEncoding {
|
|||
return $out;
|
||||
}
|
||||
|
||||
/** Calculates the length of the string in bytes */
|
||||
public function lenByte(): int {
|
||||
return $this->lenByte;
|
||||
}
|
||||
|
||||
/** Calculates the length of the string in code points
|
||||
*
|
||||
* Note that this may involve processing to the end of the string
|
||||
*/
|
||||
public function lenChar(): int {
|
||||
return $this->lenChar ?? (function() {
|
||||
$state = $this->stateSave();
|
||||
|
@ -141,19 +118,16 @@ trait GenericEncoding {
|
|||
})();
|
||||
}
|
||||
|
||||
/** Returns whether the character pointer is at the end of the string */
|
||||
public function eof(): bool {
|
||||
return $this->posByte >= $this->lenByte;
|
||||
}
|
||||
|
||||
/** Generates an iterator which steps through each character in the string */
|
||||
public function chars(): \Generator {
|
||||
while (($c = $this->nextChar()) !== "") {
|
||||
yield ($this->posChar - 1) => $c;
|
||||
}
|
||||
}
|
||||
|
||||
/** Generates an iterator which steps through each code point in the string */
|
||||
public function codes(): \Generator {
|
||||
while (($c = $this->nextCode()) !== false) {
|
||||
yield ($this->posChar - 1) => $c;
|
||||
|
@ -165,6 +139,7 @@ trait GenericEncoding {
|
|||
return [
|
||||
'posChar' => $this->posChar,
|
||||
'posByte' => $this->posByte,
|
||||
'posErr' => $this->posErr,
|
||||
];
|
||||
}
|
||||
|
||||
|
@ -191,7 +166,7 @@ trait GenericEncoding {
|
|||
// fatal replacement mode for decoders
|
||||
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
|
||||
case self::MODE_FATAL_ENC:
|
||||
// fatal replacement mode for decoders; not applicable to Unicode transformation formats
|
||||
// fatal replacement mode for encoders; not applicable to Unicode transformation formats
|
||||
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
|
||||
default:
|
||||
// indicative of internal bug; should never be triggered
|
||||
|
|
|
@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
|
|||
abstract class SingleByteEncoding implements StatelessEncoding {
|
||||
use GenericEncoding;
|
||||
|
||||
/** Retrieve the next character in the string, in UTF-8 encoding
|
||||
*
|
||||
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
|
||||
*/
|
||||
public function nextChar(): string {
|
||||
// get the byte at the current position
|
||||
$b = @$this->string[$this->posChar];
|
||||
|
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
// get the byte at the current position
|
||||
$b = @$this->string[$this->posChar];
|
||||
|
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Returns the encoding of $codePoint as a byte string
|
||||
*
|
||||
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
|
||||
*/
|
||||
public static function encode(int $codePoint, bool $fatal = true): string {
|
||||
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
|
||||
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
|
||||
|
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Advance $distance characters through the string
|
||||
*
|
||||
* If $distance is negative, the operation will be performed in reverse
|
||||
*
|
||||
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
|
||||
*/
|
||||
public function seek(int $distance): int {
|
||||
if ($distance > 0) {
|
||||
while ($this->posChar < $this->lenByte && $distance > 0) {
|
||||
|
@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Returns the current byte position of the decoder */
|
||||
public function posByte(): int {
|
||||
return $this->posChar;
|
||||
}
|
||||
|
||||
/** Calculates the length of the string in code points
|
||||
*
|
||||
* Note that this may involve processing to the end of the string
|
||||
*/
|
||||
public function lenChar(): int {
|
||||
return $this->lenByte;
|
||||
}
|
||||
|
||||
/** Returns whether the character pointer is at the end of the string */
|
||||
public function eof(): bool {
|
||||
return $this->posChar >= $this->lenByte;
|
||||
}
|
||||
|
|
|
@ -11,12 +11,6 @@ abstract class UTF16 implements Encoding {
|
|||
|
||||
protected $dirtyEOF = 0;
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
$lead_b = null;
|
||||
$lead_s = null;
|
||||
|
@ -36,6 +30,9 @@ abstract class UTF16 implements Encoding {
|
|||
if (!is_null($lead_s)) {
|
||||
if ($code >= 0xDC00 && $code <= 0xDFFF) {
|
||||
return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
|
||||
} elseif ($this->allowSurrogates) {
|
||||
$this->posByte -= 2;
|
||||
return $lead_s;
|
||||
} else {
|
||||
$this->posByte -= 2;
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
|
||||
|
@ -45,7 +42,11 @@ abstract class UTF16 implements Encoding {
|
|||
$lead_s = $code;
|
||||
continue;
|
||||
} elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
|
||||
if ($this->allowSurrogates) {
|
||||
return $code;
|
||||
} else {
|
||||
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
|
||||
}
|
||||
} else {
|
||||
return $code;
|
||||
}
|
||||
|
@ -65,10 +66,6 @@ abstract class UTF16 implements Encoding {
|
|||
}
|
||||
}
|
||||
|
||||
/** Retrieve the next character in the string, in UTF-8 encoding
|
||||
*
|
||||
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
|
||||
*/
|
||||
public function nextChar(): string {
|
||||
// get the byte at the current position
|
||||
$b = @$this->string[$this->posByte];
|
||||
|
|
|
@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
|
|||
const NAME = "UTF-8";
|
||||
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
|
||||
|
||||
/** Decodes the next character from the string and returns its code point number
|
||||
*
|
||||
* If the end of the string has been reached, false is returned
|
||||
*
|
||||
* @return int|bool
|
||||
*/
|
||||
public function nextCode() {
|
||||
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
|
||||
// optimization for ASCII characters
|
||||
|
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
|
|||
if ($b==0xE0) {
|
||||
$lower = 0xA0;
|
||||
} elseif ($b==0xED) {
|
||||
$upper = 0x9F;
|
||||
$upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
|
||||
}
|
||||
$point = $b & 0xF;
|
||||
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
|
||||
|
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
|
|||
}
|
||||
$point = $b & 0x7;
|
||||
} else { // invalid byte
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar, $this->posByte]);
|
||||
}
|
||||
} elseif ($b < $lower || $b > $upper) {
|
||||
$this->posErr = $this->posChar;
|
||||
return self::err($this->errMode, [$this->posChar, $this->posByte--]);
|
||||
} else {
|
||||
$lower = 0x80;
|
||||
|
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
|
|||
return $point;
|
||||
}
|
||||
|
||||
/** Returns the encoding of $codePoint as a byte string
|
||||
*
|
||||
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
|
||||
*
|
||||
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
|
||||
*/
|
||||
public static function encode(int $codePoint, bool $fatal = true): string {
|
||||
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
|
||||
if ($codePoint < 0 || $codePoint > 0x10FFFF) {
|
||||
|
|
|
@ -128,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @coversNothing
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideCodePoints() {
|
||||
return [
|
||||
'U+0064 (HTML)' => [false, 0x64, "64"],
|
||||
|
|
|
@ -128,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @coversNothing
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideCodePoints() {
|
||||
return [
|
||||
'U+0064 (HTML)' => [false, 0x64, "64"],
|
||||
|
|
|
@ -136,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @coversNothing
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideCodePoints() {
|
||||
// bytes confirmed using Firefox
|
||||
$series_gb18030 = [
|
||||
|
|
|
@ -205,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @coversNothing
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
|
||||
$this->testedClass = $class;
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
|
||||
}
|
||||
|
||||
public function provideClasses() {
|
||||
foreach (self::$classes as $name => $class) {
|
||||
yield $name => [$class];
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
declare(strict_types=1);
|
||||
namespace MensBeam\Intl\TestCase\Encoding;
|
||||
|
||||
use MensBeam\Intl\Encoding\UTF16LE;
|
||||
use MensBeam\Intl\Encoding\UTF16BE;
|
||||
|
||||
class TestUTF16BE extends TestUTF16LE {
|
||||
|
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {
|
|||
|
||||
public function provideStrings() {
|
||||
foreach (parent::provideStrings() as $name => $test) {
|
||||
list($string, $codes) = $test;
|
||||
if (sizeof($test) == 2) {
|
||||
$test[] = null;
|
||||
}
|
||||
list($string, $codes, $altCodes) = $test;
|
||||
$words = explode(" ", $string);
|
||||
foreach ($words as $a => $word) {
|
||||
if (strlen($word) == 4) {
|
||||
|
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
|
|||
}
|
||||
}
|
||||
$string = implode(" ", $words);
|
||||
yield $name => [$string, $codes];
|
||||
yield $name => [$string, $codes, $altCodes];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,7 +7,6 @@ declare(strict_types=1);
|
|||
namespace MensBeam\Intl\TestCase\Encoding;
|
||||
|
||||
use MensBeam\Intl\Encoding\UTF16LE;
|
||||
use MensBeam\Intl\Encoding\UTF16BE;
|
||||
|
||||
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
|
||||
protected $testedClass = UTF16LE::class;
|
||||
|
@ -119,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @covers MensBeam\Intl\Encoding\UTF16::nextCode
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideStrings() {
|
||||
return [
|
||||
// control samples
|
||||
|
@ -130,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
|
|||
'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
|
||||
'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
|
||||
// invalid UTF-16 surrogates
|
||||
'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
|
||||
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
|
||||
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
|
||||
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
|
||||
'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
|
||||
'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
|
||||
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
|
||||
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
|
||||
];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -128,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @covers MensBeam\Intl\Encoding\UTF8::nextCode
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideCodePoints() {
|
||||
return [
|
||||
'U+007A (HTML)' => [false, 0x7A, "7A"],
|
||||
|
@ -190,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
|
|||
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
|
||||
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
|
||||
// UTF-16 surrogates
|
||||
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]],
|
||||
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]],
|
||||
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]],
|
||||
// surrogates have alternate outputs for when surrogates are being allowed
|
||||
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
|
||||
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
|
||||
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
|
||||
// self-sync edge cases
|
||||
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
|
||||
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],
|
||||
|
|
|
@ -109,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
|
|||
return parent::testIterateThroughAString($input, $exp);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideStrings
|
||||
* @coversNothing
|
||||
*/
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
|
||||
}
|
||||
|
||||
public function provideStrings() {
|
||||
$a_bytes = [];
|
||||
$a_codes = [];
|
||||
|
|
|
@ -281,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
|
|||
}
|
||||
|
||||
public function testIterateThroughAString(string $input, array $exp) {
|
||||
$this->iterateThroughAString($input, $exp, false);
|
||||
}
|
||||
|
||||
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
|
||||
$exp = $relaxedExp ?? $strictExp;
|
||||
$this->iterateThroughAString($input, $exp, true);
|
||||
}
|
||||
|
||||
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
|
||||
$class = $this->testedClass;
|
||||
$input = $this->prepString($input);
|
||||
$s = new $class($input);
|
||||
$out = [];
|
||||
$s = new $class($input, false, $allowSurrogates);
|
||||
$a = 0;
|
||||
$this->assertTrue(true); // prevent risky test of empty string
|
||||
foreach ($s->codes() as $index => $p) {
|
||||
|
|
Loading…
Reference in a new issue