Browse Source

Optionally allow surrogates

Also removed unnecessary docblocks
span 0.6.0
J. King 4 years ago
parent
commit
200a310f72
  1. 7
      CHANGELOG
  2. 15
      lib/Encoding/Big5.php
  3. 16
      lib/Encoding/EUCKR.php
  4. 22
      lib/Encoding/Encoding.php
  5. 19
      lib/Encoding/GBCommon.php
  6. 39
      lib/Encoding/GenericEncoding.php
  7. 28
      lib/Encoding/SingleByteEncoding.php
  8. 19
      lib/Encoding/UTF16.php
  9. 16
      lib/Encoding/UTF8.php
  10. 8
      tests/cases/Encoding/TestBig5.php
  11. 8
      tests/cases/Encoding/TestEUCKR.php
  12. 8
      tests/cases/Encoding/TestGB18030.php
  13. 9
      tests/cases/Encoding/TestSingleByte.php
  14. 8
      tests/cases/Encoding/TestUTF16BE.php
  15. 17
      tests/cases/Encoding/TestUTF16LE.php
  16. 15
      tests/cases/Encoding/TestUTF8.php
  17. 8
      tests/cases/Encoding/TestXUserDefined.php
  18. 12
      tests/lib/DecoderTest.php

7
CHANGELOG

@ -1,3 +1,10 @@
Version 0.6.0 (2019-12-18)
==========================
New features:
- Added $allowSurrogates parameter to Encoding constructor
- Added posErr public instance property to Encoding
Version 0.5.0 (2019-12-13) Version 0.5.0 (2019-12-13)
========================== ==========================

15
lib/Encoding/Big5.php

@ -24,12 +24,6 @@ class Big5 implements StatelessEncoding {
protected $bufferedCode = 0; protected $bufferedCode = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$this->posChar++; $this->posChar++;
if ($this->bufferedCode > 0) { if ($this->bufferedCode > 0) {
@ -70,8 +64,10 @@ class Big5 implements StatelessEncoding {
return $code; return $code;
} else { } else {
if ($b < 0x80) { if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
} }
} }
@ -85,16 +81,11 @@ class Big5 implements StatelessEncoding {
} else { } else {
// dirty EOF // dirty EOF
$this->dirtyEOF = 1; $this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

16
lib/Encoding/EUCKR.php

@ -27,12 +27,6 @@ class EUCKR implements StatelessEncoding {
protected $dirtyEOF = 0; protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$this->posChar++; $this->posChar++;
$lead = 0x00; $lead = 0x00;
@ -42,6 +36,7 @@ class EUCKR implements StatelessEncoding {
if ($b < 0x80) { if ($b < 0x80) {
return $b; return $b;
} elseif ($b == 0x80 || $b == 0xFF) { } elseif ($b == 0x80 || $b == 0xFF) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
} else { } else {
$lead = $b; $lead = $b;
@ -57,8 +52,10 @@ class EUCKR implements StatelessEncoding {
return $code; return $code;
} else { } else {
if ($b < 0x80) { if ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]); return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]); return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
} }
} }
@ -72,16 +69,11 @@ class EUCKR implements StatelessEncoding {
} else { } else {
// dirty EOF // dirty EOF
$this->dirtyEOF = 1; $this->dirtyEOF = 1;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

22
lib/Encoding/Encoding.php

@ -19,10 +19,10 @@ interface Encoding {
const E_UNAVAILABLE_CODE_POINT = 4; const E_UNAVAILABLE_CODE_POINT = 4;
/** Constructs a new decoder /** Constructs a new decoder
* * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted * @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/ */
public function __construct(string $string, bool $fatal = false); public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
/** Returns the current byte position of the decoder */ /** Returns the current byte position of the decoder */
public function posByte(): int; public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
* *
* If the end of the string has been reached, false is returned * If the end of the string has been reached, false is returned
* *
* @return int|bool * @return int|false
*/ */
public function nextCode(); public function nextCode();
/** Advance $distance characters through the string /** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
* *
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
*/ */
public function seek(int $distance): int; public function seek(int $distance): int;
@ -58,10 +58,16 @@ interface Encoding {
*/ */
public function rewind(); public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */ /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The number of characters to retrieve
*/
public function peekChar(int $num = 1): string; public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer */ /** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The number of code points to retrieve
*/
public function peekCode(int $num = 1): array; public function peekCode(int $num = 1): array;
/** Calculates the length of the string in bytes */ /** Calculates the length of the string in bytes */

19
lib/Encoding/GBCommon.php

@ -15,12 +15,6 @@ abstract class GBCommon implements StatelessEncoding {
protected $dirtyEOF = 0; protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$first = 0; $first = 0;
$second = 0; $second = 0;
@ -37,6 +31,7 @@ abstract class GBCommon implements StatelessEncoding {
$first = $b; $first = $b;
continue; continue;
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} elseif ($second === 0) { } elseif ($second === 0) {
@ -49,8 +44,10 @@ abstract class GBCommon implements StatelessEncoding {
$pointer = ($first - 0x81) * 190 + ($b - $offset); $pointer = ($first - 0x81) * 190 + ($b - $offset);
return self::TABLE_GBK[$pointer]; return self::TABLE_GBK[$pointer];
} elseif ($b < 0x80) { } elseif ($b < 0x80) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} }
@ -60,6 +57,7 @@ abstract class GBCommon implements StatelessEncoding {
continue; continue;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} else { } else {
@ -79,10 +77,12 @@ abstract class GBCommon implements StatelessEncoding {
if (isset($codePointOffset)) { if (isset($codePointOffset)) {
return $codePointOffset + $pointer - $offset; return $codePointOffset + $pointer - $offset;
} else { } else {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} else { } else {
$this->posByte -= 3; $this->posByte -= 3;
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
} }
} }
@ -95,16 +95,11 @@ abstract class GBCommon implements StatelessEncoding {
} else { } else {
// dirty EOF; note how many bytes the last character had // dirty EOF; note how many bytes the last character had
$this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1)); $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);

39
lib/Encoding/GenericEncoding.php

@ -13,40 +13,30 @@ trait GenericEncoding {
protected $lenByte = null; protected $lenByte = null;
protected $lenChar = null; protected $lenChar = null;
protected $errMode = self::MODE_REPLACE; protected $errMode = self::MODE_REPLACE;
protected $allowSurrogates = false;
/** Constructs a new decoder public $posErr = 0;
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
*/
public function __construct(string $string, bool $fatal = false) {
$this->string = $string; $this->string = $string;
$this->lenByte = strlen($string); $this->lenByte = strlen($string);
$this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE; $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
$this->allowSurrogates = $allowSurrogates;
} }
/** Returns the current byte position of the decoder */
public function posByte(): int { public function posByte(): int {
return $this->posByte; return $this->posByte;
} }
/** Returns the current character position of the decoder */
public function posChar(): int { public function posChar(): int {
return $this->posChar; return $this->posChar;
} }
/** Seeks to the start of the string
*
* This is usually faster than using the seek method for the same purpose
*/
public function rewind() { public function rewind() {
$this->posByte = 0; $this->posByte = 0;
$this->posChar = 0; $this->posChar = 0;
} }
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posByte]; $b = @$this->string[$this->posByte];
@ -64,12 +54,6 @@ trait GenericEncoding {
} }
} }
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int { public function seek(int $distance): int {
if ($distance > 0) { if ($distance > 0) {
if ($this->posByte == strlen($this->string)) { if ($this->posByte == strlen($this->string)) {
@ -94,7 +78,6 @@ trait GenericEncoding {
} }
} }
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
public function peekChar(int $num = 1): string { public function peekChar(int $num = 1): string {
$out = ""; $out = "";
$state = $this->stateSave(); $state = $this->stateSave();
@ -108,7 +91,6 @@ trait GenericEncoding {
return $out; return $out;
} }
/** Retrieves the next $num code points from the string, without advancing the character pointer */
public function peekCode(int $num = 1): array { public function peekCode(int $num = 1): array {
$out = []; $out = [];
$state = $this->stateSave(); $state = $this->stateSave();
@ -122,15 +104,10 @@ trait GenericEncoding {
return $out; return $out;
} }
/** Calculates the length of the string in bytes */
public function lenByte(): int { public function lenByte(): int {
return $this->lenByte; return $this->lenByte;
} }
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function lenChar(): int { public function lenChar(): int {
return $this->lenChar ?? (function() { return $this->lenChar ?? (function() {
$state = $this->stateSave(); $state = $this->stateSave();
@ -141,19 +118,16 @@ trait GenericEncoding {
})(); })();
} }
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool { public function eof(): bool {
return $this->posByte >= $this->lenByte; return $this->posByte >= $this->lenByte;
} }
/** Generates an iterator which steps through each character in the string */
public function chars(): \Generator { public function chars(): \Generator {
while (($c = $this->nextChar()) !== "") { while (($c = $this->nextChar()) !== "") {
yield ($this->posChar - 1) => $c; yield ($this->posChar - 1) => $c;
} }
} }
/** Generates an iterator which steps through each code point in the string */
public function codes(): \Generator { public function codes(): \Generator {
while (($c = $this->nextCode()) !== false) { while (($c = $this->nextCode()) !== false) {
yield ($this->posChar - 1) => $c; yield ($this->posChar - 1) => $c;
@ -165,6 +139,7 @@ trait GenericEncoding {
return [ return [
'posChar' => $this->posChar, 'posChar' => $this->posChar,
'posByte' => $this->posByte, 'posByte' => $this->posByte,
'posErr' => $this->posErr,
]; ];
} }
@ -191,7 +166,7 @@ trait GenericEncoding {
// fatal replacement mode for decoders // fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE); throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC: case self::MODE_FATAL_ENC:
// fatal replacement mode for decoders; not applicable to Unicode transformation formats // fatal replacement mode for encoders; not applicable to Unicode transformation formats
throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT); throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
default: default:
// indicative of internal bug; should never be triggered // indicative of internal bug; should never be triggered

28
lib/Encoding/SingleByteEncoding.php

@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
abstract class SingleByteEncoding implements StatelessEncoding { abstract class SingleByteEncoding implements StatelessEncoding {
use GenericEncoding; use GenericEncoding;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posChar]; $b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posChar]; $b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT); throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int { public function seek(int $distance): int {
if ($distance > 0) { if ($distance > 0) {
while ($this->posChar < $this->lenByte && $distance > 0) { while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding {
} }
} }
/** Returns the current byte position of the decoder */
public function posByte(): int { public function posByte(): int {
return $this->posChar; return $this->posChar;
} }
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function lenChar(): int { public function lenChar(): int {
return $this->lenByte; return $this->lenByte;
} }
/** Returns whether the character pointer is at the end of the string */
public function eof(): bool { public function eof(): bool {
return $this->posChar >= $this->lenByte; return $this->posChar >= $this->lenByte;
} }

19
lib/Encoding/UTF16.php

@ -11,12 +11,6 @@ abstract class UTF16 implements Encoding {
protected $dirtyEOF = 0; protected $dirtyEOF = 0;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
$lead_b = null; $lead_b = null;
$lead_s = null; $lead_s = null;
@ -36,6 +30,9 @@ abstract class UTF16 implements Encoding {
if (!is_null($lead_s)) { if (!is_null($lead_s)) {
if ($code >= 0xDC00 && $code <= 0xDFFF) { if ($code >= 0xDC00 && $code <= 0xDFFF) {
return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00); return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
} elseif ($this->allowSurrogates) {
$this->posByte -= 2;
return $lead_s;
} else { } else {
$this->posByte -= 2; $this->posByte -= 2;
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
@ -45,7 +42,11 @@ abstract class UTF16 implements Encoding {
$lead_s = $code; $lead_s = $code;
continue; continue;
} elseif ($code >= 0xDC00 && $code <= 0xDFFF) { } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]); if ($this->allowSurrogates) {
return $code;
} else {
return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
}
} else { } else {
return $code; return $code;
} }
@ -65,10 +66,6 @@ abstract class UTF16 implements Encoding {
} }
} }
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string { public function nextChar(): string {
// get the byte at the current position // get the byte at the current position
$b = @$this->string[$this->posByte]; $b = @$this->string[$this->posByte];

16
lib/Encoding/UTF8.php

@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
const NAME = "UTF-8"; const NAME = "UTF-8";
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"]; const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() { public function nextCode() {
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
// optimization for ASCII characters // optimization for ASCII characters
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
if ($b==0xE0) { if ($b==0xE0) {
$lower = 0xA0; $lower = 0xA0;
} elseif ($b==0xED) { } elseif ($b==0xED) {
$upper = 0x9F; $upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
} }
$point = $b & 0xF; $point = $b & 0xF;
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
} }
$point = $b & 0x7; $point = $b & 0x7;
} else { // invalid byte } else { // invalid byte
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte]); return self::err($this->errMode, [$this->posChar, $this->posByte]);
} }
} elseif ($b < $lower || $b > $upper) { } elseif ($b < $lower || $b > $upper) {
$this->posErr = $this->posChar;
return self::err($this->errMode, [$this->posChar, $this->posByte--]); return self::err($this->errMode, [$this->posChar, $this->posByte--]);
} else { } else {
$lower = 0x80; $lower = 0x80;
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
return $point; return $point;
} }
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
*/
public static function encode(int $codePoint, bool $fatal = true): string { public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
if ($codePoint < 0 || $codePoint > 0x10FFFF) { if ($codePoint < 0 || $codePoint > 0x10FFFF) {

8
tests/cases/Encoding/TestBig5.php

@ -128,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+0064 (HTML)' => [false, 0x64, "64"], 'U+0064 (HTML)' => [false, 0x64, "64"],

8
tests/cases/Encoding/TestEUCKR.php

@ -128,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+0064 (HTML)' => [false, 0x64, "64"], 'U+0064 (HTML)' => [false, 0x64, "64"],

8
tests/cases/Encoding/TestGB18030.php

@ -136,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
// bytes confirmed using Firefox // bytes confirmed using Firefox
$series_gb18030 = [ $series_gb18030 = [

9
tests/cases/Encoding/TestSingleByte.php

@ -205,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
$this->testedClass = $class;
return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
}
public function provideClasses() { public function provideClasses() {
foreach (self::$classes as $name => $class) { foreach (self::$classes as $name => $class) {
yield $name => [$class]; yield $name => [$class];

8
tests/cases/Encoding/TestUTF16BE.php

@ -6,7 +6,6 @@
declare(strict_types=1); declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding; namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE; use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16BE extends TestUTF16LE { class TestUTF16BE extends TestUTF16LE {
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {
public function provideStrings() { public function provideStrings() {
foreach (parent::provideStrings() as $name => $test) { foreach (parent::provideStrings() as $name => $test) {
list($string, $codes) = $test; if (sizeof($test) == 2) {
$test[] = null;
}
list($string, $codes, $altCodes) = $test;
$words = explode(" ", $string); $words = explode(" ", $string);
foreach ($words as $a => $word) { foreach ($words as $a => $word) {
if (strlen($word) == 4) { if (strlen($word) == 4) {
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
} }
} }
$string = implode(" ", $words); $string = implode(" ", $words);
yield $name => [$string, $codes]; yield $name => [$string, $codes, $altCodes];
} }
} }
} }

17
tests/cases/Encoding/TestUTF16LE.php

@ -7,7 +7,6 @@ declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding; namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE; use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
protected $testedClass = UTF16LE::class; protected $testedClass = UTF16LE::class;
@ -119,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() { public function provideStrings() {
return [ return [
// control samples // control samples
@ -130,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
'EOF after lead surrogate' => ["0000 34D8", [0, 65533]], 'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]], 'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
// invalid UTF-16 surrogates // invalid UTF-16 surrogates
'lead surrogate without trail' => ["34D8 0000", [65533, 0]], 'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]], 'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]], 'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]], 'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
]; ];
} }
} }

15
tests/cases/Encoding/TestUTF8.php

@ -128,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* Runs the inherited surrogate-allowing iteration test against the samples from provideStrings().
*
* The body only forwards its arguments to the parent implementation; this
* override exists to carry the PHPUnit annotations below.
* NOTE(review): the parent here is CoderDecoderTest, whose implementation is
* not visible in this view; presumably it inherits the test from DecoderTest. Confirm.
*
* @param string $input The sample string, as supplied by provideStrings()
* @param array $strictExp The code points expected when surrogates are not allowed
* @param array|null $relaxedExp The alternate code points expected when surrogates are allowed, if the sample defines any
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::nextCode
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideCodePoints() { public function provideCodePoints() {
return [ return [
'U+007A (HTML)' => [false, 0x7A, "7A"], 'U+007A (HTML)' => [false, 0x7A, "7A"],
@ -190,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]], 'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]], 'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates // UTF-16 surrogates
'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]], // surrogates have alternate outputs for when surrogates are being allowed
'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]], 'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]], 'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
// self-sync edge cases // self-sync edge cases
'trailing continuation' => ["0A 80 80", [10, 65533, 65533]], 'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]], 'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],

8
tests/cases/Encoding/TestXUserDefined.php

@ -109,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
return parent::testIterateThroughAString($input, $exp); return parent::testIterateThroughAString($input, $exp);
} }
/**
* Runs the inherited surrogate-allowing iteration test against the samples from provideStrings().
*
* The body only forwards its arguments to the parent implementation; this
* override exists to carry the PHPUnit annotations below. No coverage is
* attributed to this test (see the coversNothing annotation).
*
* @param string $input The sample string, as supplied by provideStrings()
* @param array $strictExp The code points expected when surrogates are not allowed
* @param array|null $relaxedExp The alternate code points expected when surrogates are allowed, if the sample defines any
*
* @dataProvider provideStrings
* @coversNothing
*/
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
}
public function provideStrings() { public function provideStrings() {
$a_bytes = []; $a_bytes = [];
$a_codes = []; $a_codes = [];

12
tests/lib/DecoderTest.php

@ -281,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
} }
/**
* Strict-mode variant of the iteration test: delegates to the shared
* iterateThroughAString() helper, passing false for its $allowSurrogates argument.
*
* @param string $input The sample string, as supplied by the data provider
* @param array $exp The expected output for the sample
*/
public function testIterateThroughAString(string $input, array $exp) { public function testIterateThroughAString(string $input, array $exp) {
$this->iterateThroughAString($input, $exp, false);
}
/**
 * Relaxed-mode variant of the iteration test: delegates to the shared
 * iterateThroughAString() helper, passing true for its $allowSurrogates argument.
 *
 * NOTE(review): "array $relaxedExp = null" is an implicitly nullable parameter,
 * which PHP 8.4 deprecates; "?array" is the explicit spelling if the project's
 * minimum PHP version permits nullable types. Confirm before changing, and
 * keep subclass overrides in step.
 *
 * @param string $input The sample string, as supplied by the data provider
 * @param array $strictExp The expected output when surrogates are not allowed; used here only as a fallback
 * @param array|null $relaxedExp The expected output when surrogates are allowed, or null to reuse $strictExp
 */
public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
    // Samples that supply no alternate expectation are checked against the strict one
    $this->iterateThroughAString($input, $relaxedExp ?? $strictExp, true);
}
protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
$class = $this->testedClass; $class = $this->testedClass;
$input = $this->prepString($input); $input = $this->prepString($input);
$s = new $class($input); $s = new $class($input, false, $allowSurrogates);
$out = [];
$a = 0; $a = 0;
$this->assertTrue(true); // prevent risky test of empty string $this->assertTrue(true); // prevent risky test of empty string
foreach ($s->codes() as $index => $p) { foreach ($s->codes() as $index => $p) {

Loading…
Cancel
Save