/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint <0||$codePoint> 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint <0||$codePoint> 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
* @param bool $fatal If true, throw enceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/
public function __construct(string $string, bool $fatal = false);
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);
/** Returns the current byte position of the decoder */
public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
* @return int|false
*/
public function nextCode();
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
*/
public function seek(int $distance): int;
@ -58,10 +58,16 @@ interface Encoding {
*/
public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The number of characters to retrieve
*/
public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer */
/** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The number of code points to retrieve
*/
public function peekCode(int $num = 1): array;
/** Calculates the length of the string in bytes */
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint <0||$codePoint> 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
*/
public function __construct(string $string, bool $fatal = false) {
public $posErr = 0;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
abstract class SingleByteEncoding implements StatelessEncoding {
use GenericEncoding;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*/
public function nextChar(): string {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|bool
*/
public function nextCode() {
// get the byte at the current position
$b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string {
if ($codePoint <0||$codePoint> 0x10FFFF) {
throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*/
public function seek(int $distance): int {
if ($distance > 0) {
while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding {
}
}
/** Returns the current byte position of the decoder */
public function posByte(): int {
return $this->posChar;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*/
public function lenChar(): int {
return $this->lenByte;
}
/** Returns whether the character pointer is at the end of the string */
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
return $point;
}
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
*/
public static function encode(int $codePoint, bool $fatal = true): string {
// this function implements https://encoding.spec.whatwg.org/#utf-8-encoder