Correct deficiencies in UTF-8 handling
The function now operates as defined by the WHATWG Encoding Standard. The practical implications of this are that:
- More invalid sequences are correctly identified as invalid
- Overlong encodings are normalized
- ord() and chr() functions have been added as a consequence of this work
This commit is contained in:
parent
7d13a6c3b7
commit
30162e8525
1 changed files with 155 additions and 50 deletions
205
lib/UTF8.php
205
lib/UTF8.php
|
@ -30,42 +30,22 @@ abstract class UTF8 {
|
|||
return $b;
|
||||
} else {
|
||||
$errMode = $errMode ?? self::$errMode;
|
||||
// otherwise determine the byte-length of the UTF-8 character
|
||||
$l = self::l($b);
|
||||
if (!$l && $errMode==self::M_SKIP) {
|
||||
// if the byte is invalid and we're supposed to skip, advance the position and start over
|
||||
$pos++;
|
||||
goto start;
|
||||
} elseif (!$l && $errMode == self::M_REPLACE) {
|
||||
// otherwise determine the numeric code point of the character, as well as the position of the next character
|
||||
$p = self::ord($string, $pos, $next, self::M_REPLACE);
|
||||
if (is_int($p)) {
|
||||
// if the character is valid, return its serialization
|
||||
// we do a round trip (bytes > code point > bytes) to normalize overlong sequences
|
||||
return self::chr($p);
|
||||
} elseif ($errMode==self::M_REPLACE) {
|
||||
// if the byte is invalid and we're supposed to replace, return a replacement character
|
||||
$next = $pos + 1;
|
||||
return self::$replacementChar;
|
||||
} elseif (!$l) {
|
||||
} elseif ($errMode==self::M_SKIP) {
|
||||
// if the character is invalid and we're supposed to skip invalid characters, advance the position and start over
|
||||
$pos = $next;
|
||||
goto start;
|
||||
} else {
|
||||
// if the byte is invalid and we're supposed to halt, halt
|
||||
throw new \Exception;
|
||||
} else {
|
||||
// otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered
|
||||
$buffer = $b;
|
||||
do {
|
||||
$b = (++$pos < strlen($string)) ? $string[$pos] : "";
|
||||
if ($b >= "\x80" && $b <= "\xBF") {
|
||||
// if the byte is valid, add it to the buffer
|
||||
$buffer .= $b;
|
||||
} elseif ($errMode==self::M_SKIP) {
|
||||
// if the byte is invalid and we're supposed to skip, start over from the current position
|
||||
goto start;
|
||||
} elseif ($errMode==self::M_REPLACE) {
|
||||
// if the byte is invalid and we're supposed to replace, return a replacement character
|
||||
$next = $pos;
|
||||
return self::$replacementChar;
|
||||
} else {
|
||||
// if the byte is invalid and we're supposed to halt, halt
|
||||
throw new \Exception;
|
||||
}
|
||||
} while (strlen($buffer) < $l);
|
||||
// return the filled buffer and the position of the next byte
|
||||
$next = $pos + 1;
|
||||
return $buffer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -121,19 +101,29 @@ abstract class UTF8 {
|
|||
} while (
|
||||
$b >= "\x80" && $b <= "\xBF" && // continuation bytes
|
||||
($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
|
||||
$pos // stop once the start of the string has been reached
|
||||
$pos > 0 // stop once the start of the string has been reached
|
||||
);
|
||||
// get the expected length of the character starting at the last examined byte
|
||||
$l = self::l($b);
|
||||
if ($l==$t) {
|
||||
// if the expected length matches the number of examined bytes, the character is valid
|
||||
// attempt to extract a code point at the current position
|
||||
$p = self::ord($string, $pos, $n, self::M_REPLACE);
|
||||
// if the position of the character after the one we just consumed is earlier than our start position,
|
||||
// then there was at least one invalid sequence between the consumed character and the start position
|
||||
if ($n < $s) {
|
||||
if ($errMode==self::M_SKIP) {
|
||||
// if we're supposed to skip invalid sequences, there is no need to do anything
|
||||
} elseif ($errMode==self::M_REPLACE) {
|
||||
// if we're supposed to replace invalid sequences, return the starting offset: it is itself a character
|
||||
return $s;
|
||||
} else {
|
||||
// otherwise if the character is invalid and we're expected to halt, halt
|
||||
throw new \Exception;
|
||||
}
|
||||
}
|
||||
// if the consumed character is valid, return the current position
|
||||
if (is_int($p)) {
|
||||
return $pos;
|
||||
} elseif ($errMode==self::M_SKIP) {
|
||||
// if we're expected to ignore invalid sequences:
|
||||
if ($l && $t > $l) {
|
||||
// if the last examined byte is the start of a sequence and we have more than the right amount of continuation characters, return the current position
|
||||
return $pos;
|
||||
} elseif (!$pos) {
|
||||
// if we're supposed to skip invalid sequences:
|
||||
if ($pos < 1) {
|
||||
// if we're already at the start of the string, give up
|
||||
return $pos;
|
||||
} else {
|
||||
|
@ -142,13 +132,10 @@ abstract class UTF8 {
|
|||
goto start;
|
||||
}
|
||||
} elseif ($errMode==self::M_REPLACE) {
|
||||
// if we're expected to treat invalid sequences as replacement characters, return
|
||||
// the offset of the most recently examined byte if it is the start of a multi-byte
|
||||
// sequence, or the starting offset otherwise: in the latter case the original byte
|
||||
// is itself a replacement character position
|
||||
return ($l > 1) ? $pos: $s;
|
||||
// if we're supposed to replace invalid sequences, return the current offset: we've synchronized
|
||||
return $pos;
|
||||
} else {
|
||||
// if the character is invalid and we're expected to halt, halt
|
||||
// otherwise if the character is invalid and we're expected to halt, halt
|
||||
throw new \Exception;
|
||||
}
|
||||
}
|
||||
|
@ -184,17 +171,135 @@ abstract class UTF8 {
|
|||
}
|
||||
}
|
||||
|
||||
/** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos
 *
 * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111
 *
 * Upon error the result depends on the error mode ($errMode if supplied, otherwise the class default):
 * - M_SKIP: the invalid sequence is passed over and decoding is retried at the following position
 * - M_REPLACE: false is returned
 * - any other mode: an \Exception is thrown
 *
 * If $string is the empty string or $pos is at or beyond the end of the string, null is returned
 *
 * $next is a variable in which to store the next byte offset at which a character starts. When a
 * multi-byte sequence is cut short by a byte which is not a valid continuation byte, that byte is
 * NOT consumed: $next points at it so that it can be decoded as the start of the next character
 */
public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) {
    // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
    // though it differs from a slavish implementation because it operates on only a single
    // character rather than a whole stream
    $mode = $errMode ?? self::$errMode;
    $eof = strlen($string);
    start:
    $point = null;
    $seen = 0;
    $needed = 0;
    $invalid = false;
    $next = $pos + 1;
    // allowed range of the next continuation byte; narrowed after certain lead bytes
    // to reject overlong encodings, surrogates, and code points beyond U+10FFFF
    $lower = "\x80";
    $upper = "\xBF";
    while ($pos < $eof && !$invalid && !($needed && $seen >= $needed)) {
        $b = $string[$pos++];
        $seen++;
        if (!$needed) {
            // first byte: determine the sequence length and extract the payload bits
            $needed = self::l($b);
            switch ($needed) {
                case 1:
                    $point = ord($b);
                    break;
                case 2:
                    $point = ord($b) & 0x1F;
                    break;
                case 3:
                    if ($b=="\xE0") {
                        $lower = "\xA0";
                    } elseif ($b=="\xED") {
                        $upper = "\x9F";
                    }
                    $point = ord($b) & 0xF;
                    break;
                case 4:
                    if ($b=="\xF0") {
                        $lower = "\x90";
                    } elseif ($b=="\xF4") {
                        $upper = "\x8F";
                    }
                    $point = ord($b) & 0x7;
                    break;
                default:
                    // the byte cannot start a character; it is consumed as an invalid sequence of its own
                    $invalid = true;
                    break;
            }
        } elseif ($b < $lower || $b > $upper) {
            // not a valid continuation byte: per the WHATWG decoder the byte must be put back
            // so that it is examined again as the potential start of the next character
            $pos--;
            $invalid = true;
        } else {
            // valid continuation byte: restore the default range and append six payload bits
            $lower = "\x80";
            $upper = "\xBF";
            $point = ($point << 6) | (ord($b) & 0x3F);
        }
        $next = $pos;
    }
    // a sequence is also invalid if the string ended before all expected bytes were seen
    if ($invalid || $seen < $needed) {
        switch ($mode) {
            case self::M_SKIP:
                goto start;
            case self::M_REPLACE:
                return false;
            default:
                throw new \Exception;
        }
    }
    return $point;
}
|
||||
|
||||
/** Serializes the code point $codePoint as a UTF-8 byte string
 *
 * Code points below 0 or above 1114111 (0x10FFFF) yield an empty string
 */
public static function chr(int $codePoint): string {
    // modelled on https://encoding.spec.whatwg.org/#utf-8-encoder
    if ($codePoint < 0 || $codePoint > 0x10FFFF) {
        return "";
    }
    if ($codePoint < 0x80) {
        // ASCII is emitted as a single byte
        return chr($codePoint);
    }
    // pick the number of continuation bytes and the marker bits of the lead byte
    if ($codePoint < 0x800) {
        $trailing = 1;
        $marker = 0xC0;
    } elseif ($codePoint < 0x10000) {
        $trailing = 2;
        $marker = 0xE0;
    } else {
        $trailing = 3;
        $marker = 0xF0;
    }
    // lead byte: marker plus the highest-order payload bits
    $out = chr($marker + ($codePoint >> (6 * $trailing)));
    // continuation bytes: six payload bits each, most significant group first
    for ($shift = 6 * ($trailing - 1); $shift >= 0; $shift -= 6) {
        $out .= chr(0x80 | (($codePoint >> $shift) & 0x3F));
    }
    return $out;
}
|
||||
|
||||
/**
|
||||
* Returns the expected byte length of a UTF-8 character starting with byte $b
|
||||
*
|
||||
* If the byte is not the start of a UTF-8 sequence, 0 is returned
|
||||
*/
|
||||
protected static function l(string $b): int {
|
||||
if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character
|
||||
if ($b >= "\xC2" && $b <= "\xDF") { // two-byte character
|
||||
return 2;
|
||||
} elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character
|
||||
return 3;
|
||||
} elseif ($b >= "\xF0" && $b <= "\xF7") { // four-byte character
|
||||
} elseif ($b >= "\xF0" && $b <= "\xF4") { // four-byte character
|
||||
return 4;
|
||||
} elseif ($b < "\x80") { // ASCII byte: one-byte character
|
||||
return 1;
|
||||
|
|
Reference in a new issue