From fd8c333a68819590102a66ef3ab2209e4916b11e Mon Sep 17 00:00:00 2001 From: "J. King" Date: Tue, 10 Apr 2018 17:58:09 -0400 Subject: [PATCH] Split off UTF-8 processing into its own class, greately expanded Also simplified some parts of the algorithm implementation Part of this simplification involves the use of goto statements --- lib/URI.php | 100 ++++--------------------- lib/UTF8.php | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 87 deletions(-) create mode 100644 lib/UTF8.php diff --git a/lib/URI.php b/lib/URI.php index ea44399..4b7afbd 100644 --- a/lib/URI.php +++ b/lib/URI.php @@ -426,10 +426,7 @@ class URI { # Let the @ flag, [] flag, and passwordTokenSeenFlag be unset. $flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false; # Let pointer be a pointer to first code point in input. - // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; - // $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1) - $pointer = 0; - $posPrev = $pos; + // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer $pos = 0; # Keep running the following state machine by switching on state. # If after a run pointer points to the EOF code point, go to the next step. @@ -437,10 +434,12 @@ class URI { // Note: the state machine is designed to run once even with an empty string do { # Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to. - // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; + // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character - // $posPrev is the byte offset of the start of the previous character i.e. 
($pointer - 1) - list($c, $posNext) = $this->getChar($input, $pos); + $c = UTF8::get($input, $pos, $posNext); + // when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we + // accomplish this by going back to this label, which skips the increment at the end of each iteration + processChar: // switch on state switch ($state) { # scheme start state @@ -452,8 +451,7 @@ class URI { } elseif (!$stateOverride) { # Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one. $state = self::ST_NO_SCHEME; - $pos = $posPrev; - $pointer--; + goto processChar; } else { # Otherwise, validation error, return failure. # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. @@ -527,7 +525,7 @@ class URI { $state = self::ST_NO_SCHEME; $pos = 0; $pointer = 0; - continue 2; + goto processChar; } else { # Otherwise, validation error, return failure. # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute. @@ -560,13 +558,11 @@ class URI { } elseif ($base->scheme != "file") { # Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by one. $state = self::ST_RELATIVE; - $pos = $posPrev; - $pointer--; + goto processChar; } else { # Otherwise, set state to file state and decrease pointer by one. $state = self::ST_FILE; - $pos = $posPrev; - $pointer--; + goto processChar; } break; # special relative or authority state @@ -577,8 +573,7 @@ class URI { } else { # Otherwise, validation error, set state to relative state and decrease pointer by one. 
$state = self::ST_RELATIVE; - $pos = $posPrev; - $pointer--; + goto processChar; } break; # path or authority state @@ -589,8 +584,7 @@ class URI { } else { # Otherwise, set state to path state, and decrease pointer by one. $state = self::ST_PATH; - $pos = $posPrev; - $pointer--; + goto processChar; } break; // invalid or unimplemented state @@ -600,10 +594,8 @@ class URI { } # If after a run pointer points to the EOF code point, go to the next step. # Otherwise, increase pointer by one and continue with the state machine. - // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; + // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character - // $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1) - $posPrev = $pos; $pos = $posNext; $pointer++; } while ($pos <= $eof); @@ -632,70 +624,4 @@ class URI { throw new \Exception; } } - - /** Returns the UTF-8 character at byte offset $pos (which could possibly be a replacement charcter) along with the byte offset of the next character */ - protected function getChar(string $input, int $pos, bool $throwOnError = false, $replacementChar = "\u{FFFD}"): array { - // get the byte at the specified position - $b = ($pos < strlen($input)) ? 
$input[$pos] : ""; - if ($b < "\x80" || $b=="") { - // if the byte is an ASCII character or end of input, simply return it - return [$b, $pos + 1]; - } else { - // otherwise determine the byte-length of the UTF-8 character - $l = $this->getCharLength($b); - if (!$l && $throwOnError) { - // if the byte is invalid and we're supposed to halt, halt - throw new \Exception; - } elseif (!$l) { - // if the byte is invalid and we're supposed to continue, skip any further invalid bytes and return a replacement character instead - do { - $l = $this->getCharLength($input[++$pos]); - } while (!$l); - return [$replacementChar, $pos]; - } else { - // otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered - $buffer = $b; - do { - $b = (++$pos < strlen($input)) ? $input[$pos] : ""; - if ($b >= "\x80" && $b <= "\xBF") { - // if the byte is valid, add it to the buffer - $buffer .= $b; - } elseif ($throwOnError) { - // if the byte is invalid and we're supposed to halt, halt - throw new \Exception; - } else { - // if the byte is invalid and we're supposed to continue, go back one byte and skip any bytes which are not sequence-start bytes, then return a replacement character - $pos--; - do { - $l = $this->getCharLength($input[++$pos]); - } while (!$l); - return [$replacementChar, $pos]; - } - } while (strlen($buffer) < $l); - // return the filled buffer and the position of the next byte - return [$buffer, $pos + 1]; - } - } - } - - /** - * Returns the total expected length of the UTF-8 character starting with byte $b - * - * If the byte is not the start of a UTF-8 sequence, 0 is returned - */ - protected function getCharLength(string $b): int { - if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character - return 2; - } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character - return 3; - } elseif ($b >= "\xF0" && $b <= "\xF7") { // four-byte character - return 4; - } elseif ($b < "\x80") { // ASCII byte: 
one-byte character - return 1; - } elseif ($b == "") { // end of input: pretend it's a valid single-byte character - return 1; - } else { // invalid byte - return 0; - } - } } diff --git a/lib/UTF8.php b/lib/UTF8.php new file mode 100644 index 0000000..33bfc88 --- /dev/null +++ b/lib/UTF8.php @@ -0,0 +1,207 @@ += "\x80" && $b <= "\xBF") { + // if the byte is valid, add it to the buffer + $buffer .= $b; + } elseif ($errMode==self::M_SKIP) { + // if the byte is invalid and we're supposed to skip, start over from the current position + goto start; + } elseif ($errMode==self::M_REPLACE) { + // if the byte is invalid and we're supposed to replace, return a replacement character + $next = $pos; + return self::$replacementChar; + } else { + // if the byte is invalid and we're supposed to halt, halt + throw new \Exception; + } + } while (strlen($buffer) < $l); + // return the filled buffer and the position of the next byte + $next = $pos + 1; + return $buffer; + } + } + } + + /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character + * + * If $num is negative, the operation will be performed in reverse + * + * If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek + */ + public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int { + $errMode = $errMode ?? self::$errMode; + if ($num > 0) { + $pos = $pos ?? 0; + do { + $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference + } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF + return $pos; + } elseif ($num < 0) { + $pos = $pos ?? 
strlen($string);
+            if (!$pos) {
+                // if we're already at the start of the string, we can't go further back
+                return $pos;
+            }
+            $num = abs($num);
+            do {
+                // step back one byte, then snap to the start of the character at or before that byte
+                $pos = self::sync($string, $pos -1, $errMode);
+                $num--;
+            } while ($num && $pos);
+            return $pos;
+        } else {
+            // seeking zero characters is equivalent to a sync; $pos may have been omitted (null) and
+            // sync() requires an integer offset, so fall back to the start of the string in that case
+            return self::sync($string, $pos ?? 0, $errMode);
+        }
+    }
+
+    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
+    public static function sync(string $string, int $pos, int $errMode = null): int {
+        $errMode = $errMode ?? self::$errMode;
+        start:
+        if (!$pos || $pos >= strlen($string)) {
+            // if we're at the start of the string or past its end, then this is the character start
+            return $pos;
+        }
+        // save the start position for later, and increment before the coming decrement loop
+        $s = $pos++;
+        // examine the current byte and skip up to three continuation bytes, going backward and counting the number of examined bytes (between 1 and 4)
+        $t = 0;
+        do {
+            $pos--;
+            $t++;
+            $b = ($pos < strlen($string)) ? 
$string[$pos] : ""; + } while ( + $b >= "\x80" && $b <= "\xBF" && // continuation bytes + ($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences + $pos // stop once the start of the string has been reached + ); + // get the expected length of the character starting at the last examined byte + $l = self::l($b); + if ($l==$t) { + // if the expected length matches the number of examined bytes, the character is valid + return $pos; + } elseif ($errMode==self::M_SKIP) { + // if we're expected to ignore invalid sequences: + if ($l && $t > $l) { + // if the last examined byte is the start of a sequence and we have more than the right amount of continuation characters, return the current position + return $pos; + } elseif (!$pos) { + // if we're already at the start of the string, give up + return $pos; + } else { + // otherwise skip over the last examined byte and start over + $pos--; + goto start; + } + } elseif ($errMode==self::M_REPLACE) { + // if we're expected to treat invalid sequences as replacement characters, return + // the offset of the most recently examined byte if it is the start of a multi-byte + // sequence, or the starting offset otherwise: in the latter case the original byte + // is itself a replacement character position + return ($l > 1) ? $pos: $s; + } else { + // if the character is invalid and we're expected to halt, halt + throw new \Exception; + } + } + + public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int { + $errMode = $errMode ?? self::$errMode; + $end = $end ?? 
strlen($string);
+        if (substr($string, $start, ($end - $start)) =="") {
+            return 0;
+        }
+        $count = 0;
+        $pos = $start;
+        do {
+            $c = self::get($string, $pos, $pos, $errMode);
+        } while ($c != "" && ++$count && $pos < $end);
+        return $count;
+    }
+
+    /**
+     * Returns up to $length characters of $str, starting from byte offset $start
+     *
+     * Characters are read one at a time with get() until $length characters have been
+     * collected or get() returns an empty string (end of input).
+     *
+     * If $length is not greater than zero (including when it is omitted), an empty string
+     * is returned and $next is set to the result of sync() for $start.
+     * NOTE(review): confirm whether an omitted $length was meant to read to the end of the string.
+     *
+     * @param string $str The string to read from
+     * @param int $start The byte offset at which to start reading
+     * @param int|null $length The maximum number of characters to return
+     * @param mixed $next Set by reference to the byte offset following the last character read
+     * @param int|null $errMode The error mode handed to get() and sync(); defaults to self::$errMode
+     */
+    public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string {
+        $errMode = $errMode ?? self::$errMode;
+        if ($length > 0) {
+            $pos = $start;
+            $buffer = "";
+            do {
+                $c = self::get($str, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
+                $buffer .= $c;
+            } while (--$length && $c != ""); // stop after we have collected the desired number of characters, or reached EOF
+            $next = $pos;
+            return $buffer;
+        } else {
+            $next = self::sync($str, $start, $errMode);
+            return "";
+        }
+    }
+
+    /**
+     * Returns the expected byte length of a UTF-8 character starting with byte $b
+     *
+     * If the byte is not the start of a UTF-8 sequence, 0 is returned
+     */
+    protected static function l(string $b): int {
+        if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character
+            return 2;
+        } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character
+            return 3;
+        } elseif ($b >= "\xF0" && $b <= "\xF7") { // four-byte character
+            return 4;
+        } elseif ($b < "\x80") { // ASCII byte: one-byte character
+            return 1;
+        } elseif ($b == "") { // end of input: pretend it's a valid single-byte character
+            return 1;
+        } else { // invalid byte
+            return 0;
+        }
+    }
+}