Browse Source

Split off UTF-8 processing into its own class, greatly expanded

Also simplified some parts of the algorithm implementation

Part of this simplification involves the use of goto statements
master
J. King 6 years ago
parent
commit
fd8c333a68
  1. 100
      lib/URI.php
  2. 207
      lib/UTF8.php

100
lib/URI.php

@ -426,10 +426,7 @@ class URI {
# Let the @ flag, [] flag, and passwordTokenSeenFlag be unset.
$flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false;
# Let pointer be a pointer to first code point in input.
// DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
// $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
$pointer = 0;
$posPrev = $pos;
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer
$pos = 0;
# Keep running the following state machine by switching on state.
# If after a run pointer points to the EOF code point, go to the next step.
@ -437,10 +434,12 @@ class URI {
// Note: the state machine is designed to run once even with an empty string
do {
# Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to.
// DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
// $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
// $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
list($c, $posNext) = $this->getChar($input, $pos);
$c = UTF8::get($input, $pos, $posNext);
// when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we
// accomplish this by going back to this label, which skips the increment at the end of each iteration
processChar:
// switch on state
switch ($state) {
# scheme start state
@ -452,8 +451,7 @@ class URI {
} elseif (!$stateOverride) {
# Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one.
$state = self::ST_NO_SCHEME;
$pos = $posPrev;
$pointer--;
goto processChar;
} else {
# Otherwise, validation error, return failure.
# NOTE: This indication of failure is used exclusively by Location object’s protocol attribute.
@ -527,7 +525,7 @@ class URI {
$state = self::ST_NO_SCHEME;
$pos = 0;
$pointer = 0;
continue 2;
goto processChar;
} else {
# Otherwise, validation error, return failure.
# NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute.
@ -560,13 +558,11 @@ class URI {
} elseif ($base->scheme != "file") {
# Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by one.
$state = self::ST_RELATIVE;
$pos = $posPrev;
$pointer--;
goto processChar;
} else {
# Otherwise, set state to file state and decrease pointer by one.
$state = self::ST_FILE;
$pos = $posPrev;
$pointer--;
goto processChar;
}
break;
# special relative or authority state
@ -577,8 +573,7 @@ class URI {
} else {
# Otherwise, validation error, set state to relative state and decrease pointer by one.
$state = self::ST_RELATIVE;
$pos = $posPrev;
$pointer--;
goto processChar;
}
break;
# path or authority state
@ -589,8 +584,7 @@ class URI {
} else {
# Otherwise, set state to path state, and decrease pointer by one.
$state = self::ST_PATH;
$pos = $posPrev;
$pointer--;
goto processChar;
}
break;
// invalid or unimplemented state
@ -600,10 +594,8 @@ class URI {
}
# If after a run pointer points to the EOF code point, go to the next step.
# Otherwise, increase pointer by one and continue with the state machine.
// DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
// $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
// $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
$posPrev = $pos;
$pos = $posNext;
$pointer++;
} while ($pos <= $eof);
@ -632,70 +624,4 @@ class URI {
throw new \Exception;
}
}
/**
 * Returns the UTF-8 character at byte offset $pos along with the byte offset of the next character
 *
 * The result is a two-element array:
 *  - the character read: the empty string if $pos is at or past the end of $input, or
 *    $replacementChar if a malformed sequence was detected at $pos
 *  - the byte offset at which the next character starts
 *
 * If $throwOnError is true, an exception is thrown for a malformed sequence instead of
 * returning $replacementChar
 */
protected function getChar(string $input, int $pos, bool $throwOnError = false, $replacementChar = "\u{FFFD}"): array {
    // get the byte at the specified position
    $b = ($pos < strlen($input)) ? $input[$pos] : "";
    if ($b < "\x80" || $b=="") {
        // if the byte is an ASCII character or end of input, simply return it
        return [$b, $pos + 1];
    } else {
        // otherwise determine the byte-length of the UTF-8 character
        $l = $this->getCharLength($b);
        if (!$l && $throwOnError) {
            // if the byte is invalid and we're supposed to halt, halt
            throw new \Exception;
        } elseif (!$l) {
            // if the byte is invalid and we're supposed to continue, skip any further invalid bytes and return a replacement character instead
            do {
                // reading past the end of input yields the empty string (which counts as a
                // one-byte character and so ends the loop) without raising an
                // "uninitialized string offset" diagnostic
                $l = $this->getCharLength($input[++$pos] ?? "");
            } while (!$l);
            return [$replacementChar, $pos];
        } else {
            // otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered
            $buffer = $b;
            do {
                $b = (++$pos < strlen($input)) ? $input[$pos] : "";
                if ($b >= "\x80" && $b <= "\xBF") {
                    // if the byte is valid, add it to the buffer
                    $buffer .= $b;
                } elseif ($throwOnError) {
                    // if the byte is invalid and we're supposed to halt, halt
                    throw new \Exception;
                } else {
                    // if the byte is invalid and we're supposed to continue, go back one byte and skip any bytes which are not sequence-start bytes, then return a replacement character
                    $pos--;
                    do {
                        // the sequence may have been truncated by the end of input, so guard the read as above
                        $l = $this->getCharLength($input[++$pos] ?? "");
                    } while (!$l);
                    return [$replacementChar, $pos];
                }
            } while (strlen($buffer) < $l);
            // return the filled buffer and the position of the next byte
            return [$buffer, $pos + 1];
        }
    }
}
/**
 * Reports how many bytes the UTF-8 character introduced by byte $b is expected to occupy
 *
 * Returns 1 for an ASCII byte or for the empty string (end of input), 2 to 4 for the lead
 * byte of a multi-byte sequence, and 0 if $b cannot start a sequence
 */
protected function getCharLength(string $b): int {
    // ASCII bytes are one-byte characters; the empty string (end of input) also
    // compares below "\x80" and is likewise treated as a valid single-byte character
    if ($b < "\x80") {
        return 1;
    }
    // lead byte of a two-byte character
    if ($b >= "\xC0" && $b <= "\xDF") {
        return 2;
    }
    // lead byte of a three-byte character
    if ($b >= "\xE0" && $b <= "\xEF") {
        return 3;
    }
    // lead byte of a four-byte character
    if ($b >= "\xF0" && $b <= "\xF7") {
        return 4;
    }
    // anything else is not the start of a sequence
    return 0;
}
}

207
lib/UTF8.php

@ -0,0 +1,207 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\URI;
/**
 * Static helpers for walking UTF-8 byte strings one character at a time
 *
 * All positions are byte offsets into the string. Malformed input is handled according to an
 * error mode (one of the M_* constants), which each method accepts as an optional argument
 * and which defaults to self::$errMode
 */
abstract class UTF8 {
    /** The string returned in place of a malformed sequence when the error mode is M_REPLACE */
    public static $replacementChar = "\u{FFFD}";
    /** The error mode used when a method is called without an explicit $errMode */
    public static $errMode = self::M_REPLACE;

    /** Error mode: substitute self::$replacementChar for malformed sequences */
    const M_REPLACE = 0;
    /** Error mode: pass over malformed sequences as if they were not there */
    const M_SKIP = 1;
    /** Error mode: throw an exception upon encountering a malformed sequence */
    const M_HALT = 2;

    /** Retrieve a character from $string starting at byte offset $pos
     *
     * $next is a variable in which to store the next byte offset at which a character starts
     *
     * The returned character may be a replacement character, or the empty string if $pos is beyond the end of $string
     *
     * An exception is thrown if a malformed sequence is encountered and the error mode is M_HALT
     */
    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
        start:
        // get the byte at the specified position
        $b = ($pos < strlen($string)) ? $string[$pos] : "";
        if ($b < "\x80" || $b=="") {
            // if the byte is an ASCII character or end of input, simply return it
            $next = $pos + 1;
            return $b;
        } else {
            $errMode = $errMode ?? self::$errMode;
            // otherwise determine the byte-length of the UTF-8 character
            $l = self::l($b);
            if (!$l && $errMode==self::M_SKIP) {
                // if the byte is invalid and we're supposed to skip, advance the position and start over
                $pos++;
                goto start;
            } elseif (!$l && $errMode == self::M_REPLACE) {
                // if the byte is invalid and we're supposed to replace, return a replacement character
                $next = $pos + 1;
                return self::$replacementChar;
            } elseif (!$l) {
                // if the byte is invalid and we're supposed to halt, halt
                throw new \Exception;
            } else {
                // otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered
                $buffer = $b;
                do {
                    $b = (++$pos < strlen($string)) ? $string[$pos] : "";
                    if ($b >= "\x80" && $b <= "\xBF") {
                        // if the byte is valid, add it to the buffer
                        $buffer .= $b;
                    } elseif ($errMode==self::M_SKIP) {
                        // if the byte is invalid and we're supposed to skip, start over from the current position
                        goto start;
                    } elseif ($errMode==self::M_REPLACE) {
                        // if the byte is invalid and we're supposed to replace, return a replacement character;
                        // the offending byte is not consumed, so it will be examined again by the next read
                        $next = $pos;
                        return self::$replacementChar;
                    } else {
                        // if the byte is invalid and we're supposed to halt, halt
                        throw new \Exception;
                    }
                } while (strlen($buffer) < $l);
                // return the filled buffer and the position of the next byte
                $next = $pos + 1;
                return $buffer;
            }
        }
    }

    /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
     *
     * If $num is negative, the operation will be performed in reverse
     *
     * If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek;
     * a seek of zero characters with $pos omitted also uses the start of the string
     */
    public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int {
        $errMode = $errMode ?? self::$errMode;
        if ($num > 0) {
            $pos = $pos ?? 0;
            do {
                $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
            } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF
            return $pos;
        } elseif ($num < 0) {
            $pos = $pos ?? strlen($string);
            if (!$pos) {
                // if we're already at the start of the string, we can't go further back
                return $pos;
            }
            $num = abs($num);
            do {
                $pos = self::sync($string, $pos -1, $errMode);
                $num--;
            } while ($num && $pos);
            return $pos;
        } else {
            // seeking zero characters is equivalent to a sync; sync() requires an integer
            // offset, so an omitted position must be resolved to the start of the string first
            return self::sync($string, $pos ?? 0, $errMode);
        }
    }

    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos
     *
     * An exception is thrown if the bytes examined do not form a well-formed character and the error mode is M_HALT
     */
    public static function sync(string $string, int $pos, int $errMode = null): int {
        $errMode = $errMode ?? self::$errMode;
        start:
        if (!$pos || $pos >= strlen($string)) {
            // if we're at the start of the string or past its end, then this is the character start
            return $pos;
        }
        // save the start position for later, and increment before the coming decrement loop
        $s = $pos++;
        // examine the current byte and skip up to three continuation bytes, going backward and counting the number of examined bytes (between 1 and 4)
        $t = 0;
        do {
            $pos--;
            $t++;
            $b = ($pos < strlen($string)) ? $string[$pos] : "";
        } while (
            $b >= "\x80" && $b <= "\xBF" && // continuation bytes
            ($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences
            $pos // stop once the start of the string has been reached
        );
        // get the expected length of the character starting at the last examined byte
        $l = self::l($b);
        if ($l==$t) {
            // if the expected length matches the number of examined bytes, the character is valid
            return $pos;
        } elseif ($errMode==self::M_SKIP) {
            // if we're expected to ignore invalid sequences:
            if ($l && $t > $l) {
                // if the last examined byte is the start of a sequence and we have more than the right amount of continuation characters, return the current position
                return $pos;
            } elseif (!$pos) {
                // if we're already at the start of the string, give up
                return $pos;
            } else {
                // otherwise skip over the last examined byte and start over
                $pos--;
                goto start;
            }
        } elseif ($errMode==self::M_REPLACE) {
            // if we're expected to treat invalid sequences as replacement characters, return
            // the offset of the most recently examined byte if it is the start of a multi-byte
            // sequence, or the starting offset otherwise: in the latter case the original byte
            // is itself a replacement character position
            return ($l > 1) ? $pos: $s;
        } else {
            // if the character is invalid and we're expected to halt, halt
            throw new \Exception;
        }
    }

    /** Count the characters in $string, reading from byte offset $start
     *
     * Counting stops once the read position reaches byte offset $end (the end of the string if
     * omitted) or the end of input; zero is returned if the byte range from $start to $end is empty
     */
    public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int {
        $errMode = $errMode ?? self::$errMode;
        $end = $end ?? strlen($string);
        if (substr($string, $start, ($end - $start)) =="") {
            return 0;
        }
        $count = 0;
        $pos = $start;
        do {
            $c = self::get($string, $pos, $pos, $errMode); // the current position is overwritten with the next position, by reference
        } while ($c != "" && ++$count && $pos < $end);
        return $count;
    }

    /** Retrieve up to $length characters from $str, reading from byte offset $start
     *
     * $next is a variable in which to store the byte offset reported by the last character read
     *
     * If $length is omitted or not positive the empty string is returned, and $next is set to
     * the result of synchronizing $start to a character boundary
     */
    public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string {
        $errMode = $errMode ?? self::$errMode;
        if ($length > 0) {
            $pos = $start;
            $buffer = "";
            do {
                $c = self::get($str, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference
                $buffer .= $c;
            } while (--$length && $c != ""); // stop after we have collected the desired number of characters, or reached EOF
            $next = $pos;
            return $buffer;
        } else {
            $next = self::sync($str, $start, $errMode);
            return "";
        }
    }

    /**
     * Returns the expected byte length of a UTF-8 character starting with byte $b
     *
     * If the byte is not the start of a UTF-8 sequence, 0 is returned
     */
    protected static function l(string $b): int {
        if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character
            return 2;
        } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character
            return 3;
        } elseif ($b >= "\xF0" && $b <= "\xF7") { // four-byte character
            return 4;
        } elseif ($b < "\x80") { // ASCII byte, or end of input (the empty string also compares below "\x80"): one-byte character
            return 1;
        } else { // invalid byte
            return 0;
        }
    }
}