# If c is U+002F (/), then set state to authority state.
$state = self::ST_AUTHORITY;
} else {
# Otherwise, set state to path state, and decrease pointer by one.
$state = self::ST_PATH;
$pos = $posPrev;
$pointer--;
}
break;
// invalid or unimplemented state
default:
default:
// FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead
// FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead
return $url;
return $url;
@ -620,11 +635,67 @@ class URI {
/**
 * Returns the UTF-8 character at byte offset $pos along with the byte offset of the next character
 *
 * If the input is malformed at $pos the returned character is $replacementChar (unless $throwOnError is set);
 * at or beyond the end of input an empty string is returned along with ($pos + 1)
 *
 * @param string $input The byte string to read from
 * @param int $pos The byte offset at which to start reading
 * @param bool $throwOnError Whether to throw an exception upon encountering an invalid byte rather than substituting $replacementChar
 * @param string $replacementChar The value to return in place of a malformed byte sequence
 * @return array A two-member list containing the character read and the byte offset of the next character
 * @throws \Exception If $throwOnError is true and an invalid byte is encountered
 */
protected function getChar(string $input, int $pos, bool $throwOnError = false, $replacementChar = "\u{FFFD}"): array {
    $end = strlen($input);
    // get the byte at the specified position
    // FIXME: write a function to read a whole UTF-8 byte sequence rather than single bytes
    $b = ($pos < $end) ? $input[$pos] : "";
    // FIXME: return an EOF object and ($pos + 1) if we're at the end of the byte stream
    if ($b === "" || $b < "\x80") {
        // if the byte is an ASCII character or end of input, simply return it
        return [$b, $pos + 1];
    }
    // otherwise determine the byte-length of the UTF-8 character
    $l = $this->getCharLength($b);
    if (!$l) {
        if ($throwOnError) {
            // if the byte is invalid and we're supposed to halt, halt
            throw new \Exception("Invalid UTF-8 byte at offset $pos");
        }
        // if the byte is invalid and we're supposed to continue, skip any further invalid bytes and return a replacement character instead
        return [$replacementChar, $this->skipInvalidBytes($input, $pos + 1)];
    }
    // otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered
    $buffer = $b;
    do {
        // an empty string stands in for end of input, which truncates the sequence and is therefore treated as invalid
        $b = (++$pos < $end) ? $input[$pos] : "";
        if ($b === "" || $b < "\x80" || $b > "\xBF") {
            if ($throwOnError) {
                // if the byte is invalid and we're supposed to halt, halt
                throw new \Exception("Invalid UTF-8 byte at offset $pos");
            }
            // if the byte is invalid and we're supposed to continue, skip any bytes (starting with this one) which cannot start a character, then return a replacement character
            return [$replacementChar, $this->skipInvalidBytes($input, $pos)];
        }
        // if the byte is valid, add it to the buffer
        $buffer .= $b;
    } while (strlen($buffer) < $l);
    // return the filled buffer and the position of the next byte
    return [$buffer, $pos + 1];
}

/**
 * Returns the offset of the first byte at or after $pos which can begin a new character
 *
 * Scanning stops at an ASCII byte, at a byte for which getCharLength() reports a non-zero length, or at the end of input;
 * the bounds check ensures the scan always terminates and never reads past the end of $input
 */
private function skipInvalidBytes(string $input, int $pos): int {
    $end = strlen($input);
    while ($pos < $end && $input[$pos] >= "\x80" && !$this->getCharLength($input[$pos])) {
        $pos++;
    }
    return $pos;
}
/**
* Returns the total expected length of the UTF-8 character starting with byte $b
*
* If the byte is not the start of a UTF-8 sequence, 0 is returned
*/
protected function getCharLength(string $b): int {
if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character