From 7d13a6c3b71336d1cfa3278565f7f637a6d31ca6 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 18 Apr 2018 11:45:09 -0400 Subject: [PATCH] Four more states --- lib/URI.php | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) diff --git a/lib/URI.php b/lib/URI.php index a1172b0..f3da81a 100644 --- a/lib/URI.php +++ b/lib/URI.php @@ -359,6 +359,19 @@ class URI { const CHR_ASCII_ALPHA = 3; const CHR_ASCII_ALPHANUM = 4; + // percent-encoding character set identifiers + const PE_CONTROL = 1; + const PE_FRAGMENT = 2; + const PE_PATH = 3; + const PE_USERINFO = 4; + + const PE_SET = [ + self::PE_CONTROL => [], + self::PE_FRAGMENT => [" ", '"', '`', "<", ">"], + self::PE_PATH => [" ", '"', '`', "<", ">", "#", "?", "{", "}"], + self::PE_USERINFO => [" ", '"', '`', "<", ">", "#", "?", "{", "}", "/", ":", ";", "=", "@", "[", "]", "\\"], + ]; + // error condition identifiers const ERR_LEADING_OR_TRAILING_WS = 1; const ERR_EMBEDDED_NEWLINE_OR_TAB = 2; @@ -367,6 +380,9 @@ class URI { const ERR_RELATIVE_URL = 5; const ERR_SCHEME_EXPECTING_SLASH = 6; const ERR_BACKSLASH_FORBIDDEN = 7; + const ERR_UNEXPECTED_SLASH = 8; + const ERR_UNEXPECTED_AT = 9; + const ERR_AUTHORITY_WITHOUT_HOST = 10; // parser state identifiers const ST_SCHEME_START = 1; @@ -383,6 +399,9 @@ class URI { const ST_AUTHORITY = 12; const ST_PATH = 13; const ST_RELATIVE_SLASH = 14; + const ST_HOST = 15; + const ST_HOSTNAME = 16; + const ST_FILE_HOST = 17; public static $confUseAllSchemePorts = false; @@ -683,6 +702,130 @@ class URI { } } break; + # relative slash state + case self::ST_RELATIVE_SLASH: + if ($this->isSpecial($url) && ($c=="/" || $c=="\\")) { + # If url is special and c is U+002F (/) or U+005C (\), then: + # If c is U+005C (\), validation error. + if ($c=="\\") { + $url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN]; + } + # Set state to special authority ignore slashes state. + $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; + } elseif ($c="/") { + # Otherwise, if c is U+002F (/), then set state to authority state. + $state = self::ST_AUTHORITY; + } else { + # Otherwise, + $this->map($url, $base, [ + # set url’s username to base’s username, + "username", + # url’s password to base’s password, + "password", + # url’s host to base’s host, + "host", + # url’s port to base’s port, + "port", + ]); + # state to path state, and then, decrease pointer by one. + $state = self::ST_PATH; + goto processChar; + } + break; + # special authority slashes state + case self::ST_SPECIAL_AUTHORITY_SLASHES: + if ($c=="/" && substr($input, $posNext, 1)=="/") { + # If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one. + $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; + // this has the effect of increasing the pointer by one + $pos = $posNext; + $c = UTF8::get($input, $pos, $posNext); + } else { + # Otherwise, validation error, set state to special authority ignore slashes state, and decrease pointer by one. + $url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH]; + $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; + goto processChar; + } + break; + # special authority ignore slashes state + case self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES: + if ($c != "/" && $c != "\\") { + # If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by one. + $state = self::ST_AUTHORITY; + goto processChar; + } else { + # Otherwise, validation error. + $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_SLASH]; + } + break; + # authority state + case self::ST_AUTHORITY: + if($c=="@") { + # If c is U+0040 (@), then: + # Validation error. + $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_AT]; + # If the @ flag is set, prepend "%40" to buffer. + if ($flagAtSign) { + $buffer = "%40".$buffer; + } + # Set the @ flag. + $flagAtSign = true; + # For each codePoint in buffer: + $bPos = 0; + $bEof = strlen($buffer); + while ($bPos < $bEof) { + $codePoint = UTF8::get($buffer, $bPos, $bPosNext); + # If codePoint is U+003A (:) and passwordTokenSeenFlag is unset, then set passwordTokenSeenFlag and continue. + if ($codePoint==":" && !$flagPasswordTokenSeen) { + $flagPasswordTokenSeen = true; + // "continue" in the specification means going to the next character + $bPos = $bPosNext; + continue; + } + # Let encodedCodePoints be the result of running UTF-8 percent encode codePoint using the userinfo percent-encode set. + $encodedCodePoints = $this->percentEncode($codePoint, self::PE_USERINFO); + if ($flagPasswordTokenSeen) { + # If passwordTokenSeenFlag is set, then append encodedCodePoints to url’s password. + $url->password .= $encodedCodePoints; + } else { + # Otherwise, append encodedCodePoints to url’s username. + $url->username .= $encodedCodePoints; + } + } + # Set buffer to the empty string. + $buffer = ""; + } elseif ( + # Otherwise, if one of the following is true + in_array($c, ["/", "?", "#", ""]) || # c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#) + ($this->isSpecial($url) && $c=="\\") # url is special and c is U+005C (\) + ) { + # then: + # If @ flag is set and buffer is the empty string, validation error, return failure. + if ($flagAtSign && $buffer = "") { + $url->err[] = [$pointer, $pos, self::ERR_AUTHORITY_WITHOUT_HOST]; + $url->failure = true; + return $url; + } + # Decrease pointer by the number of code points in buffer plus one, + // DEVIATION: as with decreasing the pointer by one to reprocess characters, we'll ignore the "plus one" here for self-consitency + // we first count the number of characters in the buffer + $c = UTF8::len($buffer); + // then decrease the advisorty character pointer by that amount + $pointer -= $c; + // then seek back the same number of characters + $pos = UTF8::seek($input, -$c, $pos); + // and finally consume that character to get the position of the next character to continue the loop correctly + $c = UTF8::get($input, $pos, $posNext); + # set buffer to the empty string, and set state to host state. + $buffer = ""; + $state = self::ST_HOST; + // and reprocess the first character in the erstwhile buffer + goto processChar; + } else { + # Otherwise, append c to buffer. + $buffer .= $c; + } + break; // invalid or unimplemented state default: // FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead @@ -732,4 +875,19 @@ class URI { $test = ($est instanceof URI) ? $test->scheme : $test; return array_key_exists($test, self::SCHEME_SPECIAL); } + + protected function percentEncode(string $bytes, int $set): string { + if (!isset(self::PE_SET[$set])) { + throw new \Exception; + } + $buffer = ""; + foreach ($bytes as $b) { + if ($b < "\x20" || $b > "\x7E" || in_array($b, self::PE_SET[$set])) { + $buffer .= "%".strtoupper(bin2hex($b)); + } else { + $buffer .= $b; + } + } + return $buffer; + } }