From 6f98db1931d05e519786f80bfdc073047a5b8c25 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 14 May 2020 23:08:59 -0400 Subject: [PATCH] Various fixes to path collapsing and host parsing Host parsing breaks PSR-7 tests; these are probably "wrong" --- lib/Url.php | 152 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 63 deletions(-) diff --git a/lib/Url.php b/lib/Url.php index dec16d8..dc05f15 100644 --- a/lib/Url.php +++ b/lib/Url.php @@ -47,6 +47,7 @@ PCRE; protected const HOST_PATTERN = '/^(\[[a-f0-9:\.]*\]|[^:]*)(?::([^\/]*))?$/si'; protected const USER_PATTERN = '/^([^:]*)(?::(.*))?$/s'; protected const SCHEME_PATTERN = '/^(?:[a-z][a-z0-9\.\-\+]*|)$/i'; + protected const IPV4_PATTERN = '/^[\.xX0-9a-fA-F\x{ff10}-\x{ff19}\x{ff21}-\x{ff26}\x{ff41}-\x{ff46}\x{ff38}\x{ff58}\x{ff0e}]*$/u'; // matches ASCII and fullwidth equivalents protected const IPV6_PATTERN = '/^\[[^\]]+\]$/i'; protected const PORT_PATTERN = '/^\d*$/'; protected const FORBIDDEN_HOST_PATTERN = '/[\x{00}\t\n\r #%\/:\?@\[\]\\\]/'; @@ -54,6 +55,7 @@ PCRE; protected const WINDOWS_AUTHORITY_PATTERN = '/^[\/\\\\]{1,2}[a-zA-Z][:|]$/'; protected const WINDOWS_PATH_PATTERN = '/(?:^|\/)([a-zA-Z])[:|]($|[\/#\?].*)/'; protected const WHITESPACE_CHARS = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x20"; + protected const FULLWIDTH_CHARS = ["\u{FF10}" => "0", "\u{FF11}" => "1", "\u{FF12}" => "2", "\u{FF13}" => "3", "\u{FF14}" => "4", "\u{FF15}" => "5", "\u{FF16}" => "6", "\u{FF17}" => "7", "\u{FF18}" => "8", "\u{FF19}" => "9", "\u{FF21}" => "A", "\u{FF22}" => "B", "\u{FF23}" => "C", "\u{FF24}" => "D", "\u{FF25}" => "E", "\u{FF26}" => "F", "\u{FF41}" => "a", "\u{FF42}" => "b", "\u{FF43}" => "c", "\u{FF44}" => "d", "\u{FF45}" => "e", "\u{FF46}" => "f", "\u{FF38}" => "X", "\u{FF58}" => "x", "\u{FF0E}" => "."]; protected const PERCENT_ENCODE_SETS = [ 'C0' => "", 'fragment' => " \"<>`", @@ -139,47 +141,64 @@ PCRE; // URI is a URN; continue processing } } elseif ($this->scheme === "file" && preg_match(self::WINDOWS_AUTHORITY_PATTERN, $authority)) { + // URI is something like file://C:/path $path = $authority.$path; $authority = "//"; } if ($authority) { - $authority = substr($authority, 2); - if (($cleft = strrpos($authority, "@")) !== false) { - if (preg_match(self::USER_PATTERN, substr($authority, 0, $cleft), $match)) { + $auth = substr($authority, 2); + if (($cleft = strrpos($auth, "@")) !== false) { + if (preg_match(self::USER_PATTERN, substr($auth, 0, $cleft), $match)) { $this->setUser($match[1]); $this->setPass($match[2] ?? ""); } - if (preg_match(self::HOST_PATTERN, substr($authority, $cleft + 1), $match)) { + if (preg_match(self::HOST_PATTERN, substr($auth, $cleft + 1), $match)) { $this->setHost($match[1]); $this->setPort($match[2] ?? ""); } - } elseif (preg_match(self::HOST_PATTERN, $authority, $match)) { + } elseif (preg_match(self::HOST_PATTERN, $auth, $match)) { $this->setHost($match[1]); $this->setPort($match[2] ?? ""); } } - if (!$scheme && $base) { - // the effective URL scheme must be known to correctly process the path - $base = $base ?? new static($baseUrl); - $this->setScheme($base->scheme); + // resolve with the base, if necessary + if ($base) { + // if the base is a URN without a path-like path, this is invalid + if (!$scheme && !$authority && (strlen($path) || $query) && $base->isUrn() && ($base->path[0] ?? "") !== "/") { + throw new \InvalidArgumentException("Base URI cannot be a URN"); + } + if (!$authority && $this->scheme === $base->scheme) { + if ($base->host !== null) { + $this->host = $base->host; + $this->port = $base->port; + $this->user = $base->user; + $this->pass = $base->pass; + } + $this->setPath($path, $base->path); + if (!strlen($path)) { + if (!$query) { + $this->query = $base->query; + } + } + } else { + $this->setPath($path); + } + } else { + $this->setPath($path); } - $this->setPath($path); if ($query) { $this->setQuery(substr($query, 1)); } if ($fragment) { $this->setFragment(substr($fragment, 1)); } - if ((!$scheme || ($this->host === null && $this->specialScheme)) && strlen($baseUrl ?? "")) { - $this->resolve($base ?? new static($baseUrl)); - } } else { throw new \InvalidArgumentException("String is not a valid URI"); } } public function isUrn(): bool { - return $this->host === null && !$this->specialScheme; + return $this->scheme && $this->host === null && !$this->specialScheme; } public function getAuthority() { @@ -359,10 +378,11 @@ PCRE; } } - protected function setPath(string $value): void { + protected function setPath(string $value, string $base = ""): void { if ($this->specialScheme) { - $value = $this->collapsePath(str_replace("\\", "/", $value)); + $value = str_replace("\\", "/", $value); } + $value = $this->collapsePath($value, $base); $this->path = $this->percentEncode($value, $this->isUrn() ? "C0" : "path"); } @@ -382,19 +402,35 @@ PCRE; } } - protected function collapsePath(string $path): string { - if (preg_match("<^/?$>", $path)) { - return $path; + protected function collapsePath(string $path, string $base = ""): string { + $winDrive = ""; + if ($this->scheme === "file") { + if (preg_match(self::WINDOWS_PATH_PATTERN, $path, $match)) { + // If a Windows drive letter is present, the host is implicitly localhost + $this->setHost(""); + $path = "/".$match[1].":".$match[2]; + $winDrive = $match[1].":"; + } elseif (preg_match(self::WINDOWS_PATH_PATTERN, $base, $match)) { + $this->setHost(""); + $winDrive = $match[1].":"; + } } - if ($this->scheme === "file" && preg_match(self::WINDOWS_PATH_PATTERN, $path, $match)) { - // If a Windows drive letter is present, the host is implicitly localhost - $this->setHost(""); - $path = "/".$match[1].":".$match[2]; + if ($path === "/") { + return $path; + } elseif ($path === "") { + return $base; } $abs = $path[0] === "/"; $dir = $path[-1] === "/"; $term = $dir || preg_match("i", $path); $path = explode("/", substr($path, (int) $abs, strlen($path) - ($abs + $dir))); + if (!$abs && strlen($base)) { + // also consider the base path, if appropriate + $abs = $base[0] === "/"; + $base = explode("/", substr($base, (int) $abs)); + array_pop($base); + $path = array_merge($base, $path); + } $out = []; foreach ($path as $s) { if ($s === "" && !$out && $this->scheme === "file") { @@ -410,6 +446,12 @@ PCRE; $out[] = $s; } } + if ($winDrive && ($out[0] ?? "") !== $winDrive) { + if (!$out) { + $term = true; + } + array_unshift($out, $winDrive); + } if (!$out) { return $abs ? "/" : ""; } @@ -432,39 +474,6 @@ PCRE; return $out; } - protected function resolve(self $base): void { - if ($base->isUrn()) { - throw new \InvalidArgumentException("URL base must not be a Uniform Resource Name"); - } - [$scheme, $host, $user, $pass, $port, $path, $query, $fragment] = [$base->scheme, $base->host, $base->user, $base->pass, $base->port, $base->path, $base->query, $base->fragment]; - $this->scheme = $this->scheme ?? $scheme; - if (is_null($this->host)) { - $this->host = $host; - $this->user = $user; - $this->pass = $pass; - $this->port = $port; - if (!strlen($this->path ?? "")) { - $this->path = $path; - if (is_null($this->query)) { - $this->query = $query; - if (is_null($this->fragment)) { - $this->fragment = $fragment; - } - } - } elseif (strlen($path)) { - if ($this->path[0] !== "/") { - if ($path[-1] === "/") { - $this->path = $path.$this->path; - } else { - $len = strrpos($path, "/"); - $len = ($len === false) ? 0 : $len + 1; - $this->path = substr($path, 0, $len).$this->path; - } - } - } - } - } - protected function parseHost(?string $host): ?string { if (strlen($host ?? "")) { if ($host[0] === "[") { @@ -486,15 +495,31 @@ PCRE; return $this->percentEncode($host, "C0"); } $host = rawurldecode($host); - $idn = $this->parseIPv4($host) ?? idn_to_ascii($host, \IDNA_NONTRANSITIONAL_TO_ASCII | \IDNA_CHECK_BIDI | \IDNA_CHECK_CONTEXTJ, \INTL_IDNA_VARIANT_UTS46); - if ( - $idn === false - || preg_match(self::FORBIDDEN_HOST_PATTERN, $idn) - || idn_to_utf8($idn, \IDNA_NONTRANSITIONAL_TO_UNICODE | \IDNA_USE_STD3_RULES, \INTL_IDNA_VARIANT_UTS46) === false - ) { + $domain = null; + if (preg_match(self::IPV4_PATTERN, $host)) { + $domain = $this->parseIPv4($host); + } + if ($domain === null && function_exists("idn_to_ascii") && function_exists("idn_to_utf8")) { + $domain = []; + foreach (explode(".", $host) as $label) { + if (!strlen($label)) { + $domain[] = $label; + } else { + $label = idn_to_ascii($label, \IDNA_NONTRANSITIONAL_TO_ASCII | \IDNA_CHECK_BIDI | \IDNA_CHECK_CONTEXTJ, \INTL_IDNA_VARIANT_UTS46); + if ($label === false || idn_to_utf8($label, \IDNA_NONTRANSITIONAL_TO_UNICODE | \IDNA_USE_STD3_RULES, \INTL_IDNA_VARIANT_UTS46) === false) { + $domain = false; + break; + } + $domain[] = $label; + } + } + $domain = is_array($domain) ? implode(".", $domain) : $domain; + } + $domain = $domain ?? strtolower($host); + if ($domain === false || preg_match(self::FORBIDDEN_HOST_PATTERN, $domain)) { throw new \InvalidArgumentException("Invalid host in URL"); } - return strtolower($idn); + return $domain; } elseif ($this->specialScheme && $this->scheme !== "file") { throw new \InvalidArgumentException("Invalid host in URL"); } @@ -504,6 +529,7 @@ PCRE; protected function parseIPv4(string $input) { // first parse the address; this is a literal implementation of https://url.spec.whatwg.org/#concept-ipv4-parser assert(strlen($input)); + $input = str_replace(array_keys(self::FULLWIDTH_CHARS), self::FULLWIDTH_CHARS, $input); $input = explode(".", $input); if ($input[sizeof($input) - 1] === "" && sizeof($input) > 1) { array_pop($input);