diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..7032c85 --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +J. King https://jkingweb.ca/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..81c5948 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2018 J. King et al. + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/composer.json b/composer.json index 4ca8ece..9add91f 100644 --- a/composer.json +++ b/composer.json @@ -1,8 +1,8 @@ { - "name": "jkingweb/uniform", + "name": "mensbeam/utf8", "type": "library", - "description": "A collection of URL tools compatible with the WHATWG URL standard", - "keywords": ["url","uri","whatwg"], + "description": "A set of tools for working with UTF-8 strings without mbstring or intl", + "keywords": ["utf-8", "utf8"], "license": "MIT", "authors": [ { @@ -17,7 +17,7 @@ }, "autoload": { "psr-4": { - "JKingWeb\\URI\\": "lib/" + "MensBeam\\UTF8\\": "lib/" } } } diff --git a/composer.lock b/composer.lock index e9d1567..203ef38 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", "This file is @generated automatically" ], - "content-hash": "9c891512410ea881b9cf5ae2a3873fcb", + "content-hash": "8394b8ab5a816511b1fad1a40758b186", "packages": [], "packages-dev": [], "aliases": [], diff --git a/lib/URI.php b/lib/URI.php deleted file mode 100644 index f3da81a..0000000 --- a/lib/URI.php +++ /dev/null @@ -1,893 +0,0 @@ - 21, - 'file' => null, - 'gopher' => 70, - 'http' => 80, - 'https' => 443, - 'ws' => 80, - 'wss' => 443, - ]; - /** Default port numbers for all schemes registered with IANA - * - * If a scheme is in the IANA registry but not listed here, then it likely did not exist when this list was last compiled - */ - const SCHEME_DEFAULT_PORTS = [ - 'aaa' => 3868, - 'aaas' => 5658, - 'about' => null, - 'acap' => 674, - 'acct' => null, - 'acr' => null, - 'adiumxtra' => null, - 'afp' => 548, - 'afs' => null, - 'aim' => null, - 'appdata' => null, - 'apt' => null, - 'attachment' => null, - 'aw' => null, - 'barion' => null, - 'beshare' => null, // multiple ports - 'bitcoin' => null, - 'blob' => null, - 'bolo' => null, - 'browserext' => null, - 'callto' => null, - 'cap' => 1026, - 'chrome' => null, - 'chrome-extension' => null, - 'cid' => null, - 'coap' => 5683, - 'coap+tcp' => 5683, - 'coap+ws' => null, // it's unclear which port applies here: WebSocket would imply 80, but the specification is ambiguous - 'coaps' => 5684, - 'coaps+tcp' => 5684, - 'coaps+ws' => null, // it's unclear which port applies here: WebSocket would imply 443, but the specification is ambiguous - 'com-eventbrite-attendee' => null, - 'content' => null, - 'conti' => null, - 'crid' => null, - 'cvs' => null, - 'data' => null, - 'dav' => null, - 'diaspora' => null, - 'dict' => 2628, - 'dis' => null, - 'dlna-playcontainer' => null, - 'dlna-playsingle' => null, - 'dns' => 53, - 'dntp' => null, - 'dtn' => null, - 'dvb' => null, - 'ed2k' => null, - 'example' => null, // not an actual scheme - 'facetime' => null, - 'fax' => null, - 'feed' => null, - 'feedready' => null, - 'file' => null, - 'filesystem' => null, - 'finger' => 79, - 'fish' => 22, // an application of SSH - 'ftp' => 21, - 'geo' => null, - 'gg' => null, - 'git' => 9418, // per https://git-scm.com/book/en/v2/Git-on-the-Server-The-Protocols#_the_git_protocol - 'gizmoproject' => null, - 'go' => 1096, - 'gopher' => 70, - 'graph' => null, - 'graphdata' => null, // not in the IANA list, but included as part of the registration for 'graph' - 'gtalk' => null, - 'h323' => null, // several ports are defined in the IANA port registry---unclear which (if any) is implied by the scheme - 'ham' => null, - 'hcp' => null, - 'http' => 80, - 'https' => 443, - 'hxxp' => null, // it would be inappropriate to modify these URLs - 'hxxps' => null, // it would be inappropriate to modify these URLs - 'hydrazone' => null, - 'iax' => null, - 'icap' => 1344, - 'icon' => null, - 'im' => null, - 'imap' => 143, - 'info' => null, - 'iotdisco' => null, - 'ipn' => null, - 'ipp' => 631, - 'ipps' => 631, - 'irc' => 6667, - 'irc6' => 6667, - 'ircs' => 994, - 'iris' => null, - 'iris.beep' => null, - 'iris.lwz' => null, - 'iris.xpc' => null, - 'iris.xpcs' => null, - 'isostore' => null, - 'itms' => null, - 'jabber' => null, - 'jar' => null, - 'jms' => null, - 'keyparc' => null, - 'lastfm' => null, - 'ldap' => 389, - 'ldaps' => 636, - 'lvlt' => null, - 'magnet' => null, - 'mailserver' => null, - 'mailto' => null, - 'maps' => null, - 'market' => null, - 'message' => null, - 'microsoft.windows.camera' => null, - 'microsoft.windows.camera.multipicker' => null, - 'microsoft.windows.camera.picker' => null, - 'mid' => null, - 'mms' => null, - 'modem' => null, - 'mongodb' => null, - 'moz' => null, - 'ms-access' => null, - 'ms-browser-extension' => null, - 'ms-drive-to' => null, - 'ms-enrollment' => null, - 'ms-excel' => null, - 'ms-gamebarservices' => null, - 'ms-gamingoverlay' => null, - 'ms-getoffice' => null, - 'ms-help' => null, - 'ms-infopath' => null, - 'ms-inputapp' => null, - 'ms-lockscreencomponent-config' => null, - 'ms-media-stream-id' => null, - 'ms-mixedrealitycapture' => null, - 'ms-officeapp' => null, - 'ms-people' => null, - 'ms-project' => null, - 'ms-powerpoint' => null, - 'ms-publisher' => null, - 'ms-restoretabcompanion' => null, - 'ms-search-repair' => null, - 'ms-secondary-screen-controller' => null, - 'ms-secondary-screen-setup' => null, - // what the hell, Microsoft? Seriously? - 'ms-settings' => null, - 'ms-settings-airplanemode' => null, - 'ms-settings-bluetooth' => null, - 'ms-settings-camera' => null, - 'ms-settings-cellular' => null, - 'ms-settings-cloudstorage' => null, - 'ms-settings-connectabledevices' => null, - 'ms-settings-displays-topology' => null, - 'ms-settings-emailandaccounts' => null, - 'ms-settings-language' => null, - 'ms-settings-location' => null, - 'ms-settings-lock' => null, - 'ms-settings-nfctransactions' => null, - 'ms-settings-notifications' => null, - 'ms-settings-power' => null, - 'ms-settings-privacy' => null, - 'ms-settings-proximity' => null, - 'ms-settings-screenrotation' => null, - 'ms-settings-wifi' => null, - 'ms-settings-workplace' => null, - // You could have just defined one ms-app scheme, you know, never mind one ms-settings... - 'ms-spd' => null, - 'ms-sttoverlay' => null, - 'ms-transit-to' => null, - 'ms-useractivityset' => null, - 'ms-virtualtouchpad' => null, - 'ms-visio' => null, - 'ms-walk-to' => null, - 'ms-whiteboard' => null, - 'ms-whiteboard-cmd' => null, - 'ms-word' => null, - 'msnim' => null, - 'msrp' => null, // explicitly no default port - 'msrps' => null, // explicitly no default port - 'mtqp' => 1038, - 'mumble' => 64738, - 'mupdate' => null, - 'mvn' => null, - 'news' => 119, - 'nfs' => 2049, - 'ni' => null, - 'nih' => null, - 'nntp' => 119, - 'notes' => null, - 'ocf' => null, - 'oid' => null, - 'onenote' => null, - 'onenote-cmd' => null, - 'opaquelocktoken' => null, - 'pack' => null, - 'palm' => null, - 'paparazzi' => null, - 'pkcs11' => null, - 'platform' => null, - 'pop' => 110, - 'pres' => null, - 'prospero' => 1525, - 'proxy' => null, - 'pwid' => null, - 'psyc' => 4404, - 'qb' => null, - 'query' => null, - 'redis' => 6379, - 'rediss' => 6379, - 'reload' => 6084, - 'res' => null, - 'resource' => null, - 'rmi' => 1099, - 'rsync' => 873, - 'rtmfp' => 1935, - 'rtmp' => 1935, - 'rtsp' => 554, - 'rtsps' => 322, - 'rtspu' => 554, - 'secondlife' => null, - 'service' => null, - 'session' => null, - 'sftp' => 22, // application of SSH - 'sgn' => null, - 'shttp' => 80, - 'sieve' => 4190, - 'sip' => 5060, - 'sips' => 5061, - 'skype' => null, - 'smb' => 445, - 'sms' => null, - 'smtp' => 25, - 'snews' => 563, - 'snmp' => 161, - 'soap.beep' => null, // explicit port bypasses SRV lookups - 'soap.beeps' => null, // explicit port bypasses SRV lookups - 'soldat' => null, // port required - 'spiffe' => null, // ports are not used - 'spotify' => null, - 'ssh' => 22, - 'steam' => null, - 'stun' => null, - 'stuns' => null, - 'submit' => 587, // SMTP submission - 'svn' => null, - 'tag' => null, - 'teamspeak' => 8767, - 'tel' => null, - 'teliaeid' => null, - 'telnet' => 23, - 'tftp' => 69, - 'things' => null, - 'thismessage' => null, // not an actual scheme - 'tip' => 3372, - 'tn3270' => 23, - 'tool' => null, - 'turn' => 3478, - 'turns' => 5349, - 'tv' => null, - 'udp' => null, // multiple independent uses - 'unreal' => 7777, // assumed based on Unreal Tournament - 'urn' => null, - 'ut2004' => 7777, - 'v-event' => null, - 'vemmi' => 575, - 'ventrilo' => 3784, - 'videotex' => 516, - 'vnc' => 5900, - 'view-source' => null, - 'wais' => 210, - 'webcal' => null, // unclear if port 80 or 443 should be assumed - 'wpid' => null, // alias of pwid - 'ws' => 80, - 'wss' => 443, - 'wtai' => null, - 'wyciwyg' => null, - 'xcon' => null, // not resolvable - 'xcon-userid' => null, - 'xfire' => null, - 'xmlrpc.beep' => null, // explicit port bypasses SRV lookups - 'xmlrpc.beeps' => null, // explicit port bypasses SRV lookups - 'xmpp' => null, - 'xri' => null, // unclear; historical - 'ymsgr' => null, - 'z39.50' => 210, - 'z39.50r' => 210, - 'z39.50s' => 210, - - ]; - /** - * List of schemes which use locator syntax when they are actually names - * - * If a scheme has no documentation or examples at all, it is assumed to be among these schemes - */ - const SCHEME_NONSTANDARD = [ - "diaspora", - "dvb", - "ed2k", - "facetime", - "gizmoproject", - "hcp", - "hydrazone", - "keyparc", - "lastfm", - "market", - "mongodb", // more than one host can be specified, which is non-standard - "moz", // no documentation - "moz-icon", // not in the IANA registry; equivalent to 'icon' - "ms-enrollment", - "ms-gamebarservices", // no documentation - "ms-gamingoverlay", // no documentation - "ms-getoffice", // no documentation - "ms-help", - "ms-inputapp", // no documentation - "ms-lockscreencomponent-config", // no documentation - "ms-mixedrealitycapture", - "ms-officeapp", - "ms-restoretabcompanion", - "ms-sttoverlay", - "ms-useractivityset", - "ms-whiteboard", - "ms-whiteboard-cmd", - "ms-windows-store", // not in the IANA registry; documentation shows it uses incorrect syntax - "notes", - "onenote-cmd", - "pack", - "psyc", // authority section is non-standard - "qb", // no documentation - "res", - "resource", - "teliaeid", - "wtai", - "wyciwyg", - ]; - - // character class identifiers - const CHR_C0 = 1; - const CHR_C0_OR_SPACE = 2; - const CHR_ASCII_ALPHA = 3; - const CHR_ASCII_ALPHANUM = 4; - - // percent-encoding character set identifiers - const PE_CONTROL = 1; - const PE_FRAGMENT = 2; - const PE_PATH = 3; - const PE_USERINFO = 4; - - const PE_SET = [ - self::PE_CONTROL => [], - self::PE_FRAGMENT => [" ", '"', '`', "<", ">"], - self::PE_PATH => [" ", '"', '`', "<", ">", "#", "?", "{", "}"], - self::PE_USERINFO => [" ", '"', '`', "<", ">", "#", "?", "{", "}", "/", ":", ";", "=", "@", "[", "]", "\\"], - ]; - - // error condition identifiers - const ERR_LEADING_OR_TRAILING_WS = 1; - const ERR_EMBEDDED_NEWLINE_OR_TAB = 2; - const ERR_INVALID_SCHEME_CHAR = 3; - const ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH = 4; - const ERR_RELATIVE_URL = 5; - const ERR_SCHEME_EXPECTING_SLASH = 6; - const ERR_BACKSLASH_FORBIDDEN = 7; - const ERR_UNEXPECTED_SLASH = 8; - const ERR_UNEXPECTED_AT = 9; - const ERR_AUTHORITY_WITHOUT_HOST = 10; - - // parser state identifiers - const ST_SCHEME_START = 1; - const ST_SCHEME = 2; - const ST_NO_SCHEME = 3; - const ST_FILE = 4; - const ST_SPECIAL_RELATIVE_OR_AUTHORITY = 5; - const ST_SPECIAL_AUTHORITY_SLASHES = 6; - const ST_PATH_OR_AUTHORITY = 7; - const ST_CANNOT_BE_A_BASE_URL_PATH = 8; - const ST_FRAGMENT = 9; - const ST_RELATIVE = 10; - const ST_SPECIAL_AUTHORITY_IGNORE_SLASHES = 11; - const ST_AUTHORITY = 12; - const ST_PATH = 13; - const ST_RELATIVE_SLASH = 14; - const ST_HOST = 15; - const ST_HOSTNAME = 16; - const ST_FILE_HOST = 17; - - public static $confUseAllSchemePorts = false; - - public $scheme = null; - public $path = []; - - public $cannotBeBaseUrl = false; - public $err = []; - - protected function basicUrlParser(string $input, self $base = null, string $encodingOverride = "", self $url = null, int $stateOverride = 0) { - $pointer = -1; - $pos = -1; - // start by getting the byte length of the input - // this will later function as a signal for end of input - // initially it also functions to show whether characters - // have been removed by stripping operations - $eof = strlen($input); - // begin algorithm - # If url is not given: - if (!$url) { - # Set url to a new URL. - $url = new self; - # Remove any leading and trailing C0 or space from input. - $input = trim($input, " \u{0}\u{1}\u{2}\u{3}\u{4}\u{5}\u{6}\u{7}\u{8}\u{9}\u{A}\u{B}\u{C}\u{D}\u{E}\u{F}\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\u{18}\u{19}\u{1A}\u{1B}\u{1C}\u{1D}\u{1E}\u{1F}"); - # If input contains any leading or trailing C0 control or space, validation error. - if (strlen($input) != $oef) { - $url->err[] = [$pointer, $pos, self::ERR_LEADING_OR_TRAILING_WS]; - $eof = strlen($input); - } - } - # Remove all ASCII tab or newline from input. - $input = str_replace(["\r", "\n", "\t",], "", $input); - # If input contains any ASCII tab or newline, validation error. - if (strlen($input) != $oef) { - $url->err[] = [$pointer, $pos, self::ERR_EMBEDDED_NEWLINE_OR_TAB]; - $eof = strlen($input); - } - # Let state be state override if given, or scheme start state otherwise. - $state = $stateOverride ?? self::ST_SCHEME_START; - # Let encoding be UTF-8. If encoding override is given, set encoding to the result of getting an output encoding from encoding override. - $encoding = ($encodingOverride=="") ? "utf-8" : $this->getOutputEncoding($encodingOverride); - # Let buffer be the empty string. - $buffer = ""; - # Let the @ flag, [] flag, and passwordTokenSeenFlag be unset. - $flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false; - # Let pointer be a pointer to first code point in input. - // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer - $pos = 0; - # Keep running the following state machine by switching on state. - # If after a run pointer points to the EOF code point, go to the next step. - # Otherwise, increase pointer by one and continue with the state machine. - // Note: the state machine is designed to run once even with an empty string - do { - # Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to. - // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; - // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character - $c = UTF8::get($input, $pos, $posNext); - // when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we - // accomplish this by going back to this label, which skips the increment at the end of each iteration - processChar: - // switch on state - switch ($state) { - # scheme start state - case self::ST_SCHEME_START: - if ($this->isChr($c, self::CHR_ASCII_ALPHA)) { - # If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state. - $buffer .= strtolower($c); - $state = self::ST_SCHEME; - } elseif (!$stateOverride) { - # Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one. - $state = self::ST_NO_SCHEME; - goto processChar; - } else { - # Otherwise, validation error, return failure. - # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. - $url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR]; - $url->failure = true; - return $url; - } - break; - # scheme state - case self::ST_SCHEME: - if ($this->isChr($c, self::CHR_ASCII_ALPHANUM) || strpos("+-.", $c) !== false) { - # If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer. - $buffer .= strtolower($c); - } elseif ($c==":") { - # Otherwise, if c is U+003A (:), then: - # If state override is given, then: - if ($stateOverride && - # If url’s scheme is a special scheme and buffer is not a special scheme, then return. - ($this->isSpecial($url) && !$this->isSpecial($buffer)) || - # If url’s scheme is not a special scheme and buffer is a special scheme, then return. - (!$this->isSpecial($url) && $this->isSpecial($buffer)) || - # If url includes credentials or has a non-null port, and buffer is "file", then return. - ($buffer=="file" || !is_null($url->port) || strlen((string) $url->username) || strlen((string) $url->password)) || - # If url’s scheme is "file" and its host is an empty host or null, then return. - ($url->scheme=="file" && !strlen((string) $url->host)) - ) { - return $url; - } - # Set url’s scheme to buffer. - $url->scheme = $buffer; - # If state override is given, then: - if ($stateOverride) { - # If url’s port is url’s scheme’s default port, then set url’s port to null. - // OPTIONAL DEVIATION: we optionally allow any registered scheme's port to be defaulted - $portList = (self::$confUseAllSchemePorts ? self::SCHEME_DEFAULT_PORTS : self::SCHEME_SPECIAL); - $url->port = (isset($portList[$url->scheme]) && $url->port==$portList[$url->scheme]) ? null : $url->port; - # Return. - return $url; - } - # Set buffer to the empty string. - $buffer = ""; - if ($url->scheme=="file") { - # If url’s scheme is "file", then: - if (substr($input, $posNext, 2) !== "//") { - # If remaining does not start with "//", validation error. - $url->err[] = [$pointer + 1, $posNext, self::ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH]; - } - # Set state to file state. - $state = self::ST_FILE; - } elseif ($base && $base->scheme===$url->scheme && $this->isSpecial($url)) { - # Otherwise, if url is special, base is non-null, and base’s scheme is equal to url’s scheme, set state to special relative or authority state. - # NOTE: This means that base’s cannot-be-a-base-URL flag is unset. - $state = self::ST_SPECIAL_RELATIVE_OR_AUTHORITY; - } elseif ($this->isSpecial($url)) { - # Otherwise, if url is special, set state to special authority slashes state. - $state = self::ST_SPECIAL_AUTHORITY_SLASHES; - } elseif ($input[$posNext]=="/") { - # Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by one. - $state = self::ST_PATH_OR_AUTHORITY; - $pos = $posNext; - $pointer++; - } else { - # Otherwise, set url’s cannot-be-a-base-URL flag, append an empty string to url’s path, and set state to cannot-be-a-base-URL path state. - $url->cannotBeBaseUrl = true; - $url->path[] = ""; - $state = self::ST_CANNOT_BE_A_BASE_URL_PATH; - } - } elseif (!$stateOverride) { - # Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input). - $buffer = ""; - $state = self::ST_NO_SCHEME; - $pos = 0; - $pointer = 0; - goto processChar; - } else { - # Otherwise, validation error, return failure. - # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute. - $url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR]; - $url->failure = true; - return $url; - } - break; - # no scheme state - case self::ST_NO_SCHEME: - if (!$base || ($base->cannotBeBaseUrl && $c != "#")) { - # If base is null, or base’s cannot-be-a-base-URL flag is set and c is not U+0023 (#), validation error, return failure. - $url->err[] = [$pointer, $pos, self::ERR_RELATIVE_URL]; - $url->failure = true; - return $url; - } elseif ($base->cannotBeBaseUrl && $c=="#") { - # Otherwise, if base’s cannot-be-a-base-URL flag is set and c is U+0023 (#) - $this->map($url, $base, [ - # set url’s scheme to base’s scheme, - "scheme", - # url’s path to a copy of base’s path, - "path", - # url’s query to base’s query, - "query", - ]); - # url’s fragment to the empty string, - $url->fragment = ""; - # set url’s cannot-be-a-base-URL flag, - $url->cannotBeBaseUrl = true; - # and set state to fragment state. - $state = self::ST_FRAGMENT; - } elseif ($base->scheme != "file") { - # Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by one. - $state = self::ST_RELATIVE; - goto processChar; - } else { - # Otherwise, set state to file state and decrease pointer by one. - $state = self::ST_FILE; - goto processChar; - } - break; - # special relative or authority state - case self::ST_SPECIAL_RELATIVE_OR_AUTHORITY: - if ($c=="/" && $input[$posNext]=="/") { - # If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one. - $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; - } else { - # Otherwise, validation error, set state to relative state and decrease pointer by one. - $url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH]; - $state = self::ST_RELATIVE; - goto processChar; - } - break; - # path or authority state - case self::ST_PATH_OR_AUTHORITY: - if ($c=="/") { - # If c is U+002F (/), then set state to authority state. - $state = self::ST_AUTHORITY; - } else { - # Otherwise, set state to path state, and decrease pointer by one. - $state = self::ST_PATH; - goto processChar; - } - break; - # relative state - case self::ST_RELATIVE: - # Set url’s scheme to base’s scheme, and then, switching on c: - $url->scheme = $base->scheme; - switch ($c) { - case "": # The EOF code point - $this->map($url, $base, [ - # Set url’s username to base’s username, - "username", - # url’s password to base’s password, - "password", - # url’s host to base’s host, - "host", - # url’s port to base’s port, - "port", - # url’s path to a copy of base’s path, - "path", - # and url’s query to base’s query. - "query", - ]); - break; - case "/": - # Set state to relative slash state. - $state = self::ST_RELATIVE_SLASH; - break; - case "?": - $this->map($url, $base, [ - # Set url’s username to base’s username, - "username", - # url’s password to base’s password, - "password", - # url’s host to base’s host, - "host", - # url’s port to base’s port, - "port", - # url’s path to a copy of base’s path, - "path", - ]); - # url’s query to the empty string, - $url->query = ""; - # and state to query state. - $state = self::ST_QUERY; - break; - case "#": - $this->map($url, $base, [ - # Set url’s username to base’s username, - "username", - # url’s password to base’s password, - "password", - # url’s host to base’s host, - "host", - # url’s port to base’s port, - "port", - # url’s path to a copy of base’s path, - "path", - # url’s query to base’s query, - "query", - ]); - # url’s fragment to the empty string, - $url->fragment = ""; - # and state to fragment state. - $state = self::ST_FRAGMENT; - break; - default: - if ($this->isSpecial($url) && $c = "\\") { - # If url is special and c is U+005C (\), validation error, set state to relative slash state. - $url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN]; - $state = self::ST_RELATIVE_SLASH; - } else { - # Otherwise, run these steps: - $this->map($url, $base, [ - # Set url’s username to base’s username, - "username", - # url’s password to base’s password, - "password", - # url’s host to base’s host, - "host", - # url’s port to base’s port, - "port", - # url’s path to a copy of base’s path, - "path", - ]); - # and then remove url’s path’s last item, if any. - array_pop($url->path); - # Set state to path state, and decrease pointer by one. - $state = self::ST_PATH; - goto processChar; - } - } - break; - # relative slash state - case self::ST_RELATIVE_SLASH: - if ($this->isSpecial($url) && ($c=="/" || $c=="\\")) { - # If url is special and c is U+002F (/) or U+005C (\), then: - # If c is U+005C (\), validation error. - if ($c=="\\") { - $url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN]; - } - # Set state to special authority ignore slashes state. - $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; - } elseif ($c="/") { - # Otherwise, if c is U+002F (/), then set state to authority state. - $state = self::ST_AUTHORITY; - } else { - # Otherwise, - $this->map($url, $base, [ - # set url’s username to base’s username, - "username", - # url’s password to base’s password, - "password", - # url’s host to base’s host, - "host", - # url’s port to base’s port, - "port", - ]); - # state to path state, and then, decrease pointer by one. - $state = self::ST_PATH; - goto processChar; - } - break; - # special authority slashes state - case self::ST_SPECIAL_AUTHORITY_SLASHES: - if ($c=="/" && substr($input, $posNext, 1)=="/") { - # If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one. - $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; - // this has the effect of increasing the pointer by one - $pos = $posNext; - $c = UTF8::get($input, $pos, $posNext); - } else { - # Otherwise, validation error, set state to special authority ignore slashes state, and decrease pointer by one. - $url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH]; - $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES; - goto processChar; - } - break; - # special authority ignore slashes state - case self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES: - if ($c != "/" && $c != "\\") { - # If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by one. - $state = self::ST_AUTHORITY; - goto processChar; - } else { - # Otherwise, validation error. - $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_SLASH]; - } - break; - # authority state - case self::ST_AUTHORITY: - if($c=="@") { - # If c is U+0040 (@), then: - # Validation error. - $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_AT]; - # If the @ flag is set, prepend "%40" to buffer. - if ($flagAtSign) { - $buffer = "%40".$buffer; - } - # Set the @ flag. - $flagAtSign = true; - # For each codePoint in buffer: - $bPos = 0; - $bEof = strlen($buffer); - while ($bPos < $bEof) { - $codePoint = UTF8::get($buffer, $bPos, $bPosNext); - # If codePoint is U+003A (:) and passwordTokenSeenFlag is unset, then set passwordTokenSeenFlag and continue. - if ($codePoint==":" && !$flagPasswordTokenSeen) { - $flagPasswordTokenSeen = true; - // "continue" in the specification means going to the next character - $bPos = $bPosNext; - continue; - } - # Let encodedCodePoints be the result of running UTF-8 percent encode codePoint using the userinfo percent-encode set. - $encodedCodePoints = $this->percentEncode($codePoint, self::PE_USERINFO); - if ($flagPasswordTokenSeen) { - # If passwordTokenSeenFlag is set, then append encodedCodePoints to url’s password. - $url->password .= $encodedCodePoints; - } else { - # Otherwise, append encodedCodePoints to url’s username. - $url->username .= $encodedCodePoints; - } - } - # Set buffer to the empty string. - $buffer = ""; - } elseif ( - # Otherwise, if one of the following is true - in_array($c, ["/", "?", "#", ""]) || # c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#) - ($this->isSpecial($url) && $c=="\\") # url is special and c is U+005C (\) - ) { - # then: - # If @ flag is set and buffer is the empty string, validation error, return failure. - if ($flagAtSign && $buffer = "") { - $url->err[] = [$pointer, $pos, self::ERR_AUTHORITY_WITHOUT_HOST]; - $url->failure = true; - return $url; - } - # Decrease pointer by the number of code points in buffer plus one, - // DEVIATION: as with decreasing the pointer by one to reprocess characters, we'll ignore the "plus one" here for self-consitency - // we first count the number of characters in the buffer - $c = UTF8::len($buffer); - // then decrease the advisorty character pointer by that amount - $pointer -= $c; - // then seek back the same number of characters - $pos = UTF8::seek($input, -$c, $pos); - // and finally consume that character to get the position of the next character to continue the loop correctly - $c = UTF8::get($input, $pos, $posNext); - # set buffer to the empty string, and set state to host state. - $buffer = ""; - $state = self::ST_HOST; - // and reprocess the first character in the erstwhile buffer - goto processChar; - } else { - # Otherwise, append c to buffer. - $buffer .= $c; - } - break; - // invalid or unimplemented state - default: - // FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead - return $url; - } - # If after a run pointer points to the EOF code point, go to the next step. - # Otherwise, increase pointer by one and continue with the state machine. - // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; - // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character - $pos = $posNext; - $pointer++; - } while ($pos <= $eof); - } - - protected function getOutputEncoding(string $encoding): string { - // FIXME: stub - return $encoding; - } - - protected function isChr(string $c, int $chrClass) { - switch ($chrClass) { - case self::CHR_C0: - return ($c <= "\u{1F}"); - case self::CHR_C0_OR_SPACE: - return ($c == " " || $c <= "\u{1F}"); - case self::CHR_ASCII_ALPHA: - return (($c >= "A" && $c <= "Z") || ($c >= "z" && $c <= "z")); - case self::CHR_ASCII_ALPHANUM: - return ( - ($c == (string) (int) $c) || // digits - ($c >= "A" && $c <= "Z") || // uppercase alphabetic - ($c >= "z" && $c <= "z") // lowercase alphabetic - ); - default: - throw new \Exception; - } - } - - protected function map(URI $to, URI $from, array $properties): bool { - foreach ($properties as $prop) { - $to->$prop = $from->prop; - } - return true; - } - - protected function isSpecial($test): bool { - $test = ($est instanceof URI) ? $test->scheme : $test; - return array_key_exists($test, self::SCHEME_SPECIAL); - } - - protected function percentEncode(string $bytes, int $set): string { - if (!isset(self::PE_SET[$set])) { - throw new \Exception; - } - $buffer = ""; - foreach ($bytes as $b) { - if ($b < "\x20" || $b > "\x7E" || in_array($b, self::PE_SET[$set])) { - $buffer .= "%".strtoupper(bin2hex($b)); - } else { - $buffer .= $b; - } - } - return $buffer; - } -} diff --git a/lib/URL.php b/lib/URL.php deleted file mode 100644 index 74ce6a8..0000000 --- a/lib/URL.php +++ /dev/null @@ -1,21 +0,0 @@ -basicUrlParser($base); - if (is_null($parsedBase)) { - throw new \TypeError; - } - } - $parsedUrl = $this->basicUrlParser($url, $parsedBase); - var_export($parsedUrl); - } -} diff --git a/lib/UTF8.php b/lib/UTF8.php index ddc402d..4349d19 100644 --- a/lib/UTF8.php +++ b/lib/UTF8.php @@ -4,7 +4,7 @@ * See LICENSE and AUTHORS files for details */ declare(strict_types=1); -namespace JKingWeb\URI; +namespace MensBeam\UTF8; abstract class UTF8 { public static $replacementChar = "\u{FFFD}";