Split off UTF-8 tools from URL parser

This commit is contained in:
J. King 2018-04-23 11:04:40 -04:00
parent 30162e8525
commit aa0d6ce20e
7 changed files with 29 additions and 920 deletions

1
AUTHORS Normal file
View file

@ -0,0 +1 @@
J. King https://jkingweb.ca/

22
LICENSE Normal file
View file

@ -0,0 +1,22 @@
Copyright (c) 2018 J. King et al.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

View file

@ -1,8 +1,8 @@
{
"name": "jkingweb/uniform",
"name": "mensbeam/utf8",
"type": "library",
"description": "A collection of URL tools compatible with the WHATWG URL standard",
"keywords": ["url","uri","whatwg"],
"description": "A set of tools for working with UTF-8 strings without mbstring or intl",
"keywords": ["utf-8", "utf8"],
"license": "MIT",
"authors": [
{
@ -17,7 +17,7 @@
},
"autoload": {
"psr-4": {
"JKingWeb\\URI\\": "lib/"
"MensBeam\\UTF8\\": "lib/"
}
}
}

2
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "9c891512410ea881b9cf5ae2a3873fcb",
"content-hash": "8394b8ab5a816511b1fad1a40758b186",
"packages": [],
"packages-dev": [],
"aliases": [],

View file

@ -1,893 +0,0 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\URI;
class URI {
/**
 * List of "special" schemes and their default port numbers
 *
 * Keys are scheme names in lowercase (the parser lowercases a scheme before looking it up).
 * A null value, as for 'file', marks a special scheme which has no default port.
 * isSpecial() tests membership by key only, so null-valued entries still count as special.
 */
const SCHEME_SPECIAL = [
'ftp' => 21,
'file' => null,
'gopher' => 70,
'http' => 80,
'https' => 443,
'ws' => 80,
'wss' => 443,
];
/** Default port numbers for all schemes registered with IANA
*
* If a scheme is in the IANA registry but not listed here, then it likely did not exist when this list was last compiled
*
* Keys are scheme names in lowercase; values are the default port as an integer, or null where no single
* default port is recorded (the inline notes give the reason for some entries).
* The parser consults this list instead of SCHEME_SPECIAL only when the static $confUseAllSchemePorts option is true.
*/
const SCHEME_DEFAULT_PORTS = [
'aaa' => 3868,
'aaas' => 5658,
'about' => null,
'acap' => 674,
'acct' => null,
'acr' => null,
'adiumxtra' => null,
'afp' => 548,
'afs' => null,
'aim' => null,
'appdata' => null,
'apt' => null,
'attachment' => null,
'aw' => null,
'barion' => null,
'beshare' => null, // multiple ports
'bitcoin' => null,
'blob' => null,
'bolo' => null,
'browserext' => null,
'callto' => null,
'cap' => 1026,
'chrome' => null,
'chrome-extension' => null,
'cid' => null,
'coap' => 5683,
'coap+tcp' => 5683,
'coap+ws' => null, // it's unclear which port applies here: WebSocket would imply 80, but the specification is ambiguous
'coaps' => 5684,
'coaps+tcp' => 5684,
'coaps+ws' => null, // it's unclear which port applies here: WebSocket would imply 443, but the specification is ambiguous
'com-eventbrite-attendee' => null,
'content' => null,
'conti' => null,
'crid' => null,
'cvs' => null,
'data' => null,
'dav' => null,
'diaspora' => null,
'dict' => 2628,
'dis' => null,
'dlna-playcontainer' => null,
'dlna-playsingle' => null,
'dns' => 53,
'dntp' => null,
'dtn' => null,
'dvb' => null,
'ed2k' => null,
'example' => null, // not an actual scheme
'facetime' => null,
'fax' => null,
'feed' => null,
'feedready' => null,
'file' => null,
'filesystem' => null,
'finger' => 79,
'fish' => 22, // an application of SSH
'ftp' => 21,
'geo' => null,
'gg' => null,
'git' => 9418, // per https://git-scm.com/book/en/v2/Git-on-the-Server-The-Protocols#_the_git_protocol
'gizmoproject' => null,
'go' => 1096,
'gopher' => 70,
'graph' => null,
'graphdata' => null, // not in the IANA list, but included as part of the registration for 'graph'
'gtalk' => null,
'h323' => null, // several ports are defined in the IANA port registry---unclear which (if any) is implied by the scheme
'ham' => null,
'hcp' => null,
'http' => 80,
'https' => 443,
'hxxp' => null, // it would be inappropriate to modify these URLs
'hxxps' => null, // it would be inappropriate to modify these URLs
'hydrazone' => null,
'iax' => null,
'icap' => 1344,
'icon' => null,
'im' => null,
'imap' => 143,
'info' => null,
'iotdisco' => null,
'ipn' => null,
'ipp' => 631,
'ipps' => 631,
'irc' => 6667,
'irc6' => 6667,
'ircs' => 994,
'iris' => null,
'iris.beep' => null,
'iris.lwz' => null,
'iris.xpc' => null,
'iris.xpcs' => null,
'isostore' => null,
'itms' => null,
'jabber' => null,
'jar' => null,
'jms' => null,
'keyparc' => null,
'lastfm' => null,
'ldap' => 389,
'ldaps' => 636,
'lvlt' => null,
'magnet' => null,
'mailserver' => null,
'mailto' => null,
'maps' => null,
'market' => null,
'message' => null,
'microsoft.windows.camera' => null,
'microsoft.windows.camera.multipicker' => null,
'microsoft.windows.camera.picker' => null,
'mid' => null,
'mms' => null,
'modem' => null,
'mongodb' => null,
'moz' => null,
'ms-access' => null,
'ms-browser-extension' => null,
'ms-drive-to' => null,
'ms-enrollment' => null,
'ms-excel' => null,
'ms-gamebarservices' => null,
'ms-gamingoverlay' => null,
'ms-getoffice' => null,
'ms-help' => null,
'ms-infopath' => null,
'ms-inputapp' => null,
'ms-lockscreencomponent-config' => null,
'ms-media-stream-id' => null,
'ms-mixedrealitycapture' => null,
'ms-officeapp' => null,
'ms-people' => null,
'ms-project' => null,
'ms-powerpoint' => null,
'ms-publisher' => null,
'ms-restoretabcompanion' => null,
'ms-search-repair' => null,
'ms-secondary-screen-controller' => null,
'ms-secondary-screen-setup' => null,
// what the hell, Microsoft? Seriously?
'ms-settings' => null,
'ms-settings-airplanemode' => null,
'ms-settings-bluetooth' => null,
'ms-settings-camera' => null,
'ms-settings-cellular' => null,
'ms-settings-cloudstorage' => null,
'ms-settings-connectabledevices' => null,
'ms-settings-displays-topology' => null,
'ms-settings-emailandaccounts' => null,
'ms-settings-language' => null,
'ms-settings-location' => null,
'ms-settings-lock' => null,
'ms-settings-nfctransactions' => null,
'ms-settings-notifications' => null,
'ms-settings-power' => null,
'ms-settings-privacy' => null,
'ms-settings-proximity' => null,
'ms-settings-screenrotation' => null,
'ms-settings-wifi' => null,
'ms-settings-workplace' => null,
// You could have just defined one ms-app scheme, you know, never mind one ms-settings...
'ms-spd' => null,
'ms-sttoverlay' => null,
'ms-transit-to' => null,
'ms-useractivityset' => null,
'ms-virtualtouchpad' => null,
'ms-visio' => null,
'ms-walk-to' => null,
'ms-whiteboard' => null,
'ms-whiteboard-cmd' => null,
'ms-word' => null,
'msnim' => null,
'msrp' => null, // explicitly no default port
'msrps' => null, // explicitly no default port
'mtqp' => 1038,
'mumble' => 64738,
'mupdate' => null,
'mvn' => null,
'news' => 119,
'nfs' => 2049,
'ni' => null,
'nih' => null,
'nntp' => 119,
'notes' => null,
'ocf' => null,
'oid' => null,
'onenote' => null,
'onenote-cmd' => null,
'opaquelocktoken' => null,
'pack' => null,
'palm' => null,
'paparazzi' => null,
'pkcs11' => null,
'platform' => null,
'pop' => 110,
'pres' => null,
'prospero' => 1525,
'proxy' => null,
'pwid' => null,
'psyc' => 4404,
'qb' => null,
'query' => null,
'redis' => 6379,
'rediss' => 6379,
'reload' => 6084,
'res' => null,
'resource' => null,
'rmi' => 1099,
'rsync' => 873,
'rtmfp' => 1935,
'rtmp' => 1935,
'rtsp' => 554,
'rtsps' => 322,
'rtspu' => 554,
'secondlife' => null,
'service' => null,
'session' => null,
'sftp' => 22, // application of SSH
'sgn' => null,
'shttp' => 80,
'sieve' => 4190,
'sip' => 5060,
'sips' => 5061,
'skype' => null,
'smb' => 445,
'sms' => null,
'smtp' => 25,
'snews' => 563,
'snmp' => 161,
'soap.beep' => null, // explicit port bypasses SRV lookups
'soap.beeps' => null, // explicit port bypasses SRV lookups
'soldat' => null, // port required
'spiffe' => null, // ports are not used
'spotify' => null,
'ssh' => 22,
'steam' => null,
'stun' => null,
'stuns' => null,
'submit' => 587, // SMTP submission
'svn' => null,
'tag' => null,
'teamspeak' => 8767,
'tel' => null,
'teliaeid' => null,
'telnet' => 23,
'tftp' => 69,
'things' => null,
'thismessage' => null, // not an actual scheme
'tip' => 3372,
'tn3270' => 23,
'tool' => null,
'turn' => 3478,
'turns' => 5349,
'tv' => null,
'udp' => null, // multiple independent uses
'unreal' => 7777, // assumed based on Unreal Tournament
'urn' => null,
'ut2004' => 7777,
'v-event' => null,
'vemmi' => 575,
'ventrilo' => 3784,
'videotex' => 516,
'vnc' => 5900,
'view-source' => null,
'wais' => 210,
'webcal' => null, // unclear if port 80 or 443 should be assumed
'wpid' => null, // alias of pwid
'ws' => 80,
'wss' => 443,
'wtai' => null,
'wyciwyg' => null,
'xcon' => null, // not resolvable
'xcon-userid' => null,
'xfire' => null,
'xmlrpc.beep' => null, // explicit port bypasses SRV lookups
'xmlrpc.beeps' => null, // explicit port bypasses SRV lookups
'xmpp' => null,
'xri' => null, // unclear; historical
'ymsgr' => null,
'z39.50' => 210,
'z39.50r' => 210,
'z39.50s' => 210,
];
/**
* List of schemes which use locator syntax when they are actually names
*
* If a scheme has no documentation or examples at all, it is assumed to be among these schemes
*
* This is a plain list of lowercase scheme names; the inline notes record why an entry was included.
* NOTE(review): nothing in this class reads the list yet; presumably it is reserved for parser states which are not implemented so far.
*/
const SCHEME_NONSTANDARD = [
"diaspora",
"dvb",
"ed2k",
"facetime",
"gizmoproject",
"hcp",
"hydrazone",
"keyparc",
"lastfm",
"market",
"mongodb", // more than one host can be specified, which is non-standard
"moz", // no documentation
"moz-icon", // not in the IANA registry; equivalent to 'icon'
"ms-enrollment",
"ms-gamebarservices", // no documentation
"ms-gamingoverlay", // no documentation
"ms-getoffice", // no documentation
"ms-help",
"ms-inputapp", // no documentation
"ms-lockscreencomponent-config", // no documentation
"ms-mixedrealitycapture",
"ms-officeapp",
"ms-restoretabcompanion",
"ms-sttoverlay",
"ms-useractivityset",
"ms-whiteboard",
"ms-whiteboard-cmd",
"ms-windows-store", // not in the IANA registry; documentation shows it uses incorrect syntax
"notes",
"onenote-cmd",
"pack",
"psyc", // authority section is non-standard
"qb", // no documentation
"res",
"resource",
"teliaeid",
"wtai",
"wyciwyg",
];
// character class identifiers
// these are passed as the $chrClass argument of isChr() to select which test is applied to a character
const CHR_C0 = 1; // C0 control characters, i.e. anything up to U+001F
const CHR_C0_OR_SPACE = 2; // as CHR_C0, plus U+0020 (space)
const CHR_ASCII_ALPHA = 3; // ASCII letters of either case
const CHR_ASCII_ALPHANUM = 4; // ASCII letters of either case, plus ASCII digits
// percent-encoding character set identifiers
// these are passed as the $set argument of percentEncode() and index the PE_SET table below
const PE_CONTROL = 1;
const PE_FRAGMENT = 2;
const PE_PATH = 3;
const PE_USERINFO = 4;
/**
 * Printable ASCII characters which must be percent-encoded for each character set
 *
 * Every set implicitly also contains the C0 controls and all bytes above U+007E: percentEncode()
 * tests for those by range, so only the additional printable characters are listed here.
 * Each set is a superset of the one before it, as defined by the WHATWG URL standard.
 */
const PE_SET = [
    self::PE_CONTROL => [],
    self::PE_FRAGMENT => [" ", '"', '`', "<", ">"],
    self::PE_PATH => [" ", '"', '`', "<", ">", "#", "?", "{", "}"],
    // the userinfo set is the path set plus / : ; = @ [ \ ] ^ and |
    self::PE_USERINFO => [" ", '"', '`', "<", ">", "#", "?", "{", "}", "/", ":", ";", "=", "@", "[", "\\", "]", "^", "|"],
];
// error condition identifiers
// the parser records each validation error in the URL's $err list as [pointer, byte offset, one of these identifiers]
const ERR_LEADING_OR_TRAILING_WS = 1; // input had leading or trailing C0 control or space characters
const ERR_EMBEDDED_NEWLINE_OR_TAB = 2; // input contained ASCII tab or newline characters
const ERR_INVALID_SCHEME_CHAR = 3; // scheme could not be parsed while a state override was in effect
const ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH = 4; // "file:" was not followed by "//"
const ERR_RELATIVE_URL = 5; // input has no scheme and there is no usable base URL
const ERR_SCHEME_EXPECTING_SLASH = 6; // a special scheme was not followed by "//"
const ERR_BACKSLASH_FORBIDDEN = 7; // a backslash was used where a slash is expected
const ERR_UNEXPECTED_SLASH = 8; // extra slashes or backslashes before the authority
const ERR_UNEXPECTED_AT = 9; // an "@" was seen in the authority
const ERR_AUTHORITY_WITHOUT_HOST = 10; // credentials were given but no host followed them
// parser state identifiers
// these select the branch taken by the state machine in basicUrlParser(), and may be passed to it as $stateOverride;
// zero is deliberately unused, as it signifies "no state override"
const ST_SCHEME_START = 1;
const ST_SCHEME = 2;
const ST_NO_SCHEME = 3;
const ST_FILE = 4;
const ST_SPECIAL_RELATIVE_OR_AUTHORITY = 5;
const ST_SPECIAL_AUTHORITY_SLASHES = 6;
const ST_PATH_OR_AUTHORITY = 7;
const ST_CANNOT_BE_A_BASE_URL_PATH = 8;
const ST_FRAGMENT = 9;
const ST_RELATIVE = 10;
const ST_SPECIAL_AUTHORITY_IGNORE_SLASHES = 11;
const ST_AUTHORITY = 12;
const ST_PATH = 13;
const ST_RELATIVE_SLASH = 14;
const ST_HOST = 15;
const ST_HOSTNAME = 16;
const ST_FILE_HOST = 17;
// the relative state switches to the query state, so the identifier must exist even though the state itself is not yet implemented
const ST_QUERY = 18;
/** @var bool Whether default ports are looked up in SCHEME_DEFAULT_PORTS (true) or only in SCHEME_SPECIAL (false) */
public static $confUseAllSchemePorts = false;
/** @var string|null The URL's scheme, lowercased; null until one has been parsed */
public $scheme = null;
/** @var string|null The username; the parser appends percent-encoded characters to it */
public $username = null;
/** @var string|null The password; the parser appends percent-encoded characters to it */
public $password = null;
/** @var mixed The host; null until one has been set */
public $host = null;
/** @var int|null The port; null when absent or equal to the scheme's default */
public $port = null;
/** @var array The URL's path segments */
public $path = [];
/** @var string|null The query; null when absent */
public $query = null;
/** @var string|null The fragment; null when absent */
public $fragment = null;
/** @var bool The "cannot-be-a-base-URL" flag */
public $cannotBeBaseUrl = false;
/** @var bool Set to true by the parser when parsing failed */
public $failure = false;
/** @var array Validation errors, each recorded as [pointer, byte offset, self::ERR_* identifier] */
public $err = [];
/**
 * Parses a URL string using the state machine of the WHATWG "basic URL parser"
 *
 * Comment lines starting with "#" quote the steps of the specification; lines starting with "//" are
 * implementation notes. Only the states up to and including the authority state are implemented: upon
 * reaching any other state the URL record built so far is returned as it stands.
 *
 * @param string $input The string to parse
 * @param self|null $base The parsed base URL against which relative input is resolved, if any
 * @param string $encodingOverride An encoding label; the empty string signifies "not given", in which case UTF-8 is used
 * @param self|null $url An existing URL record to modify; a new record is created when omitted
 * @param int $stateOverride One of the self::ST_* identifiers at which to start parsing; zero signifies "not given"
 * @return self The URL record. Validation errors are appended to its $err list as [pointer, byte offset, error identifier], and its $failure property is set to true if parsing failed
 */
protected function basicUrlParser(string $input, self $base = null, string $encodingOverride = "", self $url = null, int $stateOverride = 0) {
    // $pointer is an advisory character index used only for error reporting, while $pos is the
    // byte offset actually used to read the input; both are -1 until parsing proper begins
    $pointer = -1;
    $pos = -1;
    // start by getting the byte length of the input
    // this will later function as a signal for end of input
    // initially it also functions to show whether characters
    // have been removed by stripping operations
    $eof = strlen($input);
    // begin algorithm
    # If url is not given:
    if (!$url) {
        # Set url to a new URL.
        $url = new self;
        # Remove any leading and trailing C0 or space from input.
        $input = trim($input, " \u{0}\u{1}\u{2}\u{3}\u{4}\u{5}\u{6}\u{7}\u{8}\u{9}\u{A}\u{B}\u{C}\u{D}\u{E}\u{F}\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\u{18}\u{19}\u{1A}\u{1B}\u{1C}\u{1D}\u{1E}\u{1F}");
        # If input contains any leading or trailing C0 control or space, validation error.
        if (strlen($input) != $eof) {
            $url->err[] = [$pointer, $pos, self::ERR_LEADING_OR_TRAILING_WS];
            $eof = strlen($input);
        }
    }
    # Remove all ASCII tab or newline from input.
    $input = str_replace(["\r", "\n", "\t",], "", $input);
    # If input contains any ASCII tab or newline, validation error.
    if (strlen($input) != $eof) {
        $url->err[] = [$pointer, $pos, self::ERR_EMBEDDED_NEWLINE_OR_TAB];
        $eof = strlen($input);
    }
    # Let state be state override if given, or scheme start state otherwise.
    // zero signifies "not given"; the parameter is never null, so a null-coalescing test would always yield zero
    $state = $stateOverride ?: self::ST_SCHEME_START;
    # Let encoding be UTF-8. If encoding override is given, set encoding to the result of getting an output encoding from encoding override.
    $encoding = ($encodingOverride=="") ? "utf-8" : $this->getOutputEncoding($encodingOverride);
    # Let buffer be the empty string.
    $buffer = "";
    # Let the @ flag, [] flag, and passwordTokenSeenFlag be unset.
    $flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false;
    # Let pointer be a pointer to first code point in input.
    // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer
    $pos = 0;
    $pointer = 0;
    # Keep running the following state machine by switching on state.
    # If after a run pointer points to the EOF code point, go to the next step.
    # Otherwise, increase pointer by one and continue with the state machine.
    // Note: the state machine is designed to run once even with an empty string
    do {
        # Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to.
        // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
        // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
        $c = UTF8::get($input, $pos, $posNext);
        // when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we
        // accomplish this by going back to this label, which skips the increment at the end of each iteration
        processChar:
        // switch on state
        switch ($state) {
            # scheme start state
            case self::ST_SCHEME_START:
                if ($this->isChr($c, self::CHR_ASCII_ALPHA)) {
                    # If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
                    $buffer .= strtolower($c);
                    $state = self::ST_SCHEME;
                } elseif (!$stateOverride) {
                    # Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one.
                    $state = self::ST_NO_SCHEME;
                    goto processChar;
                } else {
                    # Otherwise, validation error, return failure.
                    # NOTE: This indication of failure is used exclusively by Location object's protocol attribute.
                    $url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR];
                    $url->failure = true;
                    return $url;
                }
                break;
            # scheme state
            case self::ST_SCHEME:
                // the empty string (EOF) must be excluded explicitly: searching for an empty needle succeeds in PHP 8
                if ($this->isChr($c, self::CHR_ASCII_ALPHANUM) || ($c !== "" && strpos("+-.", $c) !== false)) {
                    # If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
                    $buffer .= strtolower($c);
                } elseif ($c==":") {
                    # Otherwise, if c is U+003A (:), then:
                    # If state override is given, then:
                    if ($stateOverride && (
                        # If url's scheme is a special scheme and buffer is not a special scheme, then return.
                        ($this->isSpecial($url) && !$this->isSpecial($buffer)) ||
                        # If url's scheme is not a special scheme and buffer is a special scheme, then return.
                        (!$this->isSpecial($url) && $this->isSpecial($buffer)) ||
                        # If url includes credentials or has a non-null port, and buffer is "file", then return.
                        ($buffer=="file" && (!is_null($url->port) || strlen((string) $url->username) || strlen((string) $url->password))) ||
                        # If url's scheme is "file" and its host is an empty host or null, then return.
                        ($url->scheme=="file" && !strlen((string) $url->host))
                    )) {
                        return $url;
                    }
                    # Set url's scheme to buffer.
                    $url->scheme = $buffer;
                    # If state override is given, then:
                    if ($stateOverride) {
                        # If url's port is url's scheme's default port, then set url's port to null.
                        // OPTIONAL DEVIATION: we optionally allow any registered scheme's port to be defaulted
                        $portList = (self::$confUseAllSchemePorts ? self::SCHEME_DEFAULT_PORTS : self::SCHEME_SPECIAL);
                        $url->port = (isset($portList[$url->scheme]) && $url->port==$portList[$url->scheme]) ? null : $url->port;
                        # Return.
                        return $url;
                    }
                    # Set buffer to the empty string.
                    $buffer = "";
                    if ($url->scheme=="file") {
                        # If url's scheme is "file", then:
                        if (substr($input, $posNext, 2) !== "//") {
                            # If remaining does not start with "//", validation error.
                            $url->err[] = [$pointer + 1, $posNext, self::ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH];
                        }
                        # Set state to file state.
                        $state = self::ST_FILE;
                    } elseif ($base && $base->scheme===$url->scheme && $this->isSpecial($url)) {
                        # Otherwise, if url is special, base is non-null, and base's scheme is equal to url's scheme, set state to special relative or authority state.
                        # NOTE: This means that base's cannot-be-a-base-URL flag is unset.
                        $state = self::ST_SPECIAL_RELATIVE_OR_AUTHORITY;
                    } elseif ($this->isSpecial($url)) {
                        # Otherwise, if url is special, set state to special authority slashes state.
                        $state = self::ST_SPECIAL_AUTHORITY_SLASHES;
                    } elseif (substr($input, $posNext, 1)=="/") {
                        # Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by one.
                        $state = self::ST_PATH_OR_AUTHORITY;
                        // step onto the slash and re-read, so that $posNext refers to the character after it
                        // and the increment at the end of the iteration then moves past the slash
                        $pos = $posNext;
                        $pointer++;
                        $c = UTF8::get($input, $pos, $posNext);
                    } else {
                        # Otherwise, set url's cannot-be-a-base-URL flag, append an empty string to url's path, and set state to cannot-be-a-base-URL path state.
                        $url->cannotBeBaseUrl = true;
                        $url->path[] = "";
                        $state = self::ST_CANNOT_BE_A_BASE_URL_PATH;
                    }
                } elseif (!$stateOverride) {
                    # Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
                    $buffer = "";
                    $state = self::ST_NO_SCHEME;
                    $pos = 0;
                    $pointer = 0;
                    // the label is past the point where the character is read, so the first character must be read again here
                    $c = UTF8::get($input, $pos, $posNext);
                    goto processChar;
                } else {
                    # Otherwise, validation error, return failure.
                    # NOTE: This indication of failure is used exclusively by Location object's protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute.
                    $url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR];
                    $url->failure = true;
                    return $url;
                }
                break;
            # no scheme state
            case self::ST_NO_SCHEME:
                if (!$base || ($base->cannotBeBaseUrl && $c != "#")) {
                    # If base is null, or base's cannot-be-a-base-URL flag is set and c is not U+0023 (#), validation error, return failure.
                    $url->err[] = [$pointer, $pos, self::ERR_RELATIVE_URL];
                    $url->failure = true;
                    return $url;
                } elseif ($base->cannotBeBaseUrl && $c=="#") {
                    # Otherwise, if base's cannot-be-a-base-URL flag is set and c is U+0023 (#)
                    $this->map($url, $base, [
                        # set url's scheme to base's scheme,
                        "scheme",
                        # url's path to a copy of base's path,
                        "path",
                        # url's query to base's query,
                        "query",
                    ]);
                    # url's fragment to the empty string,
                    $url->fragment = "";
                    # set url's cannot-be-a-base-URL flag,
                    $url->cannotBeBaseUrl = true;
                    # and set state to fragment state.
                    $state = self::ST_FRAGMENT;
                } elseif ($base->scheme != "file") {
                    # Otherwise, if base's scheme is not "file", set state to relative state and decrease pointer by one.
                    $state = self::ST_RELATIVE;
                    goto processChar;
                } else {
                    # Otherwise, set state to file state and decrease pointer by one.
                    $state = self::ST_FILE;
                    goto processChar;
                }
                break;
            # special relative or authority state
            case self::ST_SPECIAL_RELATIVE_OR_AUTHORITY:
                if ($c=="/" && substr($input, $posNext, 1)=="/") {
                    # If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one.
                    $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
                    // this has the effect of increasing the pointer by one
                    $pos = $posNext;
                    $pointer++;
                    $c = UTF8::get($input, $pos, $posNext);
                } else {
                    # Otherwise, validation error, set state to relative state and decrease pointer by one.
                    $url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH];
                    $state = self::ST_RELATIVE;
                    goto processChar;
                }
                break;
            # path or authority state
            case self::ST_PATH_OR_AUTHORITY:
                if ($c=="/") {
                    # If c is U+002F (/), then set state to authority state.
                    $state = self::ST_AUTHORITY;
                } else {
                    # Otherwise, set state to path state, and decrease pointer by one.
                    $state = self::ST_PATH;
                    goto processChar;
                }
                break;
            # relative state
            case self::ST_RELATIVE:
                # Set url's scheme to base's scheme, and then, switching on c:
                $url->scheme = $base->scheme;
                switch ($c) {
                    case "": # The EOF code point
                        $this->map($url, $base, [
                            # Set url's username to base's username,
                            "username",
                            # url's password to base's password,
                            "password",
                            # url's host to base's host,
                            "host",
                            # url's port to base's port,
                            "port",
                            # url's path to a copy of base's path,
                            "path",
                            # and url's query to base's query.
                            "query",
                        ]);
                        break;
                    case "/":
                        # Set state to relative slash state.
                        $state = self::ST_RELATIVE_SLASH;
                        break;
                    case "?":
                        $this->map($url, $base, [
                            # Set url's username to base's username,
                            "username",
                            # url's password to base's password,
                            "password",
                            # url's host to base's host,
                            "host",
                            # url's port to base's port,
                            "port",
                            # url's path to a copy of base's path,
                            "path",
                        ]);
                        # url's query to the empty string,
                        $url->query = "";
                        # and state to query state.
                        $state = self::ST_QUERY;
                        break;
                    case "#":
                        $this->map($url, $base, [
                            # Set url's username to base's username,
                            "username",
                            # url's password to base's password,
                            "password",
                            # url's host to base's host,
                            "host",
                            # url's port to base's port,
                            "port",
                            # url's path to a copy of base's path,
                            "path",
                            # url's query to base's query,
                            "query",
                        ]);
                        # url's fragment to the empty string,
                        $url->fragment = "";
                        # and state to fragment state.
                        $state = self::ST_FRAGMENT;
                        break;
                    default:
                        if ($this->isSpecial($url) && $c == "\\") {
                            # If url is special and c is U+005C (\), validation error, set state to relative slash state.
                            $url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN];
                            $state = self::ST_RELATIVE_SLASH;
                        } else {
                            # Otherwise, run these steps:
                            $this->map($url, $base, [
                                # Set url's username to base's username,
                                "username",
                                # url's password to base's password,
                                "password",
                                # url's host to base's host,
                                "host",
                                # url's port to base's port,
                                "port",
                                # url's path to a copy of base's path,
                                "path",
                            ]);
                            # and then remove url's path's last item, if any.
                            array_pop($url->path);
                            # Set state to path state, and decrease pointer by one.
                            $state = self::ST_PATH;
                            goto processChar;
                        }
                }
                break;
            # relative slash state
            case self::ST_RELATIVE_SLASH:
                if ($this->isSpecial($url) && ($c=="/" || $c=="\\")) {
                    # If url is special and c is U+002F (/) or U+005C (\), then:
                    # If c is U+005C (\), validation error.
                    if ($c=="\\") {
                        $url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN];
                    }
                    # Set state to special authority ignore slashes state.
                    $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
                } elseif ($c=="/") {
                    # Otherwise, if c is U+002F (/), then set state to authority state.
                    $state = self::ST_AUTHORITY;
                } else {
                    # Otherwise,
                    $this->map($url, $base, [
                        # set url's username to base's username,
                        "username",
                        # url's password to base's password,
                        "password",
                        # url's host to base's host,
                        "host",
                        # url's port to base's port,
                        "port",
                    ]);
                    # state to path state, and then, decrease pointer by one.
                    $state = self::ST_PATH;
                    goto processChar;
                }
                break;
            # special authority slashes state
            case self::ST_SPECIAL_AUTHORITY_SLASHES:
                if ($c=="/" && substr($input, $posNext, 1)=="/") {
                    # If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one.
                    $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
                    // this has the effect of increasing the pointer by one
                    $pos = $posNext;
                    $pointer++;
                    $c = UTF8::get($input, $pos, $posNext);
                } else {
                    # Otherwise, validation error, set state to special authority ignore slashes state, and decrease pointer by one.
                    $url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH];
                    $state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
                    goto processChar;
                }
                break;
            # special authority ignore slashes state
            case self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES:
                if ($c != "/" && $c != "\\") {
                    # If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by one.
                    $state = self::ST_AUTHORITY;
                    goto processChar;
                } else {
                    # Otherwise, validation error.
                    $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_SLASH];
                }
                break;
            # authority state
            case self::ST_AUTHORITY:
                if ($c=="@") {
                    # If c is U+0040 (@), then:
                    # Validation error.
                    $url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_AT];
                    # If the @ flag is set, prepend "%40" to buffer.
                    if ($flagAtSign) {
                        $buffer = "%40".$buffer;
                    }
                    # Set the @ flag.
                    $flagAtSign = true;
                    # For each codePoint in buffer:
                    $bPos = 0;
                    $bEof = strlen($buffer);
                    while ($bPos < $bEof) {
                        $codePoint = UTF8::get($buffer, $bPos, $bPosNext);
                        // advance now, so that every path through the loop body moves on to the next character
                        $bPos = $bPosNext;
                        # If codePoint is U+003A (:) and passwordTokenSeenFlag is unset, then set passwordTokenSeenFlag and continue.
                        if ($codePoint==":" && !$flagPasswordTokenSeen) {
                            $flagPasswordTokenSeen = true;
                            // "continue" in the specification means going to the next character
                            continue;
                        }
                        # Let encodedCodePoints be the result of running UTF-8 percent encode codePoint using the userinfo percent-encode set.
                        $encodedCodePoints = $this->percentEncode($codePoint, self::PE_USERINFO);
                        if ($flagPasswordTokenSeen) {
                            # If passwordTokenSeenFlag is set, then append encodedCodePoints to url's password.
                            $url->password .= $encodedCodePoints;
                        } else {
                            # Otherwise, append encodedCodePoints to url's username.
                            $url->username .= $encodedCodePoints;
                        }
                    }
                    # Set buffer to the empty string.
                    $buffer = "";
                } elseif (
                    # Otherwise, if one of the following is true
                    in_array($c, ["/", "?", "#", ""], true) || # c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
                    ($this->isSpecial($url) && $c=="\\") # url is special and c is U+005C (\)
                ) {
                    # then:
                    # If @ flag is set and buffer is the empty string, validation error, return failure.
                    if ($flagAtSign && $buffer == "") {
                        $url->err[] = [$pointer, $pos, self::ERR_AUTHORITY_WITHOUT_HOST];
                        $url->failure = true;
                        return $url;
                    }
                    # Decrease pointer by the number of code points in buffer plus one,
                    // DEVIATION: as with decreasing the pointer by one to reprocess characters, we'll ignore the "plus one" here for self-consistency
                    // we first count the number of characters in the buffer
                    $count = UTF8::len($buffer);
                    // then decrease the advisory character pointer by that amount
                    $pointer -= $count;
                    // then seek back the same number of characters
                    $pos = UTF8::seek($input, -$count, $pos);
                    // and finally consume that character to get the position of the next character to continue the loop correctly
                    $c = UTF8::get($input, $pos, $posNext);
                    # set buffer to the empty string, and set state to host state.
                    $buffer = "";
                    $state = self::ST_HOST;
                    // and reprocess the first character in the erstwhile buffer
                    goto processChar;
                } else {
                    # Otherwise, append c to buffer.
                    $buffer .= $c;
                }
                break;
            // invalid or unimplemented state
            default:
                // FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead
                return $url;
        }
        # If after a run pointer points to the EOF code point, go to the next step.
        # Otherwise, increase pointer by one and continue with the state machine.
        // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
        // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
        $pos = $posNext;
        $pointer++;
        // NOTE(review): termination presumably relies on UTF8::get() reporting a next offset beyond $eof once the EOF code point has been read; confirm against the UTF8 class
    } while ($pos <= $eof);
    // the state machine has consumed the EOF code point: hand the finished record back to the caller
    return $url;
}
/**
 * Determines the output encoding which corresponds to an encoding override
 *
 * This is currently a placeholder: the label supplied is handed back untouched.
 *
 * @param string $encoding The encoding label supplied as the override
 * @return string The encoding to use for output
 */
protected function getOutputEncoding(string $encoding): string {
    // FIXME: stub
    $outputEncoding = $encoding;
    return $outputEncoding;
}
/**
 * Tests whether a character belongs to one of the character classes identified by the CHR_* constants
 *
 * @param string $c The character to test. Anything other than a single byte never matches, which covers both the empty string the parser uses for EOF and any multi-byte UTF-8 character
 * @param int $chrClass One of the self::CHR_* identifiers
 * @return bool Whether the character is a member of the class
 * @throws \Exception If $chrClass is not a known identifier
 */
protected function isChr(string $c, int $chrClass) {
    // every class consists solely of ASCII characters, so only single-byte input can match;
    // zero is never consulted for other input because each test below checks $single first
    $single = (strlen($c) === 1);
    $code = $single ? ord($c) : 0;
    switch ($chrClass) {
        case self::CHR_C0:
            return ($single && $code <= 0x1F);
        case self::CHR_C0_OR_SPACE:
            return ($single && $code <= 0x20);
        case self::CHR_ASCII_ALPHA:
            return ($single && (
                ($code >= 0x41 && $code <= 0x5A) || // uppercase alphabetic
                ($code >= 0x61 && $code <= 0x7A)    // lowercase alphabetic
            ));
        case self::CHR_ASCII_ALPHANUM:
            return ($single && (
                ($code >= 0x30 && $code <= 0x39) || // digits
                ($code >= 0x41 && $code <= 0x5A) || // uppercase alphabetic
                ($code >= 0x61 && $code <= 0x7A)    // lowercase alphabetic
            ));
        default:
            throw new \Exception;
    }
}
/**
 * Copies the named properties from one URL record to another
 *
 * Array-valued properties such as the path are copied by value, so the two records do not share them afterwards.
 *
 * @param URI $to The record to copy to
 * @param URI $from The record to copy from
 * @param array $properties The names of the properties to copy
 * @return bool Always true
 */
protected function map(URI $to, URI $from, array $properties): bool {
    foreach ($properties as $prop) {
        // the property name is variable: "$from->prop" would read a property literally named "prop";
        // a property which was never set on the source record is copied as null
        $to->$prop = $from->$prop ?? null;
    }
    return true;
}
/**
 * Tests whether a scheme is one of the "special" schemes listed in SCHEME_SPECIAL
 *
 * @param URI|string|null $test Either a URL record, in which case its scheme is tested, or a scheme name
 * @return bool Whether the scheme is special; a record without a scheme is never special
 */
protected function isSpecial($test): bool {
    $scheme = ($test instanceof URI) ? $test->scheme : $test;
    // membership is by key, since the default port of a special scheme may be null
    return is_string($scheme) && array_key_exists($scheme, self::SCHEME_SPECIAL);
}
/**
 * Percent-encodes a byte string according to one of the percent-encode sets
 *
 * Bytes below 0x20 or above 0x7E are always encoded; other bytes are encoded only if they
 * are listed in the chosen set. Hexadecimal digits in the output are uppercase.
 *
 * @param string $bytes The bytes to encode, e.g. a single UTF-8 character
 * @param int $set One of the self::PE_* identifiers
 * @return string The encoded string; empty input yields an empty string
 * @throws \Exception If $set is not a known identifier
 */
protected function percentEncode(string $bytes, int $set): string {
    if (!isset(self::PE_SET[$set])) {
        throw new \Exception;
    }
    $buffer = "";
    // strings cannot be iterated with foreach, so step through the bytes by offset
    for ($a = 0, $length = strlen($bytes); $a < $length; $a++) {
        $b = $bytes[$a];
        if ($b < "\x20" || $b > "\x7E" || in_array($b, self::PE_SET[$set], true)) {
            $buffer .= "%".strtoupper(bin2hex($b));
        } else {
            $buffer .= $b;
        }
    }
    return $buffer;
}
}

View file

@ -1,21 +0,0 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\URI;
/**
 * A URL which is parsed upon construction, optionally relative to a base URL
 */
class URL extends URI {
    /**
     * Parses the supplied URL and stores the resulting components in this object
     *
     * @param string $url The URL to parse
     * @param string|null $base A base URL against which $url is resolved if it is relative
     * @throws \TypeError If either the base URL or the URL itself cannot be parsed
     */
    public function __construct(string $url, string $base = null) {
        $parsedBase = null;
        if (!is_null($base)) {
            $parsedBase = $this->basicUrlParser($base);
            // the parser signals failure by setting the record's failure flag rather than by returning null
            if (is_null($parsedBase) || !empty($parsedBase->failure)) {
                throw new \TypeError;
            }
        }
        $parsedUrl = $this->basicUrlParser($url, $parsedBase);
        if (is_null($parsedUrl) || !empty($parsedUrl->failure)) {
            throw new \TypeError;
        }
        // adopt the parsed record's state as our own instead of dumping it to output
        foreach (get_object_vars($parsedUrl) as $name => $value) {
            $this->$name = $value;
        }
    }
}
}

View file

@ -4,7 +4,7 @@
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\URI;
namespace MensBeam\UTF8;
abstract class UTF8 {
public static $replacementChar = "\u{FFFD}";