Split off UTF-8 tools from URL parser
This commit is contained in:
parent
30162e8525
commit
aa0d6ce20e
7 changed files with 29 additions and 920 deletions
1
AUTHORS
Normal file
1
AUTHORS
Normal file
|
@ -0,0 +1 @@
|
|||
J. King https://jkingweb.ca/
|
22
LICENSE
Normal file
22
LICENSE
Normal file
|
@ -0,0 +1,22 @@
|
|||
Copyright (c) 2018 J. King et al.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
"name": "jkingweb/uniform",
|
||||
"name": "mensbeam/utf8",
|
||||
"type": "library",
|
||||
"description": "A collection of URL tools compatible with the WHATWG URL standard",
|
||||
"keywords": ["url","uri","whatwg"],
|
||||
"description": "A set of tools for working with UTF-8 strings without mbstring or intl",
|
||||
"keywords": ["utf-8", "utf8"],
|
||||
"license": "MIT",
|
||||
"authors": [
|
||||
{
|
||||
|
@ -17,7 +17,7 @@
|
|||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"JKingWeb\\URI\\": "lib/"
|
||||
"MensBeam\\UTF8\\": "lib/"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
2
composer.lock
generated
2
composer.lock
generated
|
@ -4,7 +4,7 @@
|
|||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "9c891512410ea881b9cf5ae2a3873fcb",
|
||||
"content-hash": "8394b8ab5a816511b1fad1a40758b186",
|
||||
"packages": [],
|
||||
"packages-dev": [],
|
||||
"aliases": [],
|
||||
|
|
893
lib/URI.php
893
lib/URI.php
|
@ -1,893 +0,0 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2018 J. King et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace JKingWeb\URI;
|
||||
|
||||
class URI {
|
||||
/** List of "special" schemes and their default port numbers */
|
||||
const SCHEME_SPECIAL = [
|
||||
'ftp' => 21,
|
||||
'file' => null,
|
||||
'gopher' => 70,
|
||||
'http' => 80,
|
||||
'https' => 443,
|
||||
'ws' => 80,
|
||||
'wss' => 443,
|
||||
];
|
||||
/** Default port numbers for all schemes registered with IANA
|
||||
*
|
||||
* If a scheme is in the IANA registry but not listed here, then it likely did not exist when this list was last compiled
|
||||
*/
|
||||
const SCHEME_DEFAULT_PORTS = [
|
||||
'aaa' => 3868,
|
||||
'aaas' => 5658,
|
||||
'about' => null,
|
||||
'acap' => 674,
|
||||
'acct' => null,
|
||||
'acr' => null,
|
||||
'adiumxtra' => null,
|
||||
'afp' => 548,
|
||||
'afs' => null,
|
||||
'aim' => null,
|
||||
'appdata' => null,
|
||||
'apt' => null,
|
||||
'attachment' => null,
|
||||
'aw' => null,
|
||||
'barion' => null,
|
||||
'beshare' => null, // multiple ports
|
||||
'bitcoin' => null,
|
||||
'blob' => null,
|
||||
'bolo' => null,
|
||||
'browserext' => null,
|
||||
'callto' => null,
|
||||
'cap' => 1026,
|
||||
'chrome' => null,
|
||||
'chrome-extension' => null,
|
||||
'cid' => null,
|
||||
'coap' => 5683,
|
||||
'coap+tcp' => 5683,
|
||||
'coap+ws' => null, // it's unclear which port applies here: WebSocket would imply 80, but the specification is ambiguous
|
||||
'coaps' => 5684,
|
||||
'coaps+tcp' => 5684,
|
||||
'coaps+ws' => null, // it's unclear which port applies here: WebSocket would imply 443, but the specification is ambiguous
|
||||
'com-eventbrite-attendee' => null,
|
||||
'content' => null,
|
||||
'conti' => null,
|
||||
'crid' => null,
|
||||
'cvs' => null,
|
||||
'data' => null,
|
||||
'dav' => null,
|
||||
'diaspora' => null,
|
||||
'dict' => 2628,
|
||||
'dis' => null,
|
||||
'dlna-playcontainer' => null,
|
||||
'dlna-playsingle' => null,
|
||||
'dns' => 53,
|
||||
'dntp' => null,
|
||||
'dtn' => null,
|
||||
'dvb' => null,
|
||||
'ed2k' => null,
|
||||
'example' => null, // not an actual scheme
|
||||
'facetime' => null,
|
||||
'fax' => null,
|
||||
'feed' => null,
|
||||
'feedready' => null,
|
||||
'file' => null,
|
||||
'filesystem' => null,
|
||||
'finger' => 79,
|
||||
'fish' => 22, // an application of SSH
|
||||
'ftp' => 21,
|
||||
'geo' => null,
|
||||
'gg' => null,
|
||||
'git' => 9418, // per https://git-scm.com/book/en/v2/Git-on-the-Server-The-Protocols#_the_git_protocol
|
||||
'gizmoproject' => null,
|
||||
'go' => 1096,
|
||||
'gopher' => 70,
|
||||
'graph' => null,
|
||||
'graphdata' => null, // not in the IANA list, but included as part of the registration for 'graph'
|
||||
'gtalk' => null,
|
||||
'h323' => null, // several ports are defined in the IANA port registry---unclear which (if any) is implied by the scheme
|
||||
'ham' => null,
|
||||
'hcp' => null,
|
||||
'http' => 80,
|
||||
'https' => 443,
|
||||
'hxxp' => null, // it would be inappropriate to modify these URLs
|
||||
'hxxps' => null, // it would be inappropriate to modify these URLs
|
||||
'hydrazone' => null,
|
||||
'iax' => null,
|
||||
'icap' => 1344,
|
||||
'icon' => null,
|
||||
'im' => null,
|
||||
'imap' => 143,
|
||||
'info' => null,
|
||||
'iotdisco' => null,
|
||||
'ipn' => null,
|
||||
'ipp' => 631,
|
||||
'ipps' => 631,
|
||||
'irc' => 6667,
|
||||
'irc6' => 6667,
|
||||
'ircs' => 994,
|
||||
'iris' => null,
|
||||
'iris.beep' => null,
|
||||
'iris.lwz' => null,
|
||||
'iris.xpc' => null,
|
||||
'iris.xpcs' => null,
|
||||
'isostore' => null,
|
||||
'itms' => null,
|
||||
'jabber' => null,
|
||||
'jar' => null,
|
||||
'jms' => null,
|
||||
'keyparc' => null,
|
||||
'lastfm' => null,
|
||||
'ldap' => 389,
|
||||
'ldaps' => 636,
|
||||
'lvlt' => null,
|
||||
'magnet' => null,
|
||||
'mailserver' => null,
|
||||
'mailto' => null,
|
||||
'maps' => null,
|
||||
'market' => null,
|
||||
'message' => null,
|
||||
'microsoft.windows.camera' => null,
|
||||
'microsoft.windows.camera.multipicker' => null,
|
||||
'microsoft.windows.camera.picker' => null,
|
||||
'mid' => null,
|
||||
'mms' => null,
|
||||
'modem' => null,
|
||||
'mongodb' => null,
|
||||
'moz' => null,
|
||||
'ms-access' => null,
|
||||
'ms-browser-extension' => null,
|
||||
'ms-drive-to' => null,
|
||||
'ms-enrollment' => null,
|
||||
'ms-excel' => null,
|
||||
'ms-gamebarservices' => null,
|
||||
'ms-gamingoverlay' => null,
|
||||
'ms-getoffice' => null,
|
||||
'ms-help' => null,
|
||||
'ms-infopath' => null,
|
||||
'ms-inputapp' => null,
|
||||
'ms-lockscreencomponent-config' => null,
|
||||
'ms-media-stream-id' => null,
|
||||
'ms-mixedrealitycapture' => null,
|
||||
'ms-officeapp' => null,
|
||||
'ms-people' => null,
|
||||
'ms-project' => null,
|
||||
'ms-powerpoint' => null,
|
||||
'ms-publisher' => null,
|
||||
'ms-restoretabcompanion' => null,
|
||||
'ms-search-repair' => null,
|
||||
'ms-secondary-screen-controller' => null,
|
||||
'ms-secondary-screen-setup' => null,
|
||||
// what the hell, Microsoft? Seriously?
|
||||
'ms-settings' => null,
|
||||
'ms-settings-airplanemode' => null,
|
||||
'ms-settings-bluetooth' => null,
|
||||
'ms-settings-camera' => null,
|
||||
'ms-settings-cellular' => null,
|
||||
'ms-settings-cloudstorage' => null,
|
||||
'ms-settings-connectabledevices' => null,
|
||||
'ms-settings-displays-topology' => null,
|
||||
'ms-settings-emailandaccounts' => null,
|
||||
'ms-settings-language' => null,
|
||||
'ms-settings-location' => null,
|
||||
'ms-settings-lock' => null,
|
||||
'ms-settings-nfctransactions' => null,
|
||||
'ms-settings-notifications' => null,
|
||||
'ms-settings-power' => null,
|
||||
'ms-settings-privacy' => null,
|
||||
'ms-settings-proximity' => null,
|
||||
'ms-settings-screenrotation' => null,
|
||||
'ms-settings-wifi' => null,
|
||||
'ms-settings-workplace' => null,
|
||||
// You could have just defined one ms-app scheme, you know, never mind one ms-settings...
|
||||
'ms-spd' => null,
|
||||
'ms-sttoverlay' => null,
|
||||
'ms-transit-to' => null,
|
||||
'ms-useractivityset' => null,
|
||||
'ms-virtualtouchpad' => null,
|
||||
'ms-visio' => null,
|
||||
'ms-walk-to' => null,
|
||||
'ms-whiteboard' => null,
|
||||
'ms-whiteboard-cmd' => null,
|
||||
'ms-word' => null,
|
||||
'msnim' => null,
|
||||
'msrp' => null, // explicitly no default port
|
||||
'msrps' => null, // explicitly no default port
|
||||
'mtqp' => 1038,
|
||||
'mumble' => 64738,
|
||||
'mupdate' => null,
|
||||
'mvn' => null,
|
||||
'news' => 119,
|
||||
'nfs' => 2049,
|
||||
'ni' => null,
|
||||
'nih' => null,
|
||||
'nntp' => 119,
|
||||
'notes' => null,
|
||||
'ocf' => null,
|
||||
'oid' => null,
|
||||
'onenote' => null,
|
||||
'onenote-cmd' => null,
|
||||
'opaquelocktoken' => null,
|
||||
'pack' => null,
|
||||
'palm' => null,
|
||||
'paparazzi' => null,
|
||||
'pkcs11' => null,
|
||||
'platform' => null,
|
||||
'pop' => 110,
|
||||
'pres' => null,
|
||||
'prospero' => 1525,
|
||||
'proxy' => null,
|
||||
'pwid' => null,
|
||||
'psyc' => 4404,
|
||||
'qb' => null,
|
||||
'query' => null,
|
||||
'redis' => 6379,
|
||||
'rediss' => 6379,
|
||||
'reload' => 6084,
|
||||
'res' => null,
|
||||
'resource' => null,
|
||||
'rmi' => 1099,
|
||||
'rsync' => 873,
|
||||
'rtmfp' => 1935,
|
||||
'rtmp' => 1935,
|
||||
'rtsp' => 554,
|
||||
'rtsps' => 322,
|
||||
'rtspu' => 554,
|
||||
'secondlife' => null,
|
||||
'service' => null,
|
||||
'session' => null,
|
||||
'sftp' => 22, // application of SSH
|
||||
'sgn' => null,
|
||||
'shttp' => 80,
|
||||
'sieve' => 4190,
|
||||
'sip' => 5060,
|
||||
'sips' => 5061,
|
||||
'skype' => null,
|
||||
'smb' => 445,
|
||||
'sms' => null,
|
||||
'smtp' => 25,
|
||||
'snews' => 563,
|
||||
'snmp' => 161,
|
||||
'soap.beep' => null, // explicit port bypasses SRV lookups
|
||||
'soap.beeps' => null, // explicit port bypasses SRV lookups
|
||||
'soldat' => null, // port required
|
||||
'spiffe' => null, // ports are not used
|
||||
'spotify' => null,
|
||||
'ssh' => 22,
|
||||
'steam' => null,
|
||||
'stun' => null,
|
||||
'stuns' => null,
|
||||
'submit' => 587, // SMTP submission
|
||||
'svn' => null,
|
||||
'tag' => null,
|
||||
'teamspeak' => 8767,
|
||||
'tel' => null,
|
||||
'teliaeid' => null,
|
||||
'telnet' => 23,
|
||||
'tftp' => 69,
|
||||
'things' => null,
|
||||
'thismessage' => null, // not an actual scheme
|
||||
'tip' => 3372,
|
||||
'tn3270' => 23,
|
||||
'tool' => null,
|
||||
'turn' => 3478,
|
||||
'turns' => 5349,
|
||||
'tv' => null,
|
||||
'udp' => null, // multiple independent uses
|
||||
'unreal' => 7777, // assumed based on Unreal Tournament
|
||||
'urn' => null,
|
||||
'ut2004' => 7777,
|
||||
'v-event' => null,
|
||||
'vemmi' => 575,
|
||||
'ventrilo' => 3784,
|
||||
'videotex' => 516,
|
||||
'vnc' => 5900,
|
||||
'view-source' => null,
|
||||
'wais' => 210,
|
||||
'webcal' => null, // unclear if port 80 or 443 should be assumed
|
||||
'wpid' => null, // alias of pwid
|
||||
'ws' => 80,
|
||||
'wss' => 443,
|
||||
'wtai' => null,
|
||||
'wyciwyg' => null,
|
||||
'xcon' => null, // not resolvable
|
||||
'xcon-userid' => null,
|
||||
'xfire' => null,
|
||||
'xmlrpc.beep' => null, // explicit port bypasses SRV lookups
|
||||
'xmlrpc.beeps' => null, // explicit port bypasses SRV lookups
|
||||
'xmpp' => null,
|
||||
'xri' => null, // unclear; historical
|
||||
'ymsgr' => null,
|
||||
'z39.50' => 210,
|
||||
'z39.50r' => 210,
|
||||
'z39.50s' => 210,
|
||||
|
||||
];
|
||||
/**
|
||||
* List of schemes which use locator syntax when they are actually names
|
||||
*
|
||||
* If a scheme has no documentation or examples at all, it is assumed to be among these schemes
|
||||
*/
|
||||
const SCHEME_NONSTANDARD = [
|
||||
"diaspora",
|
||||
"dvb",
|
||||
"ed2k",
|
||||
"facetime",
|
||||
"gizmoproject",
|
||||
"hcp",
|
||||
"hydrazone",
|
||||
"keyparc",
|
||||
"lastfm",
|
||||
"market",
|
||||
"mongodb", // more than one host can be specified, which is non-standard
|
||||
"moz", // no documentation
|
||||
"moz-icon", // not in the IANA registry; equivalent to 'icon'
|
||||
"ms-enrollment",
|
||||
"ms-gamebarservices", // no documentation
|
||||
"ms-gamingoverlay", // no documentation
|
||||
"ms-getoffice", // no documentation
|
||||
"ms-help",
|
||||
"ms-inputapp", // no documentation
|
||||
"ms-lockscreencomponent-config", // no documentation
|
||||
"ms-mixedrealitycapture",
|
||||
"ms-officeapp",
|
||||
"ms-restoretabcompanion",
|
||||
"ms-sttoverlay",
|
||||
"ms-useractivityset",
|
||||
"ms-whiteboard",
|
||||
"ms-whiteboard-cmd",
|
||||
"ms-windows-store", // not in the IANA registry; documentation shows it uses incorrect syntax
|
||||
"notes",
|
||||
"onenote-cmd",
|
||||
"pack",
|
||||
"psyc", // authority section is non-standard
|
||||
"qb", // no documentation
|
||||
"res",
|
||||
"resource",
|
||||
"teliaeid",
|
||||
"wtai",
|
||||
"wyciwyg",
|
||||
];
|
||||
|
||||
// character class identifiers
|
||||
const CHR_C0 = 1;
|
||||
const CHR_C0_OR_SPACE = 2;
|
||||
const CHR_ASCII_ALPHA = 3;
|
||||
const CHR_ASCII_ALPHANUM = 4;
|
||||
|
||||
// percent-encoding character set identifiers
|
||||
const PE_CONTROL = 1;
|
||||
const PE_FRAGMENT = 2;
|
||||
const PE_PATH = 3;
|
||||
const PE_USERINFO = 4;
|
||||
|
||||
const PE_SET = [
|
||||
self::PE_CONTROL => [],
|
||||
self::PE_FRAGMENT => [" ", '"', '`', "<", ">"],
|
||||
self::PE_PATH => [" ", '"', '`', "<", ">", "#", "?", "{", "}"],
|
||||
self::PE_USERINFO => [" ", '"', '`', "<", ">", "#", "?", "{", "}", "/", ":", ";", "=", "@", "[", "]", "\\"],
|
||||
];
|
||||
|
||||
// error condition identifiers
|
||||
const ERR_LEADING_OR_TRAILING_WS = 1;
|
||||
const ERR_EMBEDDED_NEWLINE_OR_TAB = 2;
|
||||
const ERR_INVALID_SCHEME_CHAR = 3;
|
||||
const ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH = 4;
|
||||
const ERR_RELATIVE_URL = 5;
|
||||
const ERR_SCHEME_EXPECTING_SLASH = 6;
|
||||
const ERR_BACKSLASH_FORBIDDEN = 7;
|
||||
const ERR_UNEXPECTED_SLASH = 8;
|
||||
const ERR_UNEXPECTED_AT = 9;
|
||||
const ERR_AUTHORITY_WITHOUT_HOST = 10;
|
||||
|
||||
// parser state identifiers
|
||||
const ST_SCHEME_START = 1;
|
||||
const ST_SCHEME = 2;
|
||||
const ST_NO_SCHEME = 3;
|
||||
const ST_FILE = 4;
|
||||
const ST_SPECIAL_RELATIVE_OR_AUTHORITY = 5;
|
||||
const ST_SPECIAL_AUTHORITY_SLASHES = 6;
|
||||
const ST_PATH_OR_AUTHORITY = 7;
|
||||
const ST_CANNOT_BE_A_BASE_URL_PATH = 8;
|
||||
const ST_FRAGMENT = 9;
|
||||
const ST_RELATIVE = 10;
|
||||
const ST_SPECIAL_AUTHORITY_IGNORE_SLASHES = 11;
|
||||
const ST_AUTHORITY = 12;
|
||||
const ST_PATH = 13;
|
||||
const ST_RELATIVE_SLASH = 14;
|
||||
const ST_HOST = 15;
|
||||
const ST_HOSTNAME = 16;
|
||||
const ST_FILE_HOST = 17;
|
||||
|
||||
public static $confUseAllSchemePorts = false;
|
||||
|
||||
public $scheme = null;
|
||||
public $path = [];
|
||||
|
||||
public $cannotBeBaseUrl = false;
|
||||
public $err = [];
|
||||
|
||||
protected function basicUrlParser(string $input, self $base = null, string $encodingOverride = "", self $url = null, int $stateOverride = 0) {
|
||||
$pointer = -1;
|
||||
$pos = -1;
|
||||
// start by getting the byte length of the input
|
||||
// this will later function as a signal for end of input
|
||||
// initially it also functions to show whether characters
|
||||
// have been removed by stripping operations
|
||||
$eof = strlen($input);
|
||||
// begin algorithm
|
||||
# If url is not given:
|
||||
if (!$url) {
|
||||
# Set url to a new URL.
|
||||
$url = new self;
|
||||
# Remove any leading and trailing C0 or space from input.
|
||||
$input = trim($input, " \u{0}\u{1}\u{2}\u{3}\u{4}\u{5}\u{6}\u{7}\u{8}\u{9}\u{A}\u{B}\u{C}\u{D}\u{E}\u{F}\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\u{18}\u{19}\u{1A}\u{1B}\u{1C}\u{1D}\u{1E}\u{1F}");
|
||||
# If input contains any leading or trailing C0 control or space, validation error.
|
||||
if (strlen($input) != $oef) {
|
||||
$url->err[] = [$pointer, $pos, self::ERR_LEADING_OR_TRAILING_WS];
|
||||
$eof = strlen($input);
|
||||
}
|
||||
}
|
||||
# Remove all ASCII tab or newline from input.
|
||||
$input = str_replace(["\r", "\n", "\t",], "", $input);
|
||||
# If input contains any ASCII tab or newline, validation error.
|
||||
if (strlen($input) != $oef) {
|
||||
$url->err[] = [$pointer, $pos, self::ERR_EMBEDDED_NEWLINE_OR_TAB];
|
||||
$eof = strlen($input);
|
||||
}
|
||||
# Let state be state override if given, or scheme start state otherwise.
|
||||
$state = $stateOverride ?? self::ST_SCHEME_START;
|
||||
# Let encoding be UTF-8. If encoding override is given, set encoding to the result of getting an output encoding from encoding override.
|
||||
$encoding = ($encodingOverride=="") ? "utf-8" : $this->getOutputEncoding($encodingOverride);
|
||||
# Let buffer be the empty string.
|
||||
$buffer = "";
|
||||
# Let the @ flag, [] flag, and passwordTokenSeenFlag be unset.
|
||||
$flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false;
|
||||
# Let pointer be a pointer to first code point in input.
|
||||
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer
|
||||
$pos = 0;
|
||||
# Keep running the following state machine by switching on state.
|
||||
# If after a run pointer points to the EOF code point, go to the next step.
|
||||
# Otherwise, increase pointer by one and continue with the state machine.
|
||||
// Note: the state machine is designed to run once even with an empty string
|
||||
do {
|
||||
# Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to.
|
||||
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
|
||||
// $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
|
||||
$c = UTF8::get($input, $pos, $posNext);
|
||||
// when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we
|
||||
// accomplish this by going back to this label, which skips the increment at the end of each iteration
|
||||
processChar:
|
||||
// switch on state
|
||||
switch ($state) {
|
||||
# scheme start state
|
||||
case self::ST_SCHEME_START:
|
||||
if ($this->isChr($c, self::CHR_ASCII_ALPHA)) {
|
||||
# If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
|
||||
$buffer .= strtolower($c);
|
||||
$state = self::ST_SCHEME;
|
||||
} elseif (!$stateOverride) {
|
||||
# Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one.
|
||||
$state = self::ST_NO_SCHEME;
|
||||
goto processChar;
|
||||
} else {
|
||||
# Otherwise, validation error, return failure.
|
||||
# NOTE: This indication of failure is used exclusively by Location object’s protocol attribute.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR];
|
||||
$url->failure = true;
|
||||
return $url;
|
||||
}
|
||||
break;
|
||||
# scheme state
|
||||
case self::ST_SCHEME:
|
||||
if ($this->isChr($c, self::CHR_ASCII_ALPHANUM) || strpos("+-.", $c) !== false) {
|
||||
# If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
|
||||
$buffer .= strtolower($c);
|
||||
} elseif ($c==":") {
|
||||
# Otherwise, if c is U+003A (:), then:
|
||||
# If state override is given, then:
|
||||
if ($stateOverride &&
|
||||
# If url’s scheme is a special scheme and buffer is not a special scheme, then return.
|
||||
($this->isSpecial($url) && !$this->isSpecial($buffer)) ||
|
||||
# If url’s scheme is not a special scheme and buffer is a special scheme, then return.
|
||||
(!$this->isSpecial($url) && $this->isSpecial($buffer)) ||
|
||||
# If url includes credentials or has a non-null port, and buffer is "file", then return.
|
||||
($buffer=="file" || !is_null($url->port) || strlen((string) $url->username) || strlen((string) $url->password)) ||
|
||||
# If url’s scheme is "file" and its host is an empty host or null, then return.
|
||||
($url->scheme=="file" && !strlen((string) $url->host))
|
||||
) {
|
||||
return $url;
|
||||
}
|
||||
# Set url’s scheme to buffer.
|
||||
$url->scheme = $buffer;
|
||||
# If state override is given, then:
|
||||
if ($stateOverride) {
|
||||
# If url’s port is url’s scheme’s default port, then set url’s port to null.
|
||||
// OPTIONAL DEVIATION: we optionally allow any registered scheme's port to be defaulted
|
||||
$portList = (self::$confUseAllSchemePorts ? self::SCHEME_DEFAULT_PORTS : self::SCHEME_SPECIAL);
|
||||
$url->port = (isset($portList[$url->scheme]) && $url->port==$portList[$url->scheme]) ? null : $url->port;
|
||||
# Return.
|
||||
return $url;
|
||||
}
|
||||
# Set buffer to the empty string.
|
||||
$buffer = "";
|
||||
if ($url->scheme=="file") {
|
||||
# If url’s scheme is "file", then:
|
||||
if (substr($input, $posNext, 2) !== "//") {
|
||||
# If remaining does not start with "//", validation error.
|
||||
$url->err[] = [$pointer + 1, $posNext, self::ERR_FILE_SCHEME_EXPECTING_DOUBLE_SLASH];
|
||||
}
|
||||
# Set state to file state.
|
||||
$state = self::ST_FILE;
|
||||
} elseif ($base && $base->scheme===$url->scheme && $this->isSpecial($url)) {
|
||||
# Otherwise, if url is special, base is non-null, and base’s scheme is equal to url’s scheme, set state to special relative or authority state.
|
||||
# NOTE: This means that base’s cannot-be-a-base-URL flag is unset.
|
||||
$state = self::ST_SPECIAL_RELATIVE_OR_AUTHORITY;
|
||||
} elseif ($this->isSpecial($url)) {
|
||||
# Otherwise, if url is special, set state to special authority slashes state.
|
||||
$state = self::ST_SPECIAL_AUTHORITY_SLASHES;
|
||||
} elseif ($input[$posNext]=="/") {
|
||||
# Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by one.
|
||||
$state = self::ST_PATH_OR_AUTHORITY;
|
||||
$pos = $posNext;
|
||||
$pointer++;
|
||||
} else {
|
||||
# Otherwise, set url’s cannot-be-a-base-URL flag, append an empty string to url’s path, and set state to cannot-be-a-base-URL path state.
|
||||
$url->cannotBeBaseUrl = true;
|
||||
$url->path[] = "";
|
||||
$state = self::ST_CANNOT_BE_A_BASE_URL_PATH;
|
||||
}
|
||||
} elseif (!$stateOverride) {
|
||||
# Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
|
||||
$buffer = "";
|
||||
$state = self::ST_NO_SCHEME;
|
||||
$pos = 0;
|
||||
$pointer = 0;
|
||||
goto processChar;
|
||||
} else {
|
||||
# Otherwise, validation error, return failure.
|
||||
# NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_INVALID_SCHEME_CHAR];
|
||||
$url->failure = true;
|
||||
return $url;
|
||||
}
|
||||
break;
|
||||
# no scheme state
|
||||
case self::ST_NO_SCHEME:
|
||||
if (!$base || ($base->cannotBeBaseUrl && $c != "#")) {
|
||||
# If base is null, or base’s cannot-be-a-base-URL flag is set and c is not U+0023 (#), validation error, return failure.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_RELATIVE_URL];
|
||||
$url->failure = true;
|
||||
return $url;
|
||||
} elseif ($base->cannotBeBaseUrl && $c=="#") {
|
||||
# Otherwise, if base’s cannot-be-a-base-URL flag is set and c is U+0023 (#)
|
||||
$this->map($url, $base, [
|
||||
# set url’s scheme to base’s scheme,
|
||||
"scheme",
|
||||
# url’s path to a copy of base’s path,
|
||||
"path",
|
||||
# url’s query to base’s query,
|
||||
"query",
|
||||
]);
|
||||
# url’s fragment to the empty string,
|
||||
$url->fragment = "";
|
||||
# set url’s cannot-be-a-base-URL flag,
|
||||
$url->cannotBeBaseUrl = true;
|
||||
# and set state to fragment state.
|
||||
$state = self::ST_FRAGMENT;
|
||||
} elseif ($base->scheme != "file") {
|
||||
# Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by one.
|
||||
$state = self::ST_RELATIVE;
|
||||
goto processChar;
|
||||
} else {
|
||||
# Otherwise, set state to file state and decrease pointer by one.
|
||||
$state = self::ST_FILE;
|
||||
goto processChar;
|
||||
}
|
||||
break;
|
||||
# special relative or authority state
|
||||
case self::ST_SPECIAL_RELATIVE_OR_AUTHORITY:
|
||||
if ($c=="/" && $input[$posNext]=="/") {
|
||||
# If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one.
|
||||
$state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
|
||||
} else {
|
||||
# Otherwise, validation error, set state to relative state and decrease pointer by one.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH];
|
||||
$state = self::ST_RELATIVE;
|
||||
goto processChar;
|
||||
}
|
||||
break;
|
||||
# path or authority state
|
||||
case self::ST_PATH_OR_AUTHORITY:
|
||||
if ($c=="/") {
|
||||
# If c is U+002F (/), then set state to authority state.
|
||||
$state = self::ST_AUTHORITY;
|
||||
} else {
|
||||
# Otherwise, set state to path state, and decrease pointer by one.
|
||||
$state = self::ST_PATH;
|
||||
goto processChar;
|
||||
}
|
||||
break;
|
||||
# relative state
|
||||
case self::ST_RELATIVE:
|
||||
# Set url’s scheme to base’s scheme, and then, switching on c:
|
||||
$url->scheme = $base->scheme;
|
||||
switch ($c) {
|
||||
case "": # The EOF code point
|
||||
$this->map($url, $base, [
|
||||
# Set url’s username to base’s username,
|
||||
"username",
|
||||
# url’s password to base’s password,
|
||||
"password",
|
||||
# url’s host to base’s host,
|
||||
"host",
|
||||
# url’s port to base’s port,
|
||||
"port",
|
||||
# url’s path to a copy of base’s path,
|
||||
"path",
|
||||
# and url’s query to base’s query.
|
||||
"query",
|
||||
]);
|
||||
break;
|
||||
case "/":
|
||||
# Set state to relative slash state.
|
||||
$state = self::ST_RELATIVE_SLASH;
|
||||
break;
|
||||
case "?":
|
||||
$this->map($url, $base, [
|
||||
# Set url’s username to base’s username,
|
||||
"username",
|
||||
# url’s password to base’s password,
|
||||
"password",
|
||||
# url’s host to base’s host,
|
||||
"host",
|
||||
# url’s port to base’s port,
|
||||
"port",
|
||||
# url’s path to a copy of base’s path,
|
||||
"path",
|
||||
]);
|
||||
# url’s query to the empty string,
|
||||
$url->query = "";
|
||||
# and state to query state.
|
||||
$state = self::ST_QUERY;
|
||||
break;
|
||||
case "#":
|
||||
$this->map($url, $base, [
|
||||
# Set url’s username to base’s username,
|
||||
"username",
|
||||
# url’s password to base’s password,
|
||||
"password",
|
||||
# url’s host to base’s host,
|
||||
"host",
|
||||
# url’s port to base’s port,
|
||||
"port",
|
||||
# url’s path to a copy of base’s path,
|
||||
"path",
|
||||
# url’s query to base’s query,
|
||||
"query",
|
||||
]);
|
||||
# url’s fragment to the empty string,
|
||||
$url->fragment = "";
|
||||
# and state to fragment state.
|
||||
$state = self::ST_FRAGMENT;
|
||||
break;
|
||||
default:
|
||||
if ($this->isSpecial($url) && $c = "\\") {
|
||||
# If url is special and c is U+005C (\), validation error, set state to relative slash state.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN];
|
||||
$state = self::ST_RELATIVE_SLASH;
|
||||
} else {
|
||||
# Otherwise, run these steps:
|
||||
$this->map($url, $base, [
|
||||
# Set url’s username to base’s username,
|
||||
"username",
|
||||
# url’s password to base’s password,
|
||||
"password",
|
||||
# url’s host to base’s host,
|
||||
"host",
|
||||
# url’s port to base’s port,
|
||||
"port",
|
||||
# url’s path to a copy of base’s path,
|
||||
"path",
|
||||
]);
|
||||
# and then remove url’s path’s last item, if any.
|
||||
array_pop($url->path);
|
||||
# Set state to path state, and decrease pointer by one.
|
||||
$state = self::ST_PATH;
|
||||
goto processChar;
|
||||
}
|
||||
}
|
||||
break;
|
||||
# relative slash state
|
||||
case self::ST_RELATIVE_SLASH:
|
||||
if ($this->isSpecial($url) && ($c=="/" || $c=="\\")) {
|
||||
# If url is special and c is U+002F (/) or U+005C (\), then:
|
||||
# If c is U+005C (\), validation error.
|
||||
if ($c=="\\") {
|
||||
$url->err[] = [$pointer, $pos, self::ERR_BACKSLASH_FORBIDDEN];
|
||||
}
|
||||
# Set state to special authority ignore slashes state.
|
||||
$state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
|
||||
} elseif ($c="/") {
|
||||
# Otherwise, if c is U+002F (/), then set state to authority state.
|
||||
$state = self::ST_AUTHORITY;
|
||||
} else {
|
||||
# Otherwise,
|
||||
$this->map($url, $base, [
|
||||
# set url’s username to base’s username,
|
||||
"username",
|
||||
# url’s password to base’s password,
|
||||
"password",
|
||||
# url’s host to base’s host,
|
||||
"host",
|
||||
# url’s port to base’s port,
|
||||
"port",
|
||||
]);
|
||||
# state to path state, and then, decrease pointer by one.
|
||||
$state = self::ST_PATH;
|
||||
goto processChar;
|
||||
}
|
||||
break;
|
||||
# special authority slashes state
|
||||
case self::ST_SPECIAL_AUTHORITY_SLASHES:
|
||||
if ($c=="/" && substr($input, $posNext, 1)=="/") {
|
||||
# If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by one.
|
||||
$state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
|
||||
// this has the effect of increasing the pointer by one
|
||||
$pos = $posNext;
|
||||
$c = UTF8::get($input, $pos, $posNext);
|
||||
} else {
|
||||
# Otherwise, validation error, set state to special authority ignore slashes state, and decrease pointer by one.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_SCHEME_EXPECTING_SLASH];
|
||||
$state = self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES;
|
||||
goto processChar;
|
||||
}
|
||||
break;
|
||||
# special authority ignore slashes state
|
||||
case self::ST_SPECIAL_AUTHORITY_IGNORE_SLASHES:
|
||||
if ($c != "/" && $c != "\\") {
|
||||
# If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by one.
|
||||
$state = self::ST_AUTHORITY;
|
||||
goto processChar;
|
||||
} else {
|
||||
# Otherwise, validation error.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_SLASH];
|
||||
}
|
||||
break;
|
||||
# authority state
|
||||
case self::ST_AUTHORITY:
|
||||
if($c=="@") {
|
||||
# If c is U+0040 (@), then:
|
||||
# Validation error.
|
||||
$url->err[] = [$pointer, $pos, self::ERR_UNEXPECTED_AT];
|
||||
# If the @ flag is set, prepend "%40" to buffer.
|
||||
if ($flagAtSign) {
|
||||
$buffer = "%40".$buffer;
|
||||
}
|
||||
# Set the @ flag.
|
||||
$flagAtSign = true;
|
||||
# For each codePoint in buffer:
|
||||
$bPos = 0;
|
||||
$bEof = strlen($buffer);
|
||||
while ($bPos < $bEof) {
|
||||
$codePoint = UTF8::get($buffer, $bPos, $bPosNext);
|
||||
# If codePoint is U+003A (:) and passwordTokenSeenFlag is unset, then set passwordTokenSeenFlag and continue.
|
||||
if ($codePoint==":" && !$flagPasswordTokenSeen) {
|
||||
$flagPasswordTokenSeen = true;
|
||||
// "continue" in the specification means going to the next character
|
||||
$bPos = $bPosNext;
|
||||
continue;
|
||||
}
|
||||
# Let encodedCodePoints be the result of running UTF-8 percent encode codePoint using the userinfo percent-encode set.
|
||||
$encodedCodePoints = $this->percentEncode($codePoint, self::PE_USERINFO);
|
||||
if ($flagPasswordTokenSeen) {
|
||||
# If passwordTokenSeenFlag is set, then append encodedCodePoints to url’s password.
|
||||
$url->password .= $encodedCodePoints;
|
||||
} else {
|
||||
# Otherwise, append encodedCodePoints to url’s username.
|
||||
$url->username .= $encodedCodePoints;
|
||||
}
|
||||
}
|
||||
# Set buffer to the empty string.
|
||||
$buffer = "";
|
||||
} elseif (
|
||||
# Otherwise, if one of the following is true
|
||||
in_array($c, ["/", "?", "#", ""]) || # c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
|
||||
($this->isSpecial($url) && $c=="\\") # url is special and c is U+005C (\)
|
||||
) {
|
||||
# then:
|
||||
# If @ flag is set and buffer is the empty string, validation error, return failure.
|
||||
if ($flagAtSign && $buffer = "") {
|
||||
$url->err[] = [$pointer, $pos, self::ERR_AUTHORITY_WITHOUT_HOST];
|
||||
$url->failure = true;
|
||||
return $url;
|
||||
}
|
||||
# Decrease pointer by the number of code points in buffer plus one,
|
||||
// DEVIATION: as with decreasing the pointer by one to reprocess characters, we'll ignore the "plus one" here for self-consitency
|
||||
// we first count the number of characters in the buffer
|
||||
$c = UTF8::len($buffer);
|
||||
// then decrease the advisorty character pointer by that amount
|
||||
$pointer -= $c;
|
||||
// then seek back the same number of characters
|
||||
$pos = UTF8::seek($input, -$c, $pos);
|
||||
// and finally consume that character to get the position of the next character to continue the loop correctly
|
||||
$c = UTF8::get($input, $pos, $posNext);
|
||||
# set buffer to the empty string, and set state to host state.
|
||||
$buffer = "";
|
||||
$state = self::ST_HOST;
|
||||
// and reprocess the first character in the erstwhile buffer
|
||||
goto processChar;
|
||||
} else {
|
||||
# Otherwise, append c to buffer.
|
||||
$buffer .= $c;
|
||||
}
|
||||
break;
|
||||
// invalid or unimplemented state
|
||||
default:
|
||||
// FIXME: this should be an error, but until the whole state machine is implemented, we stop processing instead
|
||||
return $url;
|
||||
}
|
||||
# If after a run pointer points to the EOF code point, go to the next step.
|
||||
# Otherwise, increase pointer by one and continue with the state machine.
|
||||
// we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
|
||||
// $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
|
||||
$pos = $posNext;
|
||||
$pointer++;
|
||||
} while ($pos <= $eof);
|
||||
}
|
||||
|
||||
/**
 * Resolves the output encoding for a requested encoding label.
 *
 * Currently a placeholder: the supplied label is handed back unchanged.
 *
 * @param string $encoding The requested encoding label
 * @return string The same label that was passed in
 */
protected function getOutputEncoding(string $encoding): string {
    // FIXME: stub; no mapping is performed yet
    $outputEncoding = $encoding;
    return $outputEncoding;
}
|
||||
|
||||
/**
 * Tests whether a character belongs to one of the character classes
 * identified by the CHR_* class constants.
 *
 * The comparisons are byte-wise string comparisons, so the method is
 * meant to be given a single character (callers visible in this file
 * pass the result of UTF8::get()).
 *
 * @param string $c The character to test
 * @param int $chrClass One of the CHR_* constants selecting the class
 * @return bool Whether $c is a member of the requested class
 * @throws \Exception If $chrClass is not a recognised class
 */
protected function isChr(string $c, int $chrClass) {
    switch ($chrClass) {
        case self::CHR_C0:
            // NOTE(review): the empty string also compares <= U+001F, so an empty (EOF) input counts as C0 here; confirm callers expect that
            return ($c <= "\u{1F}");
        case self::CHR_C0_OR_SPACE:
            return ($c == " " || $c <= "\u{1F}");
        case self::CHR_ASCII_ALPHA:
            // the lower-case range must start at "a"; a range of "z".."z" matches only the letter z
            return (($c >= "A" && $c <= "Z") || ($c >= "a" && $c <= "z"));
        case self::CHR_ASCII_ALPHANUM:
            return (
                ($c >= "0" && $c <= "9") || // digits
                ($c >= "A" && $c <= "Z") || // uppercase alphabetic
                ($c >= "a" && $c <= "z")    // lowercase alphabetic
            );
        default:
            throw new \Exception;
    }
}
|
||||
|
||||
/**
 * Copies a list of named properties from one URI object to another.
 *
 * @param URI $to The object receiving the values
 * @param URI $from The object the values are read from
 * @param array $properties Names of the properties to copy
 * @return bool Always true
 */
protected function map(URI $to, URI $from, array $properties): bool {
    foreach ($properties as $prop) {
        // variable-property access on both sides: "$from->prop" would read a literal property named "prop"
        $to->$prop = $from->$prop;
    }
    return true;
}
|
||||
|
||||
/**
 * Reports whether a scheme is one of the special schemes.
 *
 * @param URI|string $test Either a URI object, whose scheme property is examined, or a scheme string
 * @return bool Whether the scheme is a key of self::SCHEME_SPECIAL
 */
protected function isSpecial($test): bool {
    // unwrap URI objects to their scheme; any other value is used as the scheme itself
    $scheme = ($test instanceof URI) ? $test->scheme : $test;
    return array_key_exists($scheme, self::SCHEME_SPECIAL);
}
|
||||
|
||||
/**
 * Percent-encodes a byte string according to one of the percent-encode sets.
 *
 * Each byte outside the printable ASCII range (0x20 to 0x7E), and each
 * byte listed in the selected set, is replaced by "%" followed by its
 * two-digit upper-case hexadecimal value; all other bytes are copied
 * through unchanged. An empty input yields an empty string.
 *
 * @param string $bytes The bytes to encode
 * @param int $set Index into self::PE_SET selecting the percent-encode set
 * @return string The encoded string
 * @throws \Exception If $set does not identify a known percent-encode set
 */
protected function percentEncode(string $bytes, int $set): string {
    if (!isset(self::PE_SET[$set])) {
        throw new \Exception;
    }
    $buffer = "";
    // strings are not iterable with foreach, so walk the input by byte offset
    $length = strlen($bytes);
    for ($i = 0; $i < $length; $i++) {
        $b = $bytes[$i];
        $ord = ord($b);
        // NOTE(review): loose in_array comparison kept because the contents of PE_SET are not visible here; presumably single-character strings
        if ($ord < 0x20 || $ord > 0x7E || in_array($b, self::PE_SET[$set])) {
            $buffer .= "%".strtoupper(bin2hex($b));
        } else {
            $buffer .= $b;
        }
    }
    return $buffer;
}
|
||||
}
|
21
lib/URL.php
21
lib/URL.php
|
@ -1,21 +0,0 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2018 J. King et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace JKingWeb\URI;
|
||||
|
||||
/**
 * URL object constructed from a URL string and an optional base URL string.
 *
 * Parsing is delegated to basicUrlParser() (presumably inherited from URI;
 * it is not defined in this class).
 */
class URL extends URI {
    /**
     * Parses the base URL (when given) and then the URL itself.
     *
     * The parsed result is currently only dumped with var_export().
     *
     * @param string $url The URL string to parse
     * @param string $base An optional base URL string
     * @throws \TypeError If a base is given and parsing it yields null
     */
    public function __construct(string $url, string $base = null) {
        $baseRecord = null;
        if ($base !== null) {
            $baseRecord = $this->basicUrlParser($base);
            if ($baseRecord === null) {
                throw new \TypeError;
            }
        }
        var_export($this->basicUrlParser($url, $baseRecord));
    }
}
|
|
@ -4,7 +4,7 @@
|
|||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace JKingWeb\URI;
|
||||
namespace MensBeam\UTF8;
|
||||
|
||||
abstract class UTF8 {
|
||||
public static $replacementChar = "\u{FFFD}";
|
||||
|
|
Loading…
Reference in a new issue