An implementation of the WHATWG Mime Sniffing specification
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

268 lines
12 KiB

<?php
/** @license MIT
* Copyright 2020 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Mime;
/** A structured representation of a MIME type, consistent with the WHATWG MIME Sniffing specification
*
* The class is not instantiated directly, but rather via its static parsing methods (`parse` and `parseBytes`).
* If parsing e.g. "TeXt/HTML; X=a; Y=B", the result will expose the following read-only
* properties:
*
* - `type`: `"text"`
* - `subtype`: `"html"`
* - `essence`: `"text/html"`
* - `params`: `['x' => "a", 'y' => "B"]`
*
* Instances may be cast to strings to yield a normalized representation
*
* @see https://mimesniff.spec.whatwg.org/
*
* @property-read string $type The major type of the MIME type i.e. the part before the slash
* @property-read string $subtype The subtype of the MIME type i.e. the part after the slash
* @property-read string $essence The full MIME type without parameters e.g. `"text/html"`
* @property-read array $params The associative array of parameters included with the type. Keys are lowercase; values are presented in their original case, unescaped
* @property-read bool $isArchive Whether the MIME type is an archive type
* @property-read bool $isAudioVideo Whether the MIME type is an audio or video type
* @property-read bool $isFont Whether the MIME type is a font type
* @property-read bool $isHtml Whether the MIME type is HTML
* @property-read bool $isImage Whether the MIME type is an image type
* @property-read bool $isJavascript Whether the MIME type is a JavaScript type
* @property-read bool $isJson Whether the MIME type is a JSON type
* @property-read bool $isScriptable Whether the MIME type is a type which can be scripted (namely via JavaScript)
* @property-read bool $isXml Whether the MIME type is an XML type
* @property-read bool $isZipBased Whether the MIME type is a ZIP-based type
*/
class MimeType {
    /** Splits a MIME type string into its type, subtype, and raw parameter string
     *
     * The pattern uses extended syntax, so the whitespace and comments inside it are not significant
     */
    protected const TYPE_PATTERN = <<<'PATTERN'
/^
[\t\r\n ]* # optional leading whitespace
([^\/]+) # type
\/ # type-subtype delimiter
([^;]+) # subtype (possibly with trailing whitespace)
(;.*)? # optional parameters, to be parsed separately
[\t\r\n ]* # optional trailing whitespace
$/sx
PATTERN;
    /** Matches a single parameter (name and optional raw value) within a parameter string
     *
     * Within a quoted value a backslash escapes whatever character follows it, so that
     * an escaped backslash immediately before the closing quote does not hide that quote
     */
    protected const PARAM_PATTERN = <<<'PATTERN'
/
[;\t\r\n ]* # parameter delimiter and leading whitespace, all optional
([^=;]*) # parameter name; may be empty
(?:= # parameter name-value delimiter
(
"(?:\\.|[^"])*(?:"|$)[^;]* # quoted parameter value and optional garbage
|[^;]* # unquoted parameter value (possibly with trailing whitespace)
)
)?
;? # optional trailing parameter delimiter
[\t\r\n ]* # optional trailing whitespace
/sx
PATTERN;
    /** Matches a string consisting solely of HTTP token code points; the D modifier stops "$" from tolerating a trailing newline */
    protected const TOKEN_PATTERN = '/^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$/sD';
    /** Matches a non-empty string consisting solely of HTTP quoted-string token code points */
    protected const BARE_VALUE_PATTERN = '/^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$/suD';
    /** Matches a quoted value (closing quote optional) and captures its still-escaped content
     *
     * The content is a run of escape pairs (a backslash followed by any valid code point) and
     * of valid code points other than the quote and the backslash, optionally followed by a
     * lone backslash at the very end of the input. The run is matched possessively so that an
     * escaped quote can never be re-interpreted as a closing quote by backtracking
     */
    protected const QUOTED_VALUE_PATTERN = '/^"((?:\\\\[\t\x{20}-\x{7E}\x{80}-\x{FF}]|[\t !\x{23}-\x{5B}\x{5D}-\x{7E}\x{80}-\x{FF}])*+\\\\?)(?:"|\z)/su';
    /** Matches a backslash escape pair, capturing the escaped character */
    protected const ESCAPE_PATTERN = '/\\\(.)/s';
    /** Maps byte values 0x80 through 0xFF to the UTF-8 encoding of the code point with the same number */
    protected const CHAR_MAP = [
        0x80 => "\u{80}", "\u{81}", "\u{82}", "\u{83}", "\u{84}", "\u{85}", "\u{86}", "\u{87}",
        "\u{88}", "\u{89}", "\u{8a}", "\u{8b}", "\u{8c}", "\u{8d}", "\u{8e}", "\u{8f}",
        "\u{90}", "\u{91}", "\u{92}", "\u{93}", "\u{94}", "\u{95}", "\u{96}", "\u{97}",
        "\u{98}", "\u{99}", "\u{9a}", "\u{9b}", "\u{9c}", "\u{9d}", "\u{9e}", "\u{9f}",
        "\u{a0}", "\u{a1}", "\u{a2}", "\u{a3}", "\u{a4}", "\u{a5}", "\u{a6}", "\u{a7}",
        "\u{a8}", "\u{a9}", "\u{aa}", "\u{ab}", "\u{ac}", "\u{ad}", "\u{ae}", "\u{af}",
        "\u{b0}", "\u{b1}", "\u{b2}", "\u{b3}", "\u{b4}", "\u{b5}", "\u{b6}", "\u{b7}",
        "\u{b8}", "\u{b9}", "\u{ba}", "\u{bb}", "\u{bc}", "\u{bd}", "\u{be}", "\u{bf}",
        "\u{c0}", "\u{c1}", "\u{c2}", "\u{c3}", "\u{c4}", "\u{c5}", "\u{c6}", "\u{c7}",
        "\u{c8}", "\u{c9}", "\u{ca}", "\u{cb}", "\u{cc}", "\u{cd}", "\u{ce}", "\u{cf}",
        "\u{d0}", "\u{d1}", "\u{d2}", "\u{d3}", "\u{d4}", "\u{d5}", "\u{d6}", "\u{d7}",
        "\u{d8}", "\u{d9}", "\u{da}", "\u{db}", "\u{dc}", "\u{dd}", "\u{de}", "\u{df}",
        "\u{e0}", "\u{e1}", "\u{e2}", "\u{e3}", "\u{e4}", "\u{e5}", "\u{e6}", "\u{e7}",
        "\u{e8}", "\u{e9}", "\u{ea}", "\u{eb}", "\u{ec}", "\u{ed}", "\u{ee}", "\u{ef}",
        "\u{f0}", "\u{f1}", "\u{f2}", "\u{f3}", "\u{f4}", "\u{f5}", "\u{f6}", "\u{f7}",
        "\u{f8}", "\u{f9}", "\u{fa}", "\u{fb}", "\u{fc}", "\u{fd}", "\u{fe}", "\u{ff}",
    ];

    protected $type = "";
    protected $subtype = "";
    protected $params = [];
    // The members below are never assigned: their values are computed by __get().
    // Declaring them keeps them inaccessible from outside the class, so reads go through __get()
    private $essence;
    private $isArchive;
    private $isAudioVideo;
    private $isFont;
    private $isHtml;
    private $isImage;
    private $isJavascript;
    private $isJson;
    private $isScriptable;
    private $isXml;
    private $isZipBased;

    /** Stores the supplied parts verbatim; no validation or normalization is performed here
     *
     * @param string $type The major type
     * @param string $subtype The subtype
     * @param array $params The parameters, as an associative array of names to values
     */
    protected function __construct(string $type = "", string $subtype = "", array $params = []) {
        $this->type = $type;
        $this->subtype = $subtype;
        $this->params = $params;
    }

    /** Provides read-only access to the stored parts and to the computed properties
     *
     * Names which are not recognized evaluate to null
     *
     * @param string $name The name of the property to read
     */
    public function __get(string $name) {
        switch ($name) {
            case "essence":
                return $this->essence();
            case "isArchive":
                return in_array($this->essence(), ["application/zip", "application/x-gzip", "application/x-rar-compressed"], true);
            case "isAudioVideo":
                return $this->type === "audio" || $this->type === "video" || $this->essence() === "application/ogg";
            case "isFont":
                return $this->type === "font" || preg_match("<^application/(?:font-(?:cff|off|sfnt|ttf|woff)|vnd\.ms-(?:fontobject|opentype))$>", $this->essence());
            case "isHtml":
                return $this->essence() === "text/html";
            case "isImage":
                return $this->type === "image";
            case "isJavascript":
                return (bool) preg_match("<^(?:(?:text|application)/(?:(?:x-)?(?:ecma|java)script)|text/(?:livescript|jscript|javascript1\.[0-5]))$>", $this->essence());
            case "isJson":
                return substr($this->subtype, -5) === "+json" || preg_match("<^(?:text|application)/json$>", $this->essence());
            case "isScriptable":
                return $this->essence() === "application/pdf" || $this->__get("isHtml") || $this->__get("isXml");
            case "isXml":
                return substr($this->subtype, -4) === "+xml" || preg_match("<^(?:text|application)/xml$>", $this->essence());
            case "isZipBased":
                return substr($this->subtype, -4) === "+zip" || $this->essence() === "application/zip";
            default:
                return $this->$name ?? null;
        }
    }

    /** Serializes the MIME type to its normalized string form
     *
     * Parameter values which are not valid HTTP tokens (including the empty string)
     * are emitted as quoted strings, with backslashes and quotes escaped
     */
    public function __toString(): string {
        $out = $this->essence();
        if (is_array($this->params)) {
            foreach ($this->params as $name => $value) {
                if (!preg_match(self::TOKEN_PATTERN, $value)) {
                    $value = '"'.str_replace(["\\", '"'], ["\\\\", "\\\""], $value).'"';
                }
                $out .= ";$name=".$value;
            }
        }
        return $out;
    }

    /** Returns the type and subtype joined by a slash */
    protected function essence(): string {
        return $this->type."/".$this->subtype;
    }

    /** Parses a UTF-8 string and returns a MimeType instance, or null on failure
     *
     * If parsing an HTTP header, the MimeType::parseBytes method should be used instead
     *
     * @see \MensBeam\Mime\MimeType::parseBytes
     * @param string $mimeType The UTF-8 string to parse
     */
    public static function parse(string $mimeType): ?self {
        // Leading and trailing HTTP whitespace is stripped before anything else, so that
        // trailing whitespace can never end up inside the value of an unterminated quoted parameter
        $mimeType = trim($mimeType, "\t\r\n ");
        if (!preg_match(self::TYPE_PATTERN, $mimeType, $match)) {
            return null;
        }
        [, $type, $subtype, $params] = array_pad($match, 4, "");
        $type = static::parseHttpToken($type);
        $subtype = static::parseHttpToken(rtrim($subtype, "\t\r\n "));
        if (!strlen($type) || !strlen($subtype)) {
            return null;
        }
        return new static(strtolower($type), strtolower($subtype), static::parseParams($params));
    }

    /** Parses a binary string and returns a MimeType instance, or null on failure
     *
     * This should be used on MIME type strings from HTTP headers, which use a special character set
     *
     * @param string $mimeType The binary string to parse
     */
    public static function parseBytes(string $mimeType): ?self {
        return static::parse(static::decode($mimeType));
    }

    /** Returns the UTF-8 isomorphically decoded form of the binary string $bytes
     *
     * @see https://infra.spec.whatwg.org/#isomorphic-decode
     * @param string $bytes The binary string to decode to UTF-8
     */
    public static function decode(string $bytes): string {
        $out = "";
        $length = strlen($bytes);
        for ($a = 0; $a < $length; $a++) {
            $c = $bytes[$a];
            $p = ord($c);
            // ASCII bytes are already valid UTF-8; all others are looked up
            $out .= $p < 0x80 ? $c : self::CHAR_MAP[$p];
        }
        return $out;
    }

    /** Returns the isomorphically encoded form of the UTF-8 input string $chars
     *
     * If the input contains characters beyond the Latin-1 Supplement block,
     * or is not valid UTF-8, null is returned
     *
     * This method should be used when a MIME type of unknown provenance is to be inserted into an HTTP header
     *
     * @see https://infra.spec.whatwg.org/#isomorphic-encode
     * @param string $chars The UTF-8 encoded string to convert to binary
     */
    public static function encode(string $chars): ?string {
        // Split the input into characters; this fails if the input is not valid UTF-8
        $set = preg_split('//u', $chars, -1, \PREG_SPLIT_NO_EMPTY);
        if ($set === false) {
            return null;
        }
        // Invert the decoding map to yield byte values keyed by UTF-8 character
        $map = array_flip(self::CHAR_MAP);
        $out = "";
        foreach ($set as $c) {
            if (strlen($c) === 1) {
                // Single-byte characters are ASCII and are passed through
                $out .= $c;
            } elseif (isset($map[$c])) {
                $out .= chr($map[$c]);
            } else {
                return null;
            }
        }
        return $out;
    }

    /** Parses a parameter string into an associative array of keys and values
     *
     * If a parameter appears more than once, the first valid instance is used
     *
     * @param string $params The raw parameter string, including its leading semicolon
     */
    protected static function parseParams(string $params): array {
        $out = [];
        if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                // Trailing groups which did not participate in the match are absent, hence the padding
                [, $name, $value] = array_pad($match, 3, "");
                $name = strtolower(static::parseHttpToken($name));
                if (!strlen($name) || isset($out[$name])) {
                    continue;
                } elseif (strlen($value) && $value[0] === '"') {
                    $value = static::parseHttpQuotedValue($value);
                    if (is_null($value)) {
                        continue;
                    }
                } else {
                    $value = static::parseHttpBareValue($value);
                    if (!strlen($value)) {
                        continue;
                    }
                }
                $out[$name] = $value;
            }
        }
        return $out;
    }

    /** Validates a string as an HTTP token production
     *
     * Returns the token unchanged, or an empty string if the string is not a valid token
     *
     * @see https://tools.ietf.org/html/rfc7230#section-3.2.6
     * @param string $token The candidate token
     */
    protected static function parseHttpToken(string $token): string {
        return preg_match(self::TOKEN_PATTERN, $token) ? $token : "";
    }

    /** Trims and validates a bare HTTP value string; per HTTP this should be a token, but WHATWG allows the full qdtext production
     *
     * Returns the trimmed value, or an empty string if the value is empty or contains invalid characters
     *
     * @see https://tools.ietf.org/html/rfc7230#section-3.2.6
     * @param string $value The raw unquoted value, possibly with trailing whitespace
     */
    protected static function parseHttpBareValue(string $value): string {
        $value = rtrim($value, "\t\r\n ");
        return preg_match(self::BARE_VALUE_PATTERN, $value) ? $value : "";
    }

    /** Extracts, validates, and unescapes a quoted HTTP value string per the qdtext production
     *
     * Anything following the closing quote is ignored, and the closing quote itself may be missing
     *
     * Returns null if the value contains invalid characters; an empty string is a valid value
     *
     * @see https://tools.ietf.org/html/rfc7230#section-3.2.6
     * @param string $value The raw value, starting with its opening quote
     */
    protected static function parseHttpQuotedValue(string $value): ?string {
        if (preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) {
            // Replace each escape pair with the character it escapes
            return preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]);
        }
        return null;
    }
}