Browse Source

Add a WHATWG MIME type parser, with tests

master
J. King 4 years ago
parent
commit
5ea5f96e9b
  1. 126
      lib/Parser/MimeType.php
  2. 20
      tests/cases/Util/Mime/README
  3. 3526
      tests/cases/Util/Mime/generated-mime-types.json
  4. 383
      tests/cases/Util/Mime/mime-types.json
  5. 42
      tests/cases/Util/MimeTypeTest.php

126
lib/Parser/MimeType.php

@ -0,0 +1,126 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\Parser;
use function PHPSTORM_META\type;
/**
 * A parsed MIME type, consisting of a type, a subtype, and a list of parameters.
 *
 * Instances are normally produced by {@see MimeType::parse()}, which follows
 * the WHATWG "parse a MIME type" algorithm, and are serialized by casting
 * the object to a string.
 *
 * @property-read string $essence The type and subtype joined by "/", without any parameters
 */
class MimeType {
    /** Splits a whole MIME type string into type, subtype, and the unparsed parameter list */
    protected const TYPE_PATTERN = <<<'PATTERN'
<^
[\t\r\n ]* # optional leading whitespace
([^/]+) # type
/ # type/subtype delimiter
([^;]+) # subtype (possibly with trailing whitespace)
(;.*)? # optional parameters, to be parsed separately
[\t\r\n ]* # optional trailing whitespace
$>sx
PATTERN;
    /** Matches one parameter at a time; every part is optional, so it never fails to match */
    protected const PARAM_PATTERN = <<<'PATTERN'
<
[;\t\r\n ]* # parameter delimiter and leading whitespace, all optional
([^=;]*) # parameter name; may be empty
(?:= # parameter name/value delimiter
(
"(?:\\.|[^"])*(?:"|$)[^;]* # quoted parameter value, where a backslash escapes any following character, and optional garbage
|[^;]* # unquoted parameter value (possibly with trailing whitespace)
)
)?
;? # optional trailing parameter delimiter
[\t\r\n ]* # optional trailing whitespace
>sx
PATTERN;
    // The D modifier makes "$" match only at the very end of the subject;
    // without it a token followed by a single trailing newline would be accepted
    protected const TOKEN_PATTERN = '<^[A-Za-z0-9!#$%&\'*+\-\.\^_`|~]+$>sD';
    protected const BARE_VALUE_PATTERN = '<^[\t\x{20}-\x{7E}\x{80}-\x{FF}]+$>suD';
    // Extracts the raw (still escaped) content of a quoted string: escape pairs and
    // ordinary characters are consumed possessively, a lone backslash at the end of
    // input is kept, and anything after the closing quote is ignored
    protected const QUOTED_VALUE_PATTERN = '<^"((?:\\\\.|[^"\\\\])*+\\\\?)(?:"|$)>su';
    protected const ESCAPE_PATTERN = '<\\\(.)>s';

    public $type = "";
    public $subtype = "";
    public $params = [];
    private $essence;

    /**
     * @param string $type The type, e.g. "text"
     * @param string $subtype The subtype, e.g. "html"
     * @param array $params Parameter values keyed by parameter name
     */
    public function __construct(string $type = "", string $subtype = "", array $params = []) {
        $this->type = $type;
        $this->subtype = $subtype;
        $this->params = $params;
    }

    /** Provides the computed read-only "essence" property; other names are read as-is, yielding null when unset */
    public function __get(string $name) {
        if ($name === "essence") {
            return $this->type."/".$this->subtype;
        }
        return $this->$name ?? null;
    }

    /**
     * Serializes the MIME type.
     *
     * Parameter values which are not valid tokens (including the empty string)
     * are emitted as quoted strings with backslashes and quotes escaped.
     */
    public function __toString(): string {
        $out = $this->__get("essence");
        if (is_array($this->params) && sizeof($this->params)) {
            foreach ($this->params as $name => $value) {
                // the properties are public and untyped, so do not assume a string
                $value = (string) $value;
                if (preg_match(self::TOKEN_PATTERN, $value)) {
                    $out .= ";$name=$value";
                } else {
                    $out .= ";$name=".'"'.str_replace(["\\", '"'], ["\\\\", "\\\""], $value).'"';
                }
            }
        }
        return $out;
    }

    /**
     * Parses a MIME type string.
     *
     * @param string $mimeType The string to parse
     * @return static|null The parsed MIME type with lower-cased type and subtype, or null if the type or subtype is missing or invalid
     */
    public static function parse(string $mimeType): ?self {
        // leading and trailing HTTP whitespace is never significant; removing it up
        // front keeps it out of an unterminated quoted value at the end of input
        $mimeType = trim($mimeType, "\t\r\n ");
        if (!preg_match(self::TYPE_PATTERN, $mimeType, $match)) {
            return null;
        }
        [, $type, $subtype, $params] = array_pad($match, 4, "");
        $type = static::parseHttpToken($type);
        $subtype = static::parseHttpToken(rtrim($subtype, "\t\r\n "));
        if (!strlen($type) || !strlen($subtype)) {
            return null;
        }
        return new static(strtolower($type), strtolower($subtype), static::parseParams($params));
    }

    /**
     * Parses the parameter portion of a MIME type string.
     *
     * Parameters with an invalid name, an invalid value, or an empty unquoted
     * value are skipped; when a name is repeated only the first valid
     * occurrence is kept.
     *
     * @param string $params The raw parameter list, including its leading ";"
     * @return array Parameter values keyed by lower-cased parameter name
     */
    protected static function parseParams(string $params): array {
        $out = [];
        if (preg_match_all(self::PARAM_PATTERN, $params, $matches, \PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                // trailing unmatched groups are omitted from the match, hence the padding
                [, $name, $value] = array_pad($match, 3, "");
                $name = strtolower(static::parseHttpToken($name));
                if (!strlen($name) || isset($out[$name])) {
                    continue;
                }
                if (strlen($value) && $value[0] === '"') {
                    $value = static::parseHttpQuotedValue($value);
                    if (is_null($value)) {
                        continue;
                    }
                } else {
                    $value = static::parseHttpBareValue($value);
                    if (!strlen($value)) {
                        continue;
                    }
                }
                $out[$name] = $value;
            }
        }
        return $out;
    }

    /** Returns the input unchanged if it is a valid HTTP token, or an empty string otherwise */
    protected static function parseHttpToken(string $token): string {
        return preg_match(self::TOKEN_PATTERN, $token) ? $token : "";
    }

    /** Strips trailing whitespace from an unquoted value and returns it if valid, or an empty string otherwise */
    protected static function parseHttpBareValue(string $value): string {
        $value = rtrim($value, "\t\r\n ");
        return preg_match(self::BARE_VALUE_PATTERN, $value) ? $value : "";
    }

    /**
     * Extracts and unescapes the value of a quoted string.
     *
     * The quoted string may be unterminated and may be followed by garbage.
     * Validation happens after unescaping so that an escaped character is
     * judged on its own rather than together with its backslash.
     *
     * @param string $value The quoted string, starting with its opening quote
     * @return string|null The unescaped value (possibly empty), or null if it contains a forbidden character
     */
    protected static function parseHttpQuotedValue(string $value): ?string {
        if (!preg_match(self::QUOTED_VALUE_PATTERN, $value, $match)) {
            return null;
        }
        $out = preg_replace(self::ESCAPE_PATTERN, '$1', $match[1]);
        if (!is_string($out)) {
            return null;
        }
        // unlike an unquoted value, a quoted value is allowed to be empty
        if ($out === "" || preg_match(self::BARE_VALUE_PATTERN, $out)) {
            return $out;
        }
        return null;
    }
}

20
tests/cases/Util/Mime/README

@ -0,0 +1,20 @@
The MimeType class used in Lax is an original work, but its test suite is
derived from an existing test corpus from the following source:
The Web Platform Test suite
<https://github.com/web-platform-tests/wpt/tree/62317fb983ca5687e4133d89f5523839fdab7f69/mimesniff/mime-types>
The license text is reproduced below, but please note that the license text
has changed since the last time the mimesniff portion of the test suite was
modified; the text included here is current as of 2019-04-25.
Copyright 2019 web-platform-tests contributors
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

3526
tests/cases/Util/Mime/generated-mime-types.json

File diff suppressed because it is too large

383
tests/cases/Util/Mime/mime-types.json

@ -0,0 +1,383 @@
[
"Basics",
{
"input": "text/html;charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "TEXT/HTML;CHARSET=GBK",
"output": "text/html;charset=GBK",
"navigable": true,
"encoding": "GBK"
},
"Legacy comment syntax",
{
"input": "text/html;charset=gbk(",
"output": "text/html;charset=\"gbk(\"",
"navigable": true,
"encoding": null
},
{
"input": "text/html;x=(;charset=gbk",
"output": "text/html;x=\"(\";charset=gbk",
"navigable": true,
"encoding": "GBK"
},
"Duplicate parameter",
{
"input": "text/html;charset=gbk;charset=windows-1255",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=();charset=GBK",
"output": "text/html;charset=\"()\"",
"navigable": true,
"encoding": null
},
"Spaces",
{
"input": "text/html;charset =gbk",
"output": "text/html",
"navigable": true,
"encoding": null
},
{
"input": "text/html ;charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html; charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset= gbk",
"output": "text/html;charset=\" gbk\"",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset= \"gbk\"",
"output": "text/html;charset=\" \\\"gbk\\\"\"",
"navigable": true,
"encoding": null
},
"0x0B and 0x0C",
{
"input": "text/html;charset=\u000Bgbk",
"output": "text/html",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset=\u000Cgbk",
"output": "text/html",
"navigable": true,
"encoding": null
},
{
"input": "text/html;\u000Bcharset=gbk",
"output": "text/html",
"navigable": true,
"encoding": null
},
{
"input": "text/html;\u000Ccharset=gbk",
"output": "text/html",
"navigable": true,
"encoding": null
},
"Single quotes are a token, not a delimiter",
{
"input": "text/html;charset='gbk'",
"output": "text/html;charset='gbk'",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset='gbk",
"output": "text/html;charset='gbk",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset=gbk'",
"output": "text/html;charset=gbk'",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset=';charset=GBK",
"output": "text/html;charset='",
"navigable": true,
"encoding": null
},
"Invalid parameters",
{
"input": "text/html;test;charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;test=;charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;';charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;\";charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html ; ; charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;;;;charset=gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset= \"\u007F;charset=GBK",
"output": "text/html;charset=GBK",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"\u007F;charset=foo\";charset=GBK",
"output": "text/html;charset=GBK",
"navigable": true,
"encoding": "GBK"
},
"Double quotes",
{
"input": "text/html;charset=\"gbk\"",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"gbk",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=gbk\"",
"output": "text/html;charset=\"gbk\\\"\"",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset=\" gbk\"",
"output": "text/html;charset=\" gbk\"",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"gbk \"",
"output": "text/html;charset=\"gbk \"",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"\\ gbk\"",
"output": "text/html;charset=\" gbk\"",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"\\g\\b\\k\"",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"gbk\"x",
"output": "text/html;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
{
"input": "text/html;charset=\"\";charset=GBK",
"output": "text/html;charset=\"\"",
"navigable": true,
"encoding": null
},
{
"input": "text/html;charset=\";charset=GBK",
"output": "text/html;charset=\";charset=GBK\"",
"navigable": true,
"encoding": null
},
"Unexpected code points",
{
"input": "text/html;charset={gbk}",
"output": "text/html;charset=\"{gbk}\"",
"navigable": true,
"encoding": null
},
"Parameter name longer than 127",
{
"input": "text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk",
"output": "text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk",
"navigable": true,
"encoding": "GBK"
},
"type/subtype longer than 127",
{
"input": "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789",
"output": "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"
},
"Valid",
{
"input": "!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
"output": "!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
},
{
"input": "x/x;x=\"\t !\\\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00B0\u00B1\u00B2\u00B3\u00B4\u00B5\u00B6\u00B7\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u00BF\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00C6\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D0\u00D1\u00D2\u00D3\u00D4\u00D5\u00D6\u00D7\u00D8\u00D9\u00DA\u00DB\u00DC\u00DD\u00DE\u00DF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4\u00F5\u00F6\u00F7\u00F8\u00F9\u00FA\u00FB\u00FC\u00FD\u00FE\u00FF\"",
"output": "x/x;x=\"\t !\\\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00B0\u00B1\u00B2\u00B3\u00B4\u00B5\u00B6\u00B7\u00B8\u00B9\u00BA\u00BB\u00BC\u00BD\u00BE\u00BF\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5\u00C6\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D0\u00D1\u00D2\u00D3\u00D4\u00D5\u00D6\u00D7\u00D8\u00D9\u00DA\u00DB\u00DC\u00DD\u00DE\u00DF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4\u00F5\u00F6\u00F7\u00F8\u00F9\u00FA\u00FB\u00FC\u00FD\u00FE\u00FF\""
},
"End-of-file handling",
{
"input": "x/x;test",
"output": "x/x"
},
{
"input": "x/x;test=\"\\",
"output": "x/x;test=\"\\\\\""
},
"Whitespace (not handled by generated-mime-types.json or above)",
{
"input": "x/x;x= ",
"output": "x/x"
},
{
"input": "x/x;x=\t",
"output": "x/x"
},
{
"input": "x/x\n\r\t ;x=x",
"output": "x/x;x=x"
},
{
"input": "\n\r\t x/x;x=x\n\r\t ",
"output": "x/x;x=x"
},
{
"input": "x/x;\n\r\t x=x\n\r\t ;x=y",
"output": "x/x;x=x"
},
"Latin1",
{
"input": "text/html;test=\u00FF;charset=gbk",
"output": "text/html;test=\"\u00FF\";charset=gbk",
"navigable": true,
"encoding": "GBK"
},
">Latin1",
{
"input": "x/x;test=\uFFFD;x=x",
"output": "x/x;x=x"
},
"Failure",
{
"input": "\u000Bx/x",
"output": null
},
{
"input": "\u000Cx/x",
"output": null
},
{
"input": "x/x\u000B",
"output": null
},
{
"input": "x/x\u000C",
"output": null
},
{
"input": "",
"output": null
},
{
"input": "\t",
"output": null
},
{
"input": "/",
"output": null
},
{
"input": "bogus",
"output": null
},
{
"input": "bogus/",
"output": null
},
{
"input": "bogus/ ",
"output": null
},
{
"input": "bogus/bogus/;",
"output": null
},
{
"input": "</>",
"output": null
},
{
"input": "(/)",
"output": null
},
{
"input": "ÿ/ÿ",
"output": null
},
{
"input": "text/html(;doesnot=matter",
"output": null
},
{
"input": "{/}",
"output": null
},
{
"input": "\u0100/\u0100",
"output": null
},
{
"input": "text /html",
"output": null
},
{
"input": "text/ html",
"output": null
},
{
"input": "\"text/html\"",
"output": null
}
]

42
tests/cases/Util/MimeTypeTest.php

@ -0,0 +1,42 @@
<?php
/** @license MIT
* Copyright 2018 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\TestCase\Util;
use MensBeam\Lax\Parser\MimeType as Mime;
/** @covers \MensBeam\Lax\Parser\MimeType */
class MimeTypeTest extends \PHPUnit\Framework\TestCase {
    /**
     * Parses each input from the JSON test corpus and compares the result
     * against the expected serialization; a null expectation means that
     * parsing must fail.
     *
     * @dataProvider provideStandardTests
     */
    public function testStandardTestSuite(string $input, ?string $exp): void {
        if (is_null($exp)) {
            $this->assertNull(Mime::parse($input));
        } else {
            $this->assertSame($exp, (string) Mime::parse($input));
        }
    }

    /**
     * Yields one data set per test object found in the JSON files under the
     * "Mime" directory next to this file.
     *
     * Each file holds an array whose members are either test objects or
     * strings; a string describes the test object which follows it.
     */
    public function provideStandardTests(): iterable {
        foreach (new \GlobIterator(__DIR__."/Mime/*.json", \FilesystemIterator::CURRENT_AS_PATHNAME | \FilesystemIterator::KEY_AS_FILENAME) as $file => $path) {
            $tests = json_decode((string) file_get_contents($path));
            if (!is_array($tests)) {
                // fail loudly rather than silently running no tests from this file
                throw new \Exception("Test file \"$file\" could not be decoded: ".json_last_error_msg());
            }
            $indexOffset = 0;
            $description = "";
            foreach ($tests as $index => $test) {
                if (is_string($test)) {
                    // the array member is a description of the next member
                    // the index offset should be decremented, the description stored, and this entry skipped
                    $indexOffset--;
                    $description = $test;
                    continue;
                }
                // number the tests as if the description strings were not in the array
                $label = "$file #".($index + $indexOffset);
                if ($description) {
                    $label .= ": $description";
                }
                yield $label => [$test->input, $test->output];
                // a description applies only to the test immediately after it
                $description = "";
            }
        }
    }
}
Loading…
Cancel
Save