J. King
5 years ago
7 changed files with 177 additions and 32 deletions
@ -0,0 +1,86 @@ |
|||||
|
<?php |
||||
|
declare(strict_types=1); |
||||
|
namespace dW\HTML5; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding; |
||||
|
|
||||
|
abstract class Charset {
    /** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
     *
     * @param string $value The encoding label to match
     * @return string|null The value of the "class" entry of the match reported by
     *  Encoding::matchLabel(), or null if that call yields a falsy result
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        return $encoding ? $encoding['class'] : null;
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
     * and returns the class name of a compatible decoder.
     *
     * Only the first "charset" parameter which has a value is considered: if
     * its value is not a recognized encoding label, null is returned even if
     * a later "charset" parameter would have been valid. Parameters without a
     * value (no "=" before the next ";") are skipped.
     *
     * @param string $contentType The value of a Content-Type header-field
     * @return string|null The decoder class name, or null if no usable charset was found
     */
    public static function fromTransport(string $contentType): ?string {
        // Try to sniff out a charset from a Content-Type header-field.
        // This does cut some corners, but should be sufficient for practical use

        // lowercase the input and collapse whitespace runs, so that at most a single
        // space can separate tokens; preg_replace() yields null only on a PCRE error
        $s = preg_replace("/\s+/", " ", strtolower($contentType)) ?? "";
        $end = strlen($s);
        // skip the type: it ends at the first slash
        $pos = strpos($s, "/");
        if ($pos === false) {
            return null;
        }
        // skip the subtype: parameters start after the first semicolon following the slash
        $pos = strpos($s, ";", $pos + 1);
        if ($pos === false) {
            return null;
        }
        $pos++;
        // check parameters in sequence
        while ($pos < $end) {
            // skip any leading whitespace (never more than one space after normalization)
            if ($s[$pos] === " ") {
                $pos++;
            }
            // collect characters for the parameter name; a semicolon also ends the
            // name so that a valueless parameter cannot swallow the one after it
            $length = strcspn($s, "=;", $pos);
            $param = substr($s, $pos, $length);
            $pos += $length;
            if ($pos >= $end || $s[$pos] === ";") {
                // the parameter has no value: move past the semicolon and try the next one
                $pos++;
                continue;
            }
            // skip the equals sign
            $pos++;
            $value = "";
            if ($pos < $end && $s[$pos] === '"') {
                // Value is a quoted-string
                $pos++;
                $closed = false;
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === '"') {
                        $closed = true;
                        break;
                    }
                    if ($c === "\\") {
                        // a backslash escapes the next character, if there is one
                        if ($pos < $end) {
                            $value .= $s[$pos++];
                        }
                    } else {
                        $value .= $c;
                    }
                }
                // only interpret the value if a closing quotation mark was seen
                if (!$closed) {
                    $value = "";
                }
            } else {
                // Value is a bare token, ended by a semicolon, a space, or the end of input
                $length = strcspn($s, "; ", $pos);
                $value = substr($s, $pos, $length);
                $pos += $length;
            }
            // if the parameter was the character set, interpret its value and return
            if ($param === "charset") {
                $encoding = Encoding::matchLabel($value);
                return $encoding ? $encoding['class'] : null;
            }
            // discard anything left before the next semicolon and step over it, so
            // that the next iteration starts cleanly on a parameter name
            $pos += strcspn($s, ";", $pos) + 1;
        }
        return null;
    }

    /** Placeholder for determining an encoding by examining the document data itself.
     *
     * This is currently a stub: the data is not examined and null is always returned.
     *
     * @param string $data The data to examine (currently unused)
     * @return string|null Always null at present
     */
    public static function fromPrescan(string $data): ?string {
        return null;
    }
}
@ -0,0 +1,48 @@ |
|||||
|
<?php |
||||
|
declare(strict_types=1); |
||||
|
namespace dW\HTML5\TestCase; |
||||
|
|
||||
|
use dW\HTML5\Charset; |
||||
|
use MensBeam\Intl\Encoding\UTF8; |
||||
|
use MensBeam\Intl\Encoding\Windows1252; |
||||
|
|
||||
|
/** |
||||
|
* @covers \dW\HTML5\Charset |
||||
|
*/ |
||||
|
class TestCharset extends \PHPUnit\Framework\TestCase {
    /**
     * Checks that Charset::fromCharset() maps a bare encoding label to the expected decoder class.
     *
     * @dataProvider provideCharsets
     */
    public function testDetermineEncodingFromEncodingLabel(string $in, ?string $exp): void {
        $this->assertSame($exp, Charset::fromCharset($in));
    }

    /**
     * Supplies encoding labels along with the decoder class expected for each.
     *
     * The provider is static so that it also works with PHPUnit versions
     * which no longer accept non-static data providers.
     *
     * @return array[] A list of [input label, expected class name or null] pairs
     */
    public static function provideCharsets(): array {
        return [
            ["UTF-8", UTF8::class],
            ["  utf8  ", UTF8::class],
            ["ISO-8859-1", Windows1252::class],
            // a whole Content-Type value is not an encoding label
            ["text/html; charset=utf8", null],
        ];
    }

    /**
     * Checks that Charset::fromTransport() extracts the expected decoder class from a Content-Type value.
     *
     * @dataProvider provideContentTypes
     */
    public function testDetermineEncodingFromContentType(string $in, ?string $exp): void {
        $this->assertSame($exp, Charset::fromTransport($in));
    }

    /**
     * Supplies Content-Type header-field values along with the decoder class expected for each.
     *
     * The provider is static so that it also works with PHPUnit versions
     * which no longer accept non-static data providers.
     *
     * @return array[] A list of [Content-Type value, expected class name or null] pairs
     */
    public static function provideContentTypes(): array {
        return [
            // inputs lacking a type/subtype followed by a semicolon yield nothing
            ["UTF-8", null],
            ["charset=utf8", null],
            ["text/html", null],
            ["text/html charset=utf8", null],
            // well-formed charset parameters
            ["text/html; charset=utf8", UTF8::class],
            ["text/html;charset=utf8", UTF8::class],
            ["text/html; charset=\"utf8\"", UTF8::class],
            ["image/svg+xml; param=value; charset=utf8", UTF8::class],
            // only the first charset parameter is considered, valid or not
            ["image/svg+xml; charset=utf8; charset=big5", UTF8::class],
            ["image/svg+xml; charset=utf8;charset=big5", UTF8::class],
            ["text/html; charset=not-valid; charset=big5", null],
            ["text/html; charset=not-valid", null],
        ];
    }
}
Loading…
Reference in new issue