J. King
5 years ago
7 changed files with 177 additions and 32 deletions
@ -0,0 +1,86 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5; |
|||
|
|||
use MensBeam\Intl\Encoding; |
|||
|
|||
abstract class Charset {
    /** Matches an encoding label (e.g. "utf-8") to a compatible decoder class.
     *
     * @param string $value The encoding label to match
     * @return string|null The 'class' entry of the match reported by Encoding::matchLabel(), or null if there is no match
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        if ($encoding) {
            return $encoding['class'];
        }
        return null;
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
     * and returns the class name of a compatible decoder.
     *
     * Only the first "charset" parameter is considered: if its value is not
     * a recognized label, null is returned even if a later "charset"
     * parameter would have been valid.
     *
     * @param string $contentType The value of a Content-Type header-field
     * @return string|null The decoder class name, or null if no usable charset parameter is present
     */
    public static function fromTransport(string $contentType): ?string {
        // Try to sniff out a charset from a Content-Type header-field.
        // This does cut some corners, but should be sufficient for practical use.
        // Case is folded and each run of whitespace is collapsed to a single space,
        // so at most one space ever needs to be skipped at a time below.
        $s = preg_replace("/\s+/", " ", strtolower($contentType)) ?? "";
        $end = strlen($s);
        // skip the type: without a slash there is no media type to attach parameters to
        $pos = strpos($s, "/");
        if ($pos === false) {
            return null;
        }
        // skip the subtype: parameters start after the first semicolon following the slash
        $pos = strpos($s, ";", $pos + 1);
        if ($pos === false) {
            return null;
        }
        $pos++;
        // check parameters in sequence
        while ($pos < $end) {
            // skip any leading whitespace
            if ($s[$pos] === " ") {
                $pos++;
            }
            // the parameter name is everything up to the next equals sign;
            // if there is none, no further value exists to interpret
            $nameEnd = strpos($s, "=", $pos);
            if ($nameEnd === false) {
                break;
            }
            $param = substr($s, $pos, $nameEnd - $pos);
            // skip the equals sign
            $pos = $nameEnd + 1;
            $value = "";
            // whether the semicolon separating this parameter from the next has been consumed
            $sawSeparator = false;
            if ($pos < $end && $s[$pos] === '"') {
                // Value is a quoted-string
                $pos++;
                $closed = false;
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === '"') {
                        $closed = true;
                        break;
                    }
                    if ($c === "\\") {
                        // a backslash escapes the next character, if there is one
                        if ($pos < $end) {
                            $value .= $s[$pos++];
                        }
                    } else {
                        $value .= $c;
                    }
                }
                // only interpret the value if a closing quotation mark was seen
                if (!$closed) {
                    $value = "";
                }
                // a space may sit between the closing quotation mark and what follows
                if ($pos < $end && $s[$pos] === " ") {
                    $pos++;
                }
            } else {
                // Value is a bare token, ended by a semicolon, a space, or the end of input
                while ($pos < $end) {
                    $c = $s[$pos++];
                    if ($c === ";") {
                        $sawSeparator = true;
                        break;
                    }
                    if ($c === " ") {
                        break;
                    }
                    $value .= $c;
                }
            }
            // if the parameter was the character set, interpret its value and return
            if ($param === "charset") {
                return self::fromCharset($value);
            }
            // A quoted-string (or a token followed by a space) leaves the separating
            // semicolon unread; consume it so that it does not become part of the
            // next parameter's name
            if (!$sawSeparator && $pos < $end && $s[$pos] === ";") {
                $pos++;
            }
        }
        return null;
    }

    /** Placeholder for determining an encoding by scanning document data.
     *
     * No scanning is performed yet: null is returned for any input.
     *
     * @param string $data The data which would be examined
     * @return string|null Currently always null
     */
    public static function fromPrescan(string $data): ?string {
        return null;
    }
}
@ -0,0 +1,48 @@ |
|||
<?php |
|||
declare(strict_types=1); |
|||
namespace dW\HTML5\TestCase; |
|||
|
|||
use dW\HTML5\Charset; |
|||
use MensBeam\Intl\Encoding\UTF8; |
|||
use MensBeam\Intl\Encoding\Windows1252; |
|||
|
|||
/** |
|||
* @covers \dW\HTML5\Charset |
|||
*/ |
|||
class TestCharset extends \PHPUnit\Framework\TestCase {
    /**
     * Checks that Charset::fromCharset() yields the expected decoder class for a label.
     *
     * @dataProvider provideCharsets
     */
    public function testDetermineEncodingFromEncodingLabel(string $in, ?string $exp): void {
        $this->assertSame($exp, Charset::fromCharset($in));
    }

    /**
     * Supplies pairs of encoding label and expected decoder class (null for no match).
     *
     * Static so that the test runner can call it without instantiating the test case.
     *
     * @return array<int, array{0: string, 1: string|null}>
     */
    public static function provideCharsets(): array {
        return [
            ["UTF-8", UTF8::class],
            // surrounding whitespace is expected to be tolerated
            [" utf8 ", UTF8::class],
            // this label is expected to resolve to the windows-1252 decoder
            ["ISO-8859-1", Windows1252::class],
            // a whole Content-Type value is not an encoding label
            ["text/html; charset=utf8", null],
        ];
    }

    /**
     * Checks that Charset::fromTransport() yields the expected decoder class for a Content-Type value.
     *
     * @dataProvider provideContentTypes
     */
    public function testDetermineEncodingFromContentType(string $in, ?string $exp): void {
        $this->assertSame($exp, Charset::fromTransport($in));
    }

    /**
     * Supplies pairs of Content-Type value and expected decoder class (null for no match).
     *
     * Static so that the test runner can call it without instantiating the test case.
     *
     * @return array<int, array{0: string, 1: string|null}>
     */
    public static function provideContentTypes(): array {
        return [
            // inputs lacking a media type and/or a parameter separator
            ["UTF-8", null],
            ["charset=utf8", null],
            ["text/html", null],
            ["text/html charset=utf8", null],
            // a single charset parameter, as token or quoted-string
            ["text/html; charset=utf8", UTF8::class],
            ["text/html;charset=utf8", UTF8::class],
            ["text/html; charset=\"utf8\"", UTF8::class],
            // charset preceded by another parameter
            ["image/svg+xml; param=value; charset=utf8", UTF8::class],
            // only the first charset parameter counts, valid or not
            ["image/svg+xml; charset=utf8; charset=big5", UTF8::class],
            ["image/svg+xml; charset=utf8;charset=big5", UTF8::class],
            ["text/html; charset=not-valid; charset=big5", null],
            ["text/html; charset=not-valid", null],
        ];
    }
}
Loading…
Reference in new issue