diff --git a/lib/Parser/Charset.php b/lib/Parser/Charset.php index 4edda17..62275de 100644 --- a/lib/Parser/Charset.php +++ b/lib/Parser/Charset.php @@ -130,24 +130,40 @@ abstract class Charset { * @param int $endAfter The number of bytes of the string to stop after */ public static function fromPrescan(string $data, int $endAfter = 1024): ?string { - # When an algorithm requires a user agent to prescan a byte stream to - # determine its encoding, given some defined end condition, then it - # must run the following steps. - # These steps either abort unsuccessfully or return a character - # encoding. If at any point during these steps (including during - # instances of the get an attribute algorithm invoked by this one) - # the user agent either runs out of bytes (meaning the position - # pointer created in the first step below goes beyond the end of the - # byte stream obtained so far) or reaches its end condition, then - # abort the prescan a byte stream to determine its encoding - # algorithm unsuccessfully. + # When an algorithm requires a user agent to prescan a byte stream + # to determine its encoding, given some defined end condition, + # then it must run the following steps. + # If at any point during these steps (including during instances + # of the get an attribute algorithm invoked by this one) the + # user agent either runs out of bytes (meaning the position + # pointer created in the first step below goes beyond the end + # of the byte stream obtained so far) or reaches its end condition, + # then abort the prescan a byte stream to determine its encoding + # algorithm and return the result get an XML encoding applied to + # the same bytes that the prescan a byte stream to determine its + # encoding algorithm was applied to. Otherwise, these steps will + # return a character encoding. $s = substr($data, 0, $endAfter); $endAfter = strlen($s); + # Let fallback encoding be null. 
+ // NOTE: This is never used # Let position be a pointer to a byte in the input byte stream, # initially pointing at the first byte. $pos = 0; - + + # Prescan for UTF-16 XML declarations: If position points to: + # A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '). If there is no such byte, + # then return failure. + $xmlDeclarationEnd = strpos($s, ">"); + if (!$xmlDeclarationEnd) { + return null; + } + # Set encodingPosition to the position of the first occurrence of the + # subsequence of bytes 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, + # 0x67 (`encoding`) at or after the current encodingPosition. If + # there is no such sequence, then return failure. + // NOTE: This is buggy; see https://github.com/whatwg/html/issues/6939 + $pos = strpos($s, "encoding"); + if ($pos === false || $pos > $xmlDeclarationEnd) { + return null; + } + # Advance encodingPosition past the 0x67 (g) byte. + $pos = $pos + strlen("encoding"); + # While the byte at encodingPosition is less than or equal to 0x20 + # (i.e., it is either an ASCII space or control character), + # advance encodingPosition to the next byte. + while (ord($s[$pos]) <= 0x20) { + $pos++; + } + # If the byte at encodingPosition is not 0x3D (=), then return failure. + if ($s[$pos] !== "=") { + return null; + } + # While the byte at encodingPosition is less than or equal to 0x20 + # (i.e., it is either an ASCII space or control character), + # advance encodingPosition to the next byte. + while (ord($s[$pos]) <= 0x20) { + $pos++; + } + # Let quoteMark be the byte at encodingPosition. + $quoteMark = $s[$pos]; + # If quoteMark is not either 0x22 (") or 0x27 ('), then return failure. + if ($quoteMark !== "'" && $quoteMark !== '"') { + return null; + } + # Advance encodingPosition to the next byte. + $pos++; + # Let encodingEndPosition be the position of the next occurrence of + # quoteMark at or after encodingPosition. 
If quoteMark does not + # occur again, then return failure. + $encodingEndPosition = strpos($s, $quoteMark, $pos); + if ($encodingEndPosition === false) { + return null; + } + # Let potentialEncoding be the sequence of the bytes between + # encodingPosition (inclusive) and encodingEndPosition (exclusive). + $potentialEncoding = substr($s, $pos, $encodingEndPosition - $pos); + # If potentialEncoding contains one or more bytes whose byte value + # is 0x20 or below, then return failure. + if (preg_match('/[\x{00}-\x{20}]/', $potentialEncoding)) { + return null; + } + # Let encoding be the result of getting an encoding given + # potentialEncoding isomorphic decoded. + // NOTE: Isomorphic decoding is not necessary since all encoding labels are ASCII + $encoding = static::fromCharset($potentialEncoding); + # If the encoding is UTF-16BE/LE, then change it to UTF-8. + if ($encoding === "UTF-16LE" || $encoding === "UTF-16BE") { + $encoding = "UTF-8"; + } + # Return encoding. + return $encoding; } /** Scans an attribute during the encoding detection pre-scan */ diff --git a/tests/cases/TestTreeConstructor.php b/tests/cases/TestTreeConstructor.php index d84fd25..f7c7c8f 100644 --- a/tests/cases/TestTreeConstructor.php +++ b/tests/cases/TestTreeConstructor.php @@ -40,7 +40,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase { } /** @dataProvider provideStandardTreeTests */ - public function xtestStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void { + public function testStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void { $config = new Config; $config->htmlNamespace = true; $this->runTreeTest($data, $exp, $errors, $fragment, $config);