Browse Source

Update encoding pre-scan

Official tests are available in WPT, but will need a new test harness
serialize
J. King 3 years ago
parent
commit
29127c51ee
  1. 125
      lib/Parser/Charset.php
  2. 2
      tests/cases/TestTreeConstructor.php

125
lib/Parser/Charset.php

@ -130,24 +130,40 @@ abstract class Charset {
* @param int $endAfter The number of bytes of the string to stop after
*/
public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
# When an algorithm requires a user agent to prescan a byte stream to
# determine its encoding, given some defined end condition, then it
# must run the following steps.
# These steps either abort unsuccessfully or return a character
# encoding. If at any point during these steps (including during
# instances of the get an attribute algorithm invoked by this one)
# the user agent either runs out of bytes (meaning the position
# pointer created in the first step below goes beyond the end of the
# byte stream obtained so far) or reaches its end condition, then
# abort the prescan a byte stream to determine its encoding
# algorithm unsuccessfully.
# When an algorithm requires a user agent to prescan a byte stream
# to determine its encoding, given some defined end condition,
# then it must run the following steps.
# If at any point during these steps (including during instances
# of the get an attribute algorithm invoked by this one) the
# user agent either runs out of bytes (meaning the position
# pointer created in the first step below goes beyond the end
# of the byte stream obtained so far) or reaches its end condition,
# then abort the prescan a byte stream to determine its encoding
# algorithm and return the result get an XML encoding applied to
# the same bytes that the prescan a byte stream to determine its
# encoding algorithm was applied to. Otherwise, these steps will
# return a character encoding.
$s = substr($data, 0, $endAfter);
$endAfter = strlen($s);
# Let fallback encoding be null.
// NOTE: This is never used
# Let position be a pointer to a byte in the input byte stream,
# initially pointing at the first byte.
$pos = 0;
# Prescan for UTF-16 XML declarations: If position points to:
# A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x')
if (substr($s, 0, 6) === "\x3C\x00\x3F\x00\x78\x00") {
# Return UTF-16LE.
return "UTF-16LE";
}
# A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x')
if (substr($s, 0, 6) === "\x00\x3C\x00\x3F\x00\x78") {
# Return UTF-16BE.
return "UTF-16BE";
}
# Loop: If position points to:
while ($pos < $endAfter) {
// OPTIMIZATION: Start by skipping anything not a less-than sign
@ -273,7 +289,90 @@ abstract class Charset {
$pos++;
}
}
return null;
return static::fromXMLDeclaration($data, $endAfter);
}
/**
 * Attempts to determine an encoding from an XML declaration
 *
 * This follows the "get an XML encoding" steps quoted from the HTML
 * specification in the comments below, which are applied when the
 * encoding pre-scan is aborted without returning an encoding.
 *
 * @param string $data The byte string to examine
 * @param int $endAfter The number of bytes of the string to stop after
 * @return string|null The result of fromCharset() for the declared label (with UTF-16BE/LE replaced by UTF-8), or null if no usable declaration was found
 */
protected static function fromXMLDeclaration(string $data, int $endAfter): ?string {
    # When the prescan a byte stream to determine its encoding algorithm
    #   is aborted without returning an encoding, get an XML encoding
    #   means doing this.
    $s = substr($data, 0, $endAfter);
    // NOTE: The length is used to guard every direct byte access below
    $length = strlen($s);
    # Let encodingPosition be a pointer to the start of the stream.
    $pos = 0;
    # If encodingPosition does not point to the start of a byte sequence
    #   0x3C, 0x3F, 0x78, 0x6D, 0x6C (`<?xml`), then return failure.
    if (substr($s, $pos, 5) !== "<?xml") {
        return null;
    }
    # Let xmlDeclarationEnd be a pointer to the next byte in the input
    #   byte stream which is 0x3E (>). If there is no such byte,
    #   then return failure.
    $xmlDeclarationEnd = strpos($s, ">");
    if ($xmlDeclarationEnd === false) {
        return null;
    }
    # Set encodingPosition to the position of the first occurrence of the
    #   subsequence of bytes 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E,
    #   0x67 (`encoding`) at or after the current encodingPosition. If
    #   there is no such sequence, then return failure.
    // NOTE: This is buggy; see https://github.com/whatwg/html/issues/6939
    $pos = strpos($s, "encoding");
    if ($pos === false || $pos > $xmlDeclarationEnd) {
        return null;
    }
    # Advance encodingPosition past the 0x67 (g) byte.
    $pos += strlen("encoding");
    # While the byte at encodingPosition is less than or equal to 0x20
    #   (i.e., it is either an ASCII space or control character),
    #   advance encodingPosition to the next byte.
    while ($pos < $length && ord($s[$pos]) <= 0x20) {
        $pos++;
    }
    # If the byte at encodingPosition is not 0x3D (=), then return failure.
    if ($pos >= $length || $s[$pos] !== "=") {
        return null;
    }
    # Advance encodingPosition to the next byte.
    // NOTE: Without this step the pointer stays on the equals sign, the
    //   quote-mark check below can never succeed, and every declaration
    //   is rejected
    $pos++;
    # While the byte at encodingPosition is less than or equal to 0x20
    #   (i.e., it is either an ASCII space or control character),
    #   advance encodingPosition to the next byte.
    while ($pos < $length && ord($s[$pos]) <= 0x20) {
        $pos++;
    }
    // NOTE: Running out of bytes is a failure
    if ($pos >= $length) {
        return null;
    }
    # Let quoteMark be the byte at encodingPosition.
    $quoteMark = $s[$pos];
    # If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
    if ($quoteMark !== "'" && $quoteMark !== '"') {
        return null;
    }
    # Advance encodingPosition to the next byte.
    $pos++;
    # Let encodingEndPosition be the position of the next occurrence of
    #   quoteMark at or after encodingPosition. If quoteMark does not
    #   occur again, then return failure.
    $encodingEndPosition = strpos($s, $quoteMark, $pos);
    if ($encodingEndPosition === false) {
        return null;
    }
    # Let potentialEncoding be the sequence of the bytes between
    #   encodingPosition (inclusive) and encodingEndPosition (exclusive).
    $potentialEncoding = substr($s, $pos, $encodingEndPosition - $pos);
    # If potentialEncoding contains one or more bytes whose byte value
    #   is 0x20 or below, then return failure.
    if (preg_match('/[\x{00}-\x{20}]/', $potentialEncoding)) {
        return null;
    }
    # Let encoding be the result of getting an encoding given
    #   potentialEncoding isomorphic decoded.
    // NOTE: Isomorphic decoding is not necessary since all encoding labels are ASCII
    $encoding = static::fromCharset($potentialEncoding);
    # If the encoding is UTF-16BE/LE, then change it to UTF-8.
    if ($encoding === "UTF-16LE" || $encoding === "UTF-16BE") {
        $encoding = "UTF-8";
    }
    # Return encoding.
    return $encoding;
}
/** Scans an attribute during the encoding detection pre-scan */

2
tests/cases/TestTreeConstructor.php

@ -40,7 +40,7 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
/** @dataProvider provideStandardTreeTests */
public function xtestStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void {
public function testStandardTreeTestsWithHtmlNamespace(string $data, array $exp, array $errors, $fragment): void {
$config = new Config;
$config->htmlNamespace = true;
$this->runTreeTest($data, $exp, $errors, $fragment, $config);

Loading…
Cancel
Save