diff --git a/lib/Charset.php b/lib/Charset.php index 0ca26ad..7047ee6 100644 --- a/lib/Charset.php +++ b/lib/Charset.php @@ -12,7 +12,7 @@ abstract class Charset { public static function fromCharset(string $value): ?string { $encoding = Encoding::matchLabel($value); if ($encoding) { - return $encoding['class']; + return $encoding['name']; } return null; } @@ -71,7 +71,7 @@ abstract class Charset { if ($param === "charset") { $encoding = Encoding::matchLabel($value); if ($encoding) { - return $encoding['class']; + return $encoding['name']; } else { return null; } @@ -80,7 +80,372 @@ abstract class Charset { return null; } - public static function fromPrescan(string $data): ?string { - return null; + public static function fromPrescan(string $data, int $endAfter = 1024): ?string { + // Summary: looks only at the first $endAfter bytes of $data and walks them byte by byte, following the prescan steps quoted below, to find an encoding declared by a meta element; returns the value produced by fromCharset()/fromMeta() for that declaration. + # When an algorithm requires a user agent to prescan a byte stream to + # determine its encoding, given some defined end condition, then it + # must run the following steps. + # These steps either abort unsuccessfully or return a character + # encoding. If at any point during these steps (including during + # instances of the get an attribute algorithm invoked by this one) + # the user agent either runs out of bytes (meaning the position + # pointer created in the first step below goes beyond the end of the + # byte stream obtained so far) or reaches its end condition, then + # abort the prescan a byte stream to determine its encoding + # algorithm unsuccessfully. + $s = substr($data, 0, $endAfter); + + # Let position be a pointer to a byte in the input byte stream, + # initially pointing at the first byte. 
+ $pos = 0; + + # Loop: If position points to: + while ($pos < $endAfter) { + // OPTIMIZATION: Start by skipping anything not a less-than sign + if (@$s[$pos] === "<") { + $pos++; + + // NOTE(review): this copy looks truncated from here on - text between angle brackets appears to have been stripped, taking the conditions of several branches with it; restore from the original commit before relying on it. + # A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`' sequence) and + # comes after the 0x3C byte that was found. (The two + # 0x2D bytes can be the same as those in the '", $pos) ?: $endAfter) + 3; + } + # A sequence of bytes starting with: 0x3C, 0x4D or 0x6D, + # 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of + # 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive + # ASCII 'i", substr($s, $pos, 5))) { + # Advance the position pointer so that it points at + # the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F + # byte (the one in sequence of characters matched above). + $pos += 5; + # Let attribute list be an empty list of strings. + # Let got pragma be false. + # Let need pragma be null. + # Let charset be the null value (which, for the purposes + # of this algorithm, is distinct from an unrecognized + # encoding or the empty string). + $attrList = []; + $gotPragma = false; + $needPragma = null; + $charset = null; + + # Attributes: Get an attribute and its value. + # If no attribute was sniffed, then jump to the processing step below. + while ($attr = self::getAttribute($s, $pos)) { + # If the attribute's name is already in attribute list, + # then return to the step labeled attributes. + if (isset($attrList[$attr['name']])) { + continue; + } + # Add the attribute's name to attribute list. + $attrList[$attr['name']] = true; + # Run the appropriate step from the following list, if one applies: + + # If the attribute's name is "http-equiv" + if ($attr['name'] === "http-equiv") { + # If the attribute's value is "content-type", then set got pragma to true. 
+ if ($attr['value'] === "content-type") { + $gotPragma = true; + } + } + # If the attribute's name is "content" + elseif ($attr['name'] === "content") { + # Apply the algorithm for extracting a character encoding from a meta + # element, giving the attribute's value as the string to parse. + # If a character encoding is returned, and if charset is still set to + # null, let charset be the encoding returned, and set need pragma to true. + + // OPTIMIZATION: Check if charset is null before performing the algorithm + if (is_null($charset) && $candidate = self::fromMeta($attr['value'])) { + $charset = $candidate; + $needPragma = true; + } + } + # If the attribute's name is "charset" + elseif ($attr['name'] === "charset") { + # Let charset be the result of getting an encoding from the attribute's + # value, and set need pragma to false. + $candidate = self::fromCharset($attr['value']); + $charset = $candidate ?? false; // false signifies 'failure' + $needPragma = false; + } + } + + # Processing: If need pragma is null, then jump to the step below labeled next byte. + # If need pragma is true but got pragma is false, then jump to the step below labeled next byte. + if (is_null($needPragma) || ($needPragma && !$gotPragma)) { + continue; + } + # If charset is failure, then jump to the step below labeled next byte. + if ($charset === false) { + $pos++; + continue; + } + # If charset is a UTF-16 encoding, then set charset to UTF-8. + // NOTE(review): $charset comes from fromCharset(), which returns $encoding['name']; presumably the UTF-16 names are "UTF-16BE"/"UTF-16LE", in which case this exact match never fires - verify against Encoding::matchLabel(). + elseif ($charset === "UTF-16") { + $charset = "UTF-8"; + } + # If charset is x-user-defined, then set charset to windows-1252. + elseif ($charset === "x-user-defined") { + $charset = "windows-1252"; + } + # Abort the prescan a byte stream to determine its encoding algorithm, + # returning the encoding given by charset. 
+ return $charset; + } + # A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/), + # and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z) + elseif (($s[$pos] === "/" && ctype_alpha($s[$pos + 1])) || (ctype_alpha($s[$pos]))) { + # Advance the position pointer so that it points at the next + # 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte. + // NOTE(review): the post-increment leaves $pos one byte past the matched delimiter, so after a ">" the getAttribute() calls below begin beyond the end of the tag - confirm this is intended. + while (!in_array(@$s[$pos++], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""])); + # Repeatedly get an attribute until no further attributes can be found, + # then jump to the step below labeled next byte. + while(self::getAttribute($s, $pos)); + } + # A sequence of bytes starting with: 0x3C 0x21 (`) that comes after the 0x3C byte that was found. + $pos = (strpos($s, ">", $pos) ?: $endAfter) + 1; + } + } + # Any other byte + else { + # Do nothing with that byte. + $pos++; + } + } + // NOTE(review): reaching this point means no encoding was found, yet nothing is returned; with the ?string return type PHP is expected to throw a TypeError here - an explicit "return null;" looks necessary, confirm. + } + + protected static function getAttribute(string $s, &$pos): array { + // Summary: reads one attribute starting at $pos, advancing $pos (passed by reference) as it goes, and returns ['name' => ..., 'value' => ...] with both parts lower-cased through strtolower(); returns [] when ">" is met before any name or when the input runs out. + # When the prescan a byte stream to determine its encoding + # algorithm says to get an attribute, it means doing this: + + # If the byte at position is one of + # 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), + # or 0x2F (/) then advance position to the next byte and + # redo this step. + while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) { + $pos++; + } + $char = @$s[$pos]; + + # If the byte at position is 0x3E (>), + # then abort the get an attribute algorithm. There isn't one. + if ($char === ">") { + return []; + } + # Otherwise, the byte at position is the start of the attribute name. + # Let attribute name and attribute value be the empty string. + $name = ""; + $value = ""; + + # Process the byte at position as follows: + while ($char !== "") { + # If it is 0x3D (=), and the attribute name is longer than the empty string + if ($char === "=" && $name !== "") { + # Advance position to the next byte and jump to the step below labeled value. 
+ $pos++; + goto value; + } + # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP) + elseif (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " "])) { + goto spaces; + } + # If it is 0x2F (/) or 0x3E (>) + elseif ($char === "/" || $char === ">") { + # Abort the get an attribute algorithm. + # The attribute's name is the value of attribute name, its value is the empty string. + return ['name' => $name, 'value' => $value]; + } + # If it is in the range 0x41 (A) to 0x5A (Z) + # Anything else + else { + # Append the code point with the same value as the byte at position to attribute name. + # (It doesn't actually matter how bytes outside the ASCII range are handled here, + # since only ASCII bytes can contribute to the detection of a character encoding.) + + // OPTIMIZATION: Also handle uppercase characters + $name .= strtolower($char); + } + + # Advance position to the next byte and return to the previous step. + $char = @$s[++$pos]; + } + + if ($char === "") { + // Out of bytes + return []; + } + + spaces: + # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), + # or 0x20 (SP) then advance position to the next byte, then, repeat this step. + // NOTE(review): the list below also contains "/", which the step quoted above does not mention - confirm this is intended. + while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) { + $pos++; + } + $char = @$s[$pos]; + if ($char === "") { + // Out of bytes + return []; + } + # If the byte at position is not 0x3D (=), abort the get an attribute algorithm. + # The attribute's name is the value of attribute name, its value is the empty string. + if ($char !== "=") { + return ['name' => $name, 'value' => $value]; + } + # Advance position past the 0x3D (=) byte. + $char = @$s[++$pos]; + + value: + # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), + # or 0x20 (SP) then advance position to the next byte, then, repeat this step. + // NOTE(review): the list in the loop that follows also contains "/", which the step quoted above does not mention - confirm this is intended. 
+ while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) { + $pos++; + } + $char = @$s[$pos]; + if ($char === "") { + // Out of bytes + return []; + } + # Process the byte at position as follows: + # If it is 0x22 (") or 0x27 (') + if ($char === "'" || $char === '"') { + # Let b be the value of the byte at position. + $b = $char; + # Quote loop: Advance position to the next byte. + while (($char = @$s[++$pos]) !== "") { + # If the value of the byte at position is the value of b, + # then advance position to the next byte and abort + # the "get an attribute" algorithm. + # The attribute's name is the value of attribute name, + # and its value is the value of attribute value. + if ($char === $b) { + $pos++; + return ['name' => $name, 'value' => $value]; + } + # Otherwise, append a code point to attribute value whose + # value is the same as the value of the byte at position. + + // OPTIMIZATION: Also handle uppercase characters + $value .= strtolower($char); + } + // Out of bytes + return []; + } + # If it is 0x3E (>) + elseif ($char === ">") { + # Abort the get an attribute algorithm. + # The attribute's name is the value of attribute name, + # its value is the empty string. + return ['name' => $name, 'value' => $value]; + } + # Anything else + else { + # Append a code point with the same value as the byte at position to attribute value. + # Advance position to the next byte. + + // OPTIMIZATION: Also handle uppercase characters + $value .= strtolower($char); + + while (($char = @$s[++$pos]) !== "") { + # Process the byte at position as follows: + # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) + if (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " ", ">"])) { + # Abort the get an attribute algorithm. + # The attribute's name is the value of attribute name + # and its value is the value of attribute value. 
+ return ['name' => $name, 'value' => $value]; + } + # If it is in the range 0x41 (A) to 0x5A (Z) + # Anything else + else { + # Append a code point with the same value as + # the byte at position to attribute value. + $value .= strtolower($char); + } + } + // Out of bytes + return []; + } + } + + protected static function fromMeta(string $s): ?string { + // Summary: searches $s for "charset", optional whitespace and "=", then takes the quoted or unquoted label that follows and returns self::fromCharset() of it; returns null when "charset" is not found, a quote is unmatched, or nothing follows the "=". + # The algorithm for extracting a character encoding from a meta element, + # given a string s, is as follows. + # It either returns a character encoding or nothing. + + # Let position be a pointer into s, initially pointing at the start of the string. + $pos = 0; + $end = strlen($s); + + # Loop: + while ($pos < $end) { + # Find the first seven characters in s after position + # that are an ASCII case-insensitive match for the word "charset". + # If no such match is found, return nothing. + $found = stripos($s, "charset", $pos); + if ($found === false) { + return null; + } + $pos = $found + 7; + # Skip any ASCII whitespace that immediately follow the word "charset" + # (there might not be any). + while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])) { + $pos++; + } + # If the next character is not a U+003D EQUALS SIGN (=), + # then move position to point just before that next + # character, and jump back to the step labeled loop. + if (@$s[$pos] !== "=") { + continue; + } + # Skip any ASCII whitespace that immediately follow the equals sign + # (there might not be any). + while (in_array(@$s[++$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])); + + # Process the next character as follows: + $char = @$s[$pos]; + + # If it is a U+0022 QUOTATION MARK character (")... + # If it is a U+0027 APOSTROPHE character (')... + if ($char === '"' || $char === "'") { + # ... and there is a later U+0022 QUOTATION MARK character (") in s + # ... 
and there is a later U+0027 APOSTROPHE character (') in s + if (($end = strpos($s, $char, $pos + 1)) !== false) { + $pos++; + return self::fromCharset(substr($s, $pos, $end - $pos)); + } + # If it is an unmatched U+0022 QUOTATION MARK character (") + # If it is an unmatched U+0027 APOSTROPHE character (') + else { + # Return nothing + return null; + } + } + # There is no next character + elseif ($char === "") { + # Return nothing + return null; + } + # Anything else + else { + # Return the result of getting an encoding from the substring + # that consists of this character up to but not including + # the first ASCII whitespace or U+003B SEMICOLON (;) + # character, or the end of s, whichever comes first. + $size = -1; + while (!in_array(@$s[$pos + (++$size)], ["\x09", "\x0A", "\x0C", "\x0D", " ", ";", ""])); + return self::fromCharset(substr($s, $pos, $size)); + } + } + // NOTE(review): if the loop ends without returning (empty $s, or a trailing "charset" that is not followed by "="), nothing is returned; with the ?string return type PHP is expected to throw a TypeError - an explicit "return null;" looks necessary, confirm. } } diff --git a/lib/Data.php b/lib/Data.php index a837535..8c98b76 100644 --- a/lib/Data.php +++ b/lib/Data.php @@ -3,6 +3,7 @@ declare(strict_types=1); namespace dW\HTML5; use MensBeam\Intl\Encoding; +use MensBeam\Intl\Encoding\Encoding as EncodingEncoding; class Data { use ParseErrorEmitter; @@ -50,10 +51,10 @@ class Data { } elseif ($encoding = Charset::fromPrescan($data)) { // Encoding is tentative } else { - // Encoding is tentative; fall back to windows 1252 - $encoding = \MensBeam\Intl\Encoding\Windows1252::class; + // Encoding is tentative; fall back to the configured default encoding + $encoding = Parser::$fallbackEncoding; } - $this->data = new $encoding($data, false, true); + $this->data = Encoding::createDecoder($encoding, $data, false, true); } diff --git a/lib/Parser.php b/lib/Parser.php index 89fd0ca..cca0f47 100644 --- a/lib/Parser.php +++ b/lib/Parser.php @@ -34,6 +34,8 @@ class Parser { /* Static properties */ + public static $fallbackEncoding = "UTF-8"; + // Property used as an instance for the non-static properties protected static $instance; diff --git 
a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php index 35562c2..b7c2eb2 100644 --- a/tests/cases/TestCharset.php +++ b/tests/cases/TestCharset.php @@ -3,8 +3,6 @@ declare(strict_types=1); namespace dW\HTML5\TestCase; use dW\HTML5\Charset; -use MensBeam\Intl\Encoding\UTF8; -use MensBeam\Intl\Encoding\Windows1252; /** * @covers \dW\HTML5\Charset @@ -17,9 +15,9 @@ class TestCharset extends \PHPUnit\Framework\TestCase { public function provideCharsets() { return [ - ["UTF-8", UTF8::class], - [" utf8 ", UTF8::class], - ["ISO-8859-1", Windows1252::class], + ["UTF-8", "UTF-8"], + [" utf8 ", "UTF-8"], + ["ISO-8859-1", "windows-1252"], ["text/html; charset=utf8", null], ]; } @@ -35,12 +33,12 @@ class TestCharset extends \PHPUnit\Framework\TestCase { ["charset=utf8", null], ["text/html", null], ["text/html charset=utf8", null], - ["text/html; charset=utf8", UTF8::class], - ["text/html;charset=utf8", UTF8::class], - ["text/html; charset=\"utf8\"", UTF8::class], - ["image/svg+xml; param=value; charset=utf8", UTF8::class], - ["image/svg+xml; charset=utf8; charset=big5", UTF8::class], - ["image/svg+xml; charset=utf8;charset=big5", UTF8::class], + ["text/html; charset=utf8", "UTF-8"], + ["text/html;charset=utf8", "UTF-8"], + ["text/html; charset=\"utf8\"", "UTF-8"], + ["image/svg+xml; param=value; charset=utf8", "UTF-8"], + ["image/svg+xml; charset=utf8; charset=big5", "UTF-8"], + ["image/svg+xml; charset=utf8;charset=big5", "UTF-8"], ["text/html; charset=not-valid; charset=big5", null], ["text/html; charset=not-valid", null], ["text/html; charsaaet=\"a \\\"fancy\\\" encoding\"", null],