You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
532 lines
25 KiB
532 lines
25 KiB
<?php
|
|
/** @license MIT
 * Copyright 2017 , Dustin Wilson, J. King et al.
 * See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
namespace MensBeam\HTML\Parser;
|
|
|
|
use MensBeam\Intl\Encoding;
|
|
use MensBeam\Mime\MimeType;
|
|
|
|
/**
 * Static helpers which determine the character encoding of an HTML byte
 * stream from its byte order mark, an encoding label, a Content-Type
 * header-field, a meta element's "content" value, or a prescan of the
 * document head.
 *
 * Comments starting with "#" quote the specification text which the
 * adjacent code implements; comments starting with "//" are implementation
 * notes.
 */
abstract class Charset {
    /** Finds a Unicode byte order mark in a byte stream
     * and returns the detected encoding, if any
     *
     * @param string $data The byte stream to examine
     * @return string|null "UTF-8", "UTF-16BE", "UTF-16LE", or null if the data does not start with a byte order mark
     */
    public static function fromBOM(string $data): ?string {
        // "\u{FEFF}" is the three-byte UTF-8 serialization of U+FEFF (EF BB BF)
        if (substr($data, 0, 3) === "\u{FEFF}") {
            return "UTF-8";
        }
        $head = substr($data, 0, 2);
        if ($head === "\xFE\xFF") {
            return "UTF-16BE";
        }
        if ($head === "\xFF\xFE") {
            return "UTF-16LE";
        }
        return null;
    }

    /** Matches an encoding label (e.g. "utf-8") to its canonical name.
     *
     * @param string $value The encoding label to match
     * @return string|null The 'name' member of the match returned by Encoding::matchLabel(), or null if there was no match
     */
    public static function fromCharset(string $value): ?string {
        $encoding = Encoding::matchLabel($value);
        if ($encoding) {
            return $encoding['name'];
        }
        return null;
    }

    /** Extracts an encoding from an HTTP Content-Type header-field
     * and returns the associated canonical encoding name.
     *
     * @param string $contentType The value of a Content-Type header-field
     * @return string|null The canonical encoding name, or null if the value could not be parsed, has no "charset" parameter, or the parameter is not a known label
     */
    public static function fromTransport(string $contentType): ?string {
        $type = MimeType::parseBytes($contentType);
        if ($type && isset($type->params['charset'])) {
            $encoding = Encoding::matchLabel($type->params['charset']);
            if ($encoding) {
                return $encoding['name'];
            }
        }
        return null;
    }

    /** Interprets the value of an http-equiv Content-Type meta element.
     *
     * Despite the name this uses a different algorithm than that used
     * for parsing HTTP Content-Types
     *
     * @param string $s The value of the meta element's "content" attribute
     * @return string|null The canonical encoding name, or null if no usable "charset" value was found
     */
    public static function fromMeta(string $s): ?string {
        # The algorithm for extracting a character encoding from a meta element,
        #   given a string s, is as follows.
        # It either returns a character encoding or nothing.

        # Let position be a pointer into s, initially pointing at the start of the string.
        $pos = 0;
        $end = strlen($s);

        # Loop:
        while ($pos < $end) {
            # Find the first seven characters in s after position
            #   that are an ASCII case-insensitive match for the word "charset".
            # If no such match is found, return nothing.
            $found = stripos($s, "charset", $pos);
            if ($found === false) {
                return null;
            }
            $pos = $found + 7;
            # Skip any ASCII whitespace that immediately follow the word "charset"
            #   (there might not be any).
            while (in_array($s[$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " "], true)) {
                $pos++;
            }
            # If the next character is not a U+003D EQUALS SIGN (=),
            #   then move position to point just before that next
            #   character, and jump back to the step labeled loop.
            if (($s[$pos] ?? "") !== "=") {
                continue;
            }
            # Skip any ASCII whitespace that immediately follow the equals sign
            #   (there might not be any).
            // NOTE: The pre-increment also steps over the equals sign itself
            while (in_array($s[++$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " "], true));

            # Process the next character as follows:
            $char = $s[$pos] ?? "";

            # If it is a U+0022 QUOTATION MARK character (")...
            # If it is a U+0027 APOSTROPHE character (')...
            if ($char === '"' || $char === "'") {
                # ... and there is a later U+0022 QUOTATION MARK character (") in s
                # ... and there is a later U+0027 APOSTROPHE character (') in s
                $close = strpos($s, $char, $pos + 1);
                if ($close !== false) {
                    // The label is everything between the two quotation marks
                    return self::fromCharset(substr($s, $pos + 1, $close - $pos - 1));
                }
                # If it is an unmatched U+0022 QUOTATION MARK character (")
                # If it is an unmatched U+0027 APOSTROPHE character (')
                # Return nothing
                return null;
            }
            # There is no next character
            elseif ($char === "") {
                # Return nothing
                return null;
            }
            # Anything else
            else {
                # Return the result of getting an encoding from the substring
                #   that consists of this character up to but not including
                #   the first ASCII whitespace or U+003B SEMICOLON (;)
                #   character, or the end of s, whichever comes first.
                $size = -1;
                while (!in_array($s[$pos + (++$size)] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " ", ";", ""], true));
                return self::fromCharset(substr($s, $pos, $size));
            }
        }
        // The string is empty, or the last "charset" was followed by
        //   nothing but whitespace: there is no encoding to return.
        //   Without this explicit return the function would end without
        //   returning a value, which is a TypeError for a ?string return type
        return null;
    }

    /** Inspects the head of an HTML string to guess its encoding
     *
     * @param string $data The HTML string to scan
     * @param int $endAfter The number of bytes of the string to stop after
     * @return string|null The detected encoding name, or null if neither a usable meta element nor an XML declaration yielded one
     */
    public static function fromPrescan(string $data, int $endAfter = 1024): ?string {
        # When an algorithm requires a user agent to prescan a byte stream
        #   to determine its encoding, given some defined end condition,
        #   then it must run the following steps.
        # If at any point during these steps (including during instances
        #   of the get an attribute algorithm invoked by this one) the
        #   user agent either runs out of bytes (meaning the position
        #   pointer created in the first step below goes beyond the end
        #   of the byte stream obtained so far) or reaches its end condition,
        #   then abort the prescan a byte stream to determine its encoding
        #   algorithm and return the result get an XML encoding applied to
        #   the same bytes that the prescan a byte stream to determine its
        #   encoding algorithm was applied to. Otherwise, these steps will
        #   return a character encoding.
        $s = substr($data, 0, $endAfter);
        $endAfter = strlen($s);

        # Let fallback encoding be null.
        // NOTE: This is never used
        # Let position be a pointer to a byte in the input byte stream,
        #   initially pointing at the first byte.
        $pos = 0;

        # Prescan for UTF-16 XML declarations: If position points to:
        $head = substr($s, 0, 6);
        # A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x')
        if ($head === "\x3C\x00\x3F\x00\x78\x00") {
            # Return UTF-16LE.
            return "UTF-16LE";
        }
        # A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x')
        if ($head === "\x00\x3C\x00\x3F\x00\x78") {
            # Return UTF-16BE.
            return "UTF-16BE";
        }

        # Loop: If position points to:
        while ($pos < $endAfter) {
            // OPTIMIZATION: Start by skipping anything not a less-than sign
            if ($s[$pos] === "<") {
                // From here on the position points at the byte after the "<"
                $pos++;

                # A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)
                if (substr($s, $pos, 3) === "!--") {
                    # Advance the position pointer so that it points at the
                    #   first 0x3E byte which is preceded by two 0x2D bytes
                    #   (i.e. at the end of an ASCII '-->' sequence) and
                    #   comes after the 0x3C byte that was found. (The two
                    #   0x2D bytes can be the same as those in the '<!--'
                    #   sequence.)
                    // NOTE: An unterminated comment moves the position past the end, ending the loop
                    $pos = (strpos($s, "-->", $pos) ?: $endAfter) + 3;
                }
                # A sequence of bytes starting with: 0x3C, 0x4D or 0x6D,
                #   0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of
                #   0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive
                #   ASCII '<meta' followed by a space or slash)
                elseif (preg_match("<^meta[\x09\x0A\x0C\x0D /]$>i", substr($s, $pos, 5))) {
                    # Advance the position pointer so that it points at
                    #   the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F
                    #   byte (the one in sequence of characters matched above).
                    // NOTE: This steps past that byte, which getAttribute() would skip anyway
                    $pos += 5;
                    # Let attribute list be an empty list of strings.
                    # Let got pragma be false.
                    # Let need pragma be null.
                    # Let charset be the null value (which, for the purposes
                    #   of this algorithm, is distinct from an unrecognized
                    #   encoding or the empty string).
                    $attrList = [];
                    $gotPragma = false;
                    $needPragma = null;
                    $charset = null;

                    # Attributes: Get an attribute and its value.
                    # If no attribute was sniffed, then jump to the processing step below.
                    while ($attr = self::getAttribute($s, $pos)) {
                        # If the attribute's name is already in attribute list,
                        #   then return to the step labeled attributes.
                        if (isset($attrList[$attr['name']])) {
                            continue;
                        }
                        # Add the attribute's name to attribute list.
                        $attrList[$attr['name']] = true;
                        # Run the appropriate step from the following list, if one applies:

                        # If the attribute's name is "http-equiv"
                        if ($attr['name'] === "http-equiv") {
                            # If the attribute's value is "content-type", then set got pragma to true.
                            if ($attr['value'] === "content-type") {
                                $gotPragma = true;
                            }
                        }
                        # If the attribute's name is "content"
                        elseif ($attr['name'] === "content") {
                            # Apply the algorithm for extracting a character encoding from a meta
                            #   element, giving the attribute's value as the string to parse.
                            # If a character encoding is returned, and if charset is still set to
                            #   null, let charset be the encoding returned, and set need pragma to true.

                            // OPTIMIZATION: Check if charset is null before performing the algorithm
                            if ($charset === null && $candidate = self::fromMeta($attr['value'])) {
                                $charset = $candidate;
                                $needPragma = true;
                            }
                        }
                        # If the attribute's name is "charset"
                        elseif ($attr['name'] === "charset") {
                            # Let charset be the result of getting an encoding from the attribute's
                            #   value, and set need pragma to false.
                            $candidate = self::fromCharset($attr['value']);
                            $charset = $candidate ?? false; // false signifies 'failure'
                            $needPragma = false;
                        }
                    }

                    # Processing: If need pragma is null, then jump to the step below labeled next byte.
                    # If need pragma is true but got pragma is false, then jump to the step below labeled next byte.
                    if ($needPragma === null || ($needPragma && !$gotPragma)) {
                        continue;
                    }
                    # If charset is failure, then jump to the step below labeled next byte.
                    if ($charset === false) {
                        $pos++;
                        continue;
                    }
                    # If charset is a UTF-16 encoding, then set charset to UTF-8.
                    elseif ($charset === "UTF-16" || $charset === "UTF-16LE" || $charset === "UTF-16BE") {
                        $charset = "UTF-8";
                    }
                    # If charset is x-user-defined, then set charset to windows-1252.
                    elseif ($charset === "x-user-defined") {
                        $charset = "windows-1252";
                    }
                    # Abort the prescan a byte stream to determine its encoding algorithm,
                    #   returning the encoding given by charset.
                    return $charset;
                }
                # A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/),
                #   and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)
                elseif ((($s[$pos] ?? "") === "/" && ctype_alpha($s[$pos + 1] ?? "")) || ctype_alpha($s[$pos] ?? "")) {
                    # Advance the position pointer so that it points at the next
                    #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.
                    while (!in_array($s[++$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""], true));
                    # Repeatedly get an attribute until no further attributes can be found,
                    #   then jump to the step below labeled next byte.
                    while (self::getAttribute($s, $pos));
                }
                # A sequence of bytes starting with: 0x3C 0x21 (`<!`)
                # A sequence of bytes starting with: 0x3C 0x2F (`</`)
                # A sequence of bytes starting with: 0x3C 0x3F (`<?`)
                elseif (in_array($s[$pos] ?? "", ["!", "/", "?"], true)) {
                    # Advance the position pointer so that it points at the first
                    #   0x3E byte (>) that comes after the 0x3C byte that was found.
                    $pos = (strpos($s, ">", $pos) ?: $endAfter) + 1;
                }
            }
            # Any other byte
            else {
                # Do nothing with that byte.
                $pos++;
            }
        }
        // Out of bytes: fall back to examining an XML declaration
        return static::fromXMLDeclaration($data, $endAfter);
    }

    /** Extracts an encoding from an XML declaration at the start of a byte stream
     *
     * @param string $data The byte stream to examine
     * @param int $endAfter The number of bytes of the stream to consider
     * @return string|null The canonical encoding name (with UTF-16BE/LE changed to UTF-8), or null on failure
     */
    protected static function fromXMLDeclaration(string $data, int $endAfter): ?string {
        # When the prescan a byte stream to determine its encoding algorithm
        #   is aborted without returning an encoding, get an XML encoding
        #   means doing this.
        $s = substr($data, 0, $endAfter);
        $endAfter = strlen($s);

        # Let encodingPosition be a pointer to the start of the stream.
        $pos = 0;
        # If encodingPosition does not point to the start of a byte sequence
        #   0x3C, 0x3F, 0x78, 0x6D, 0x6C (`<?xml`), then return failure.
        if (substr($s, $pos, 5) !== "<?xml") {
            return null;
        }
        # Let xmlDeclarationEnd be a pointer to the next byte in the input
        #   byte stream which is 0x3E (>). If there is no such byte,
        #   then return failure.
        $xmlDeclarationEnd = strpos($s, ">");
        if (!$xmlDeclarationEnd) {
            return null;
        }
        # Set encodingPosition to the position of the first occurrence of the
        #   subsequence of bytes 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E,
        #   0x67 (`encoding`) at or after the current encodingPosition. If
        #   there is no such sequence, then return failure.
        // NOTE: This is buggy; see https://github.com/whatwg/html/issues/6939
        $pos = strpos($s, "encoding");
        if ($pos === false || $pos > $xmlDeclarationEnd) {
            return null;
        }
        # Advance encodingPosition past the 0x67 (g) byte.
        $pos = $pos + strlen("encoding");
        // NOTE: "encoding" was found before the first ">", so that ">" byte
        //   still lies ahead and stops both skipping loops below before
        //   they can run off the end of the string
        # While the byte at encodingPosition is less than or equal to 0x20
        #   (i.e., it is either an ASCII space or control character),
        #   advance encodingPosition to the next byte.
        while (ord($s[$pos]) <= 0x20) {
            $pos++;
        }
        # If the byte at encodingPosition is not 0x3D (=), then return failure.
        // NOTE: This is also buggy: see https://github.com/whatwg/html/issues/7193
        if ($s[$pos++] !== "=") {
            return null;
        }
        # While the byte at encodingPosition is less than or equal to 0x20
        #   (i.e., it is either an ASCII space or control character),
        #   advance encodingPosition to the next byte.
        while (ord($s[$pos]) <= 0x20) {
            $pos++;
        }
        # Let quoteMark be the byte at encodingPosition.
        $quoteMark = $s[$pos];
        # If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
        if ($quoteMark !== "'" && $quoteMark !== '"') {
            return null;
        }
        # Advance encodingPosition to the next byte.
        $pos++;
        # Let encodingEndPosition be the position of the next occurrence of
        #   quoteMark at or after encodingPosition. If quoteMark does not
        #   occur again, then return failure.
        $encodingEndPosition = strpos($s, $quoteMark, $pos);
        if ($encodingEndPosition === false) {
            return null;
        }
        # Let potentialEncoding be the sequence of the bytes between
        #   encodingPosition (inclusive) and encodingEndPosition (exclusive).
        $potentialEncoding = substr($s, $pos, $encodingEndPosition - $pos);
        # If potentialEncoding contains one or more bytes whose byte value
        #   is 0x20 or below, then return failure.
        if (preg_match('/[\x{00}-\x{20}]/', $potentialEncoding)) {
            return null;
        }
        # Let encoding be the result of getting an encoding given
        #   potentialEncoding isomorphic decoded.
        // NOTE: Isomorphic decoding is not necessary since all encoding labels are ASCII
        $encoding = static::fromCharset($potentialEncoding);
        # If the encoding is UTF-16BE/LE, then change it to UTF-8.
        if ($encoding === "UTF-16LE" || $encoding === "UTF-16BE") {
            $encoding = "UTF-8";
        }
        # Return encoding.
        return $encoding;
    }

    /** Scans an attribute during the encoding detection pre-scan
     *
     * @param string $s The byte stream being scanned
     * @param int $pos The current position in the stream; this is advanced, by reference, as bytes are consumed
     * @return array An array with the keys 'name' and 'value' (both lowercased), or an empty array if no attribute was found or the stream ran out of bytes
     */
    protected static function getAttribute(string $s, &$pos): array {
        # When the prescan a byte stream to determine its encoding
        #   algorithm says to get an attribute, it means doing this:

        # If the byte at position is one of
        #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),
        #   or 0x2F (/) then advance position to the next byte and
        #   redo this step.
        while (in_array($s[$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"], true)) {
            $pos++;
        }
        // NOTE: An empty string stands for "out of bytes" throughout
        $char = $s[$pos] ?? "";

        # If the byte at position is 0x3E (>),
        #   then abort the get an attribute algorithm. There isn't one.
        if ($char === ">") {
            return [];
        }
        # Otherwise, the byte at position is the start of the attribute name.
        # Let attribute name and attribute value be the empty string.
        $name = "";
        $value = "";

        # Process the byte at position as follows:
        while ($char !== "") {
            # If it is 0x3D (=), and the attribute name is longer than the empty string
            if ($char === "=" && $name !== "") {
                # Advance position to the next byte and jump to the step below labeled value.
                $pos++;
                goto value;
            }
            # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
            elseif (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " "], true)) {
                goto spaces;
            }
            # If it is 0x2F (/) or 0x3E (>)
            elseif ($char === "/" || $char === ">") {
                # Abort the get an attribute algorithm.
                # The attribute's name is the value of attribute name, its value is the empty string.
                return ['name' => $name, 'value' => $value];
            }
            # If it is in the range 0x41 (A) to 0x5A (Z)
            # Anything else
            else {
                # Append the code point with the same value as the byte at position to attribute name.
                # (It doesn't actually matter how bytes outside the ASCII range are handled here,
                #   since only ASCII bytes can contribute to the detection of a character encoding.)

                // OPTIMIZATION: Also handle uppercase characters
                $name .= strtolower($char);
            }

            # Advance position to the next byte and return to the previous step.
            $char = $s[++$pos] ?? "";
        }

        // Out of bytes: the loop above only ends normally at the end of the stream
        return [];

        spaces:
        # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
        #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.
        while (in_array($s[$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " "], true)) {
            $pos++;
        }
        $char = $s[$pos] ?? "";
        if ($char === "") {
            // Out of bytes
            return [];
        }
        # If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
        # The attribute's name is the value of attribute name, its value is the empty string.
        if ($char !== "=") {
            return ['name' => $name, 'value' => $value];
        }
        # Advance position past the 0x3D (=) byte.
        $pos++;

        value:
        # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),
        #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.
        while (in_array($s[$pos] ?? "", ["\x09", "\x0A", "\x0C", "\x0D", " "], true)) {
            $pos++;
        }
        $char = $s[$pos] ?? "";
        if ($char === "") {
            // Out of bytes
            return [];
        }
        # Process the byte at position as follows:
        # If it is 0x22 (") or 0x27 (')
        if ($char === "'" || $char === '"') {
            # Let b be the value of the byte at position.
            $b = $char;
            # Quote loop: Advance position to the next byte.
            while (($char = $s[++$pos] ?? "") !== "") {
                # If the value of the byte at position is the value of b,
                #   then advance position to the next byte and abort
                #   the "get an attribute" algorithm.
                # The attribute's name is the value of attribute name,
                #   and its value is the value of attribute value.
                if ($char === $b) {
                    $pos++;
                    return ['name' => $name, 'value' => $value];
                }
                # Otherwise, append a code point to attribute value whose
                #   value is the same as the value of the byte at position.

                // OPTIMIZATION: Also handle uppercase characters
                $value .= strtolower($char);
            }
            // Out of bytes
            return [];
        }
        # If it is 0x3E (>)
        elseif ($char === ">") {
            # Abort the get an attribute algorithm.
            # The attribute's name is the value of attribute name,
            #   its value is the empty string.
            return ['name' => $name, 'value' => $value];
        }
        # Anything else
        else {
            # Append a code point with the same value as the byte at position to attribute value.
            # Advance position to the next byte.

            // OPTIMIZATION: Also handle uppercase characters
            $value .= strtolower($char);

            while (($char = $s[++$pos] ?? "") !== "") {
                # Process the byte at position as follows:
                # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
                if (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " ", ">"], true)) {
                    # Abort the get an attribute algorithm.
                    # The attribute's name is the value of attribute name
                    #   and its value is the value of attribute value.
                    return ['name' => $name, 'value' => $value];
                }
                # If it is in the range 0x41 (A) to 0x5A (Z)
                # Anything else
                else {
                    # Append a code point with the same value as
                    #   the byte at position to attribute value.
                    $value .= strtolower($char);
                }
            }
            // Out of bytes
            return [];
        }
    }
}
|
|
|