HTML-Parser/lib/Parser/Charset.php


								<?php

								/** @license MIT

								 * Copyright 2017 , Dustin Wilson, J. King et al.

								 * See LICENSE and AUTHORS files for details */


								declare(strict_types=1);

								namespace MensBeam\HTML\Parser;


								use MensBeam\Intl\Encoding;

								use MensBeam\Mime\MimeType;


								abstract class Charset {

								    /** Detects a Unicode byte order mark at the very start of a byte stream
								     *
								     * @param string $data The byte stream to inspect
								     * @return string|null "UTF-8", "UTF-16BE" or "UTF-16LE" according to the
								     *  mark found, or null if the stream does not begin with one
								     */
								    public static function fromBOM(string $data): ?string {
								        // Each recognized mark, keyed by the encoding it announces
								        $marks = [
								            "UTF-8"    => "\u{FEFF}",
								            "UTF-16BE" => "\xFE\xFF",
								            "UTF-16LE" => "\xFF\xFE",
								        ];
								        foreach ($marks as $encoding => $mark) {
								            // The leading bytes of the stream must equal the whole mark
								            if (substr($data, 0, strlen($mark)) === $mark) {
								                return $encoding;
								            }
								        }
								        return null;
								    }


								    /** Resolves an encoding label (e.g. "utf-8") to its canonical name
								     *
								     * @param string $value The encoding label to resolve
								     * @return string|null The 'name' entry of the match reported by
								     *  Encoding::matchLabel(), or null when that call yields a falsy result
								     */
								    public static function fromCharset(string $value): ?string {
								        $match = Encoding::matchLabel($value);
								        return $match ? $match['name'] : null;
								    }


								    /** Determines the encoding declared by an HTTP Content-Type header-field
								     *
								     * The header value is parsed as a MIME type and its "charset" parameter,
								     * if present, is resolved to a canonical encoding name.
								     *
								     * @param string $contentType The value of a Content-Type header-field
								     * @return string|null The canonical encoding name, or null when the value
								     *  cannot be parsed, has no charset parameter, or the label is not matched
								     */
								    public static function fromTransport(string $contentType): ?string {
								        $parsed = MimeType::parseBytes($contentType);
								        // Nothing to resolve without a parsed type carrying a charset parameter
								        if (!$parsed || !isset($parsed->params['charset'])) {
								            return null;
								        }
								        $match = Encoding::matchLabel($parsed->params['charset']);
								        return $match ? $match['name'] : null;
								    }


								    /** Interprets the value of an http-equiv Content-Type meta element
								     *
								     * Despite the name this uses a different algorithm than that used
								     * for parsing HTTP Content-Types.
								     *
								     * @param string $s The string to extract an encoding from, e.g. the
								     *  value of a meta element's "content" attribute
								     * @return string|null The canonical name of the extracted encoding, or
								     *  null if the string does not yield a recognized encoding
								     */
								    public static function fromMeta(string $s): ?string {
								        # The algorithm for extracting a character encoding from a meta element,
								        #   given a string s, is as follows.
								        # It either returns a character encoding or nothing.
								        $whitespace = ["\x09", "\x0A", "\x0C", "\x0D", " "];

								        # Let position be a pointer into s, initially pointing at the start of the string.
								        $pos = 0;
								        $end = strlen($s);

								        # Loop:
								        while ($pos < $end) {
								            # Find the first seven characters in s after position
								            #   that are an ASCII case-insensitive match for the word "charset".
								            # If no such match is found, return nothing.
								            $found = stripos($s, "charset", $pos);
								            if ($found === false) {
								                return null;
								            }
								            $pos = $found + 7;
								            # Skip any ASCII whitespace that immediately follow the word "charset"
								            #   (there might not be any).
								            // NOTE: Reading past the end of the string yields "" via the
								            //   null-coalescing operator, without raising a warning
								            while (in_array($s[$pos] ?? "", $whitespace, true)) {
								                $pos++;
								            }
								            # If the next character is not a U+003D EQUALS SIGN (=),
								            #   then move position to point just before that next
								            #   character, and jump back to the step labeled loop.
								            if (($s[$pos] ?? "") !== "=") {
								                continue;
								            }
								            # Skip any ASCII whitespace that immediately follow the equals sign
								            #   (there might not be any).
								            // The first iteration steps over the equals sign itself
								            do {
								                $pos++;
								            } while (in_array($s[$pos] ?? "", $whitespace, true));

								            # Process the next character as follows:
								            $char = $s[$pos] ?? "";

								            # If it is a U+0022 QUOTATION MARK character (")...
								            # If it is a U+0027 APOSTROPHE character (')...
								            if ($char === '"' || $char === "'") {
								                # ... and there is a later U+0022 QUOTATION MARK character (") in s
								                # ... and there is a later U+0027 APOSTROPHE character (') in s
								                $close = strpos($s, $char, $pos + 1);
								                # If it is an unmatched U+0022 QUOTATION MARK character (")
								                # If it is an unmatched U+0027 APOSTROPHE character (')
								                if ($close === false) {
								                    # Return nothing
								                    return null;
								                }
								                // The label is everything strictly between the two quotes
								                return self::fromCharset(substr($s, $pos + 1, $close - $pos - 1));
								            }
								            # There is no next character
								            elseif ($char === "") {
								                # Return nothing
								                return null;
								            }
								            # Anything else
								            else {
								                # Return the result of getting an encoding from the substring
								                #   that consists of this character up to but not including
								                #   the first ASCII whitespace or U+003B SEMICOLON (;)
								                #   character, or the end of s, whichever comes first.
								                $size = strcspn($s, "\x09\x0A\x0C\x0D ;", $pos);
								                return self::fromCharset(substr($s, $pos, $size));
								            }
								        }
								        // Position reached the end of s (empty input, or a trailing "charset"
								        //   not followed by an equals sign): no further match is possible.
								        //   Without this explicit return the function would end without a
								        //   value, which is a TypeError for a ?string return type
								        return null;
								    }


								    /** Inspects the head of an HTML string to guess its encoding

								     *

								     * Only the first $endAfter bytes of $data are examined. The scan looks

								     * for a UTF-16 XML declaration, then for a <meta> element declaring an

								     * encoding; if neither yields a result, the outcome of

								     * fromXMLDeclaration() for the same bytes is returned instead.

								     *

								     * @param string $data The HTML string to scan

								     * @param int $endAfter The number of bytes of the string to stop after

								     * @return string|null The name of the detected encoding, or null if none was found

								     */

								    public static function fromPrescan(string $data, int $endAfter = 1024): ?string {

								        # When an algorithm requires a user agent to prescan a byte stream

								        #   to determine its encoding, given some defined end condition,

								        #   then it must run the following steps.

								        # If at any point during these steps (including during instances

								        #   of the get an attribute algorithm invoked by this one) the

								        #   user agent either runs out of bytes (meaning the position

								        #   pointer created in the first step below goes beyond the end

								        #   of the byte stream obtained so far) or reaches its end condition,

								        #   then abort the prescan a byte stream to determine its encoding

								        #   algorithm and return the result get an XML encoding applied to

								        #   the same bytes that the prescan a byte stream to determine its

								        #   encoding algorithm was applied to. Otherwise, these steps will

								        #   return a character encoding.

								        // NOTE: The end condition is applied up front by truncating the input;

								        //   $endAfter is then reset to the number of bytes actually available

								        $s = substr($data, 0, $endAfter);

								        $endAfter = strlen($s);


								        # Let fallback encoding be null.

								        // NOTE: This is never used

								        # Let position be a pointer to a byte in the input byte stream,

								        #   initially pointing at the first byte.

								        $pos = 0;


								        # Prescan for UTF-16 XML declarations: If position points to:

								        # A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0 (case-sensitive UTF-16 little-endian '<?x')

								        if (substr($s, 0, 6) === "\x3C\x00\x3F\x00\x78\x00") {

								            # Return UTF-16LE.

								            return "UTF-16LE";

								        }

								        # A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78 (case-sensitive UTF-16 big-endian '<?x')

								        if (substr($s, 0, 6) === "\x00\x3C\x00\x3F\x00\x78") {

								            # Return UTF-16BE.

								            return "UTF-16BE";

								        }


								        # Loop: If position points to:

								        while ($pos < $endAfter) {

								            // OPTIMIZATION: Start by skipping anything not a less-than sign

								            if (@$s[$pos] === "<") {

								                // From here on position points at the byte after the less-than sign

								                $pos++;


								                # A sequence of bytes starting with: 0x3C 0x21 0x2D 0x2D (`<!--`)

								                if (@$s[$pos] === "!" && @$s[$pos + 1] === "-" && @$s[$pos + 2] === "-") {

								                    # Advance the position pointer so that it points at the

								                    #   first 0x3E byte which is preceded by two 0x2D bytes

								                    #   (i.e. at the end of an ASCII '-->' sequence) and

								                    #   comes after the 0x3C byte that was found. (The two

								                    #   0x2D bytes can be the same as those in the '<!--'

								                    #   sequence.)

								                    // NOTE: The search starts at the "!" so the dashes of the opener

								                    //   may be reused. If there is no terminator the position is

								                    //   moved past the end of the input, which ends the loop. The

								                    //   match offset cannot be zero here since position is at least 1

								                    $pos = (strpos($s, "-->", $pos) ?: $endAfter) + 3;

								                }

								                # A sequence of bytes starting with: 0x3C, 0x4D or 0x6D,

								                #   0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61, and one of

								                #   0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F (case-insensitive

								                #   ASCII '<meta' followed by a space or slash)

								                elseif (preg_match("<^meta[\x09\x0A\x0C\x0D /]$>i", substr($s, $pos, 5))) {

								                    # Advance the position pointer so that it points at

								                    #   the next 0x09, 0x0A, 0x0C, 0x0D, 0x20, or 0x2F

								                    #   byte (the one in sequence of characters matched above).

								                    // NOTE: This steps past the matched delimiter byte as well;

								                    //   getAttribute() skips leading whitespace and slashes anyway

								                    $pos += 5;

								                    # Let attribute list be an empty list of strings.

								                    # Let got pragma be false.

								                    # Let need pragma be null.

								                    # Let charset be the null value (which, for the purposes

								                    #   of this algorithm, is distinct from an unrecognized

								                    #   encoding or the empty string).

								                    $attrList = [];

								                    $gotPragma = false;

								                    $needPragma = null;

								                    $charset = null;


								                    # Attributes: Get an attribute and its value.

								                    # If no attribute was sniffed, then jump to the processing step below.

								                    // NOTE: getAttribute() advances $pos by reference and returns an

								                    //   empty (falsy) array when there is no further attribute

								                    while ($attr = self::getAttribute($s, $pos)) {

								                        # If the attribute's name is already in attribute list,

								                        #   then return to the step labeled attributes.

								                        if (isset($attrList[$attr['name']])) {

								                            continue;

								                        }

								                        # Add the attribute's name to attribute list.

								                        $attrList[$attr['name']] = true;

								                        # Run the appropriate step from the following list, if one applies:

								                        // NOTE: Names and values are lowercased by getAttribute(),

								                        //   so the comparisons below are effectively case-insensitive


								                        # If the attribute's name is "http-equiv"

								                        if ($attr['name'] === "http-equiv") {

								                            # If the attribute's value is "content-type", then set got pragma to true.

								                            if ($attr['value'] === "content-type") {

								                                $gotPragma = true;

								                            }

								                        }

								                        # If the attribute's name is "content"

								                        elseif ($attr['name'] === "content") {

								                            # Apply the algorithm for extracting a character encoding from a meta

								                            #   element, giving the attribute's value as the string to parse.

								                            # If a character encoding is returned, and if charset is still set to

								                            #   null, let charset be the encoding returned, and set need pragma to true.


								                            // OPTIMIZATION: Check if charset is null before performing the algorithm

								                            if ($charset === null && $candidate = self::fromMeta($attr['value'])) {

								                                $charset = $candidate;

								                                $needPragma = true;

								                            }

								                        }

								                        # If the attribute's name is "charset"

								                        elseif ($attr['name'] === "charset") {

								                            # Let charset be the result of getting an encoding from the attribute's

								                            #   value, and set need pragma to false.

								                            $candidate = self::fromCharset($attr['value']);

								                            $charset = $candidate ?? false; // false signifies 'failure'

								                            $needPragma = false;

								                        }

								                    }


								                    # Processing: If need pragma is null, then jump to the step below labeled next byte.

								                    # If need pragma is true but got pragma is false, then jump to the step below labeled next byte.

								                    // NOTE: Position is not advanced here; the outer loop resumes at

								                    //   the byte where getAttribute() stopped

								                    if ($needPragma === null || ($needPragma && !$gotPragma)) {

								                        continue;

								                    }

								                    # If charset is failure, then jump to the step below labeled next byte.

								                    if ($charset === false) {

								                        $pos++;

								                        continue;

								                    }

								                    # If charset is a UTF-16 encoding, then set charset to UTF-8.

								                    elseif ($charset === "UTF-16" || $charset === "UTF-16LE" || $charset === "UTF-16BE") {

								                        $charset = "UTF-8";

								                    }

								                    # If charset is x-user-defined, then set charset to windows-1252.

								                    elseif ($charset === "x-user-defined") {

								                        $charset = "windows-1252";

								                    }

								                    # Abort the prescan a byte stream to determine its encoding algorithm,

								                    #   returning the encoding given by charset.

								                    return $charset;

								                }

								                # A sequence of bytes starting with a 0x3C byte (<), optionally a 0x2F byte (/),

								                #   and finally a byte in the range 0x41-0x5A or 0x61-0x7A (A-Z or a-z)

								                elseif ((@$s[$pos] === "/" && ctype_alpha(@$s[$pos + 1])) || (ctype_alpha(@$s[$pos]))) {

								                    # Advance the position pointer so that it points at the next

								                    #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>) byte.

								                    // NOTE: The empty string in the list stops the scan at the end of input

								                    while (!in_array(@$s[++$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", ">", ""]));

								                    # Repeatedly get an attribute until no further attributes can be found,

								                    #   then jump to the step below labeled next byte.

								                    while(self::getAttribute($s, $pos));

								                }

								                # A sequence of bytes starting with: 0x3C 0x21 (`<!`)

								                # A sequence of bytes starting with: 0x3C 0x2F (`</`)

								                # A sequence of bytes starting with: 0x3C 0x3F (`<?`)

								                elseif (in_array(@$s[$pos], ["!", "/", "?"])) {

								                    # Advance the position pointer so that it points at the first

								                    #   0x3E byte (>) that comes after the 0x3C byte that was found.

								                    // NOTE: As with comments, a missing terminator moves the

								                    //   position past the end of the input

								                    $pos = (strpos($s, ">", $pos) ?: $endAfter) + 1;

								                }

								                // NOTE: If no branch matched, the byte after the less-than sign is

								                //   examined by the next iteration of the loop

								            }

								            # Any other byte

								            else {

								                # Do nothing with that byte.

								                $pos++;

								            }

								        }

								        // Out of bytes without a result: fall back to the XML declaration,

								        //   limited to the same number of bytes that were scanned here

								        return static::fromXMLDeclaration($data, $endAfter);

								    }


								    /** Extracts the encoding named by an XML declaration at the start of a string
								     *
								     * @param string $data The string to scan
								     * @param int $endAfter The number of bytes of the string to stop after
								     * @return string|null The canonical name of the declared encoding (with
								     *  UTF-16BE and UTF-16LE reported as UTF-8), or null on failure
								     */
								    protected static function fromXMLDeclaration(string $data, int $endAfter): ?string {
								        # When the prescan a byte stream to determine its encoding algorithm
								        #   is aborted without returning an encoding, get an XML encoding
								        #   means doing this.
								        $head = substr($data, 0, $endAfter);

								        # If the stream does not start with the byte sequence
								        #   0x3C, 0x3F, 0x78, 0x6D, 0x6C (`<?xml`), then return failure.
								        if (substr($head, 0, 5) !== "<?xml") {
								            return null;
								        }
								        # Let xmlDeclarationEnd be a pointer to the next byte in the input
								        #   byte stream which is 0x3E (>). If there is no such byte,
								        #   then return failure.
								        $declarationEnd = strpos($head, ">");
								        if (!$declarationEnd) {
								            return null;
								        }
								        # Find the first occurrence of the subsequence of bytes
								        #   0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67 (`encoding`).
								        #   If there is no such sequence, then return failure.
								        // NOTE: This is buggy; see https://github.com/whatwg/html/issues/6939
								        $keyword = strpos($head, "encoding");
								        if ($keyword === false || $keyword > $declarationEnd) {
								            return null;
								        }
								        # Advance encodingPosition past the 0x67 (g) byte.
								        $cursor = $keyword + strlen("encoding");
								        # Skip bytes less than or equal to 0x20 (ASCII space and control characters).
								        // NOTE: The 0x3E byte located above lies at or after the cursor and
								        //   is greater than 0x20, so these scans cannot run past the input
								        while (ord($head[$cursor]) <= 0x20) {
								            $cursor++;
								        }
								        # If the byte at encodingPosition is not 0x3D (=), then return failure.
								        // NOTE: This is also buggy: see https://github.com/whatwg/html/issues/7193
								        if ($head[$cursor] !== "=") {
								            return null;
								        }
								        $cursor++;
								        # Again skip bytes less than or equal to 0x20.
								        while (ord($head[$cursor]) <= 0x20) {
								            $cursor++;
								        }
								        # Let quoteMark be the byte at encodingPosition.
								        # If quoteMark is not either 0x22 (") or 0x27 ('), then return failure.
								        $quote = $head[$cursor];
								        if ($quote !== "'" && $quote !== '"') {
								            return null;
								        }
								        # Advance encodingPosition to the next byte.
								        $cursor++;
								        # Let encodingEndPosition be the position of the next occurrence of
								        #   quoteMark at or after encodingPosition. If quoteMark does not
								        #   occur again, then return failure.
								        $closing = strpos($head, $quote, $cursor);
								        if ($closing === false) {
								            return null;
								        }
								        # Let potentialEncoding be the sequence of the bytes between
								        #   encodingPosition (inclusive) and encodingEndPosition (exclusive).
								        $label = substr($head, $cursor, $closing - $cursor);
								        # If potentialEncoding contains one or more bytes whose byte value
								        #   is 0x20 or below, then return failure.
								        if (preg_match('/[\x{00}-\x{20}]/', $label)) {
								            return null;
								        }
								        # Let encoding be the result of getting an encoding given
								        #   potentialEncoding isomorphic decoded.
								        // NOTE: Isomorphic decoding is not necessary since all encoding labels are ASCII
								        $encoding = static::fromCharset($label);
								        # If the encoding is UTF-16BE/LE, then change it to UTF-8.
								        # Return encoding.
								        return ($encoding === "UTF-16LE" || $encoding === "UTF-16BE") ? "UTF-8" : $encoding;
								    }


								    /** Scans an attribute during the encoding detection pre-scan

								     *

								     * Reading starts at $pos, which is advanced (by reference) as bytes are

								     * consumed. Both the attribute name and its value are lowercased.

								     *

								     * @param string $s The byte string being scanned

								     * @param int $pos The current position in $s; updated as the scan proceeds

								     * @return array An array with 'name' and 'value' keys, or an empty array

								     *  if there is no attribute at the position (a ">" was reached) or the

								     *  input ended before the attribute was complete

								     */

								    protected static function getAttribute(string $s, &$pos): array {

								        # When the prescan a byte stream to determine its encoding

								        #   algorithm says to get an attribute, it means doing this:


								        # If the byte at position is one of

								        #   0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP),

								        #   or 0x2F (/) then advance position to the next byte and

								        #   redo this step.

								        // NOTE: Reads past the end of the string are silenced and compared

								        //   against the empty string throughout this method

								        while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) {

								            $pos++;

								        }

								        $char = @$s[$pos];


								        # If the byte at position is 0x3E (>),

								        #   then abort the get an attribute algorithm. There isn't one.

								        if ($char === ">") {

								            return [];

								        }

								        # Otherwise, the byte at position is the start of the attribute name.

								        #  Let attribute name and attribute value be the empty string.

								        $name = "";

								        $value = "";


								        # Process the byte at position as follows:

								        while ($char !== "") {

								            # If it is 0x3D (=), and the attribute name is longer than the empty string

								            if ($char === "=" && $name !== "") {

								                # Advance position to the next byte and jump to the step below labeled value.

								                $pos++;

								                goto value;

								            }

								            # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)

								            elseif (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " "])) {

								                goto spaces;

								            }

								            # If it is 0x2F (/) or 0x3E (>)

								            elseif ($char === "/" || $char === ">") {

								                # Abort the get an attribute algorithm.

								                # The attribute's name is the value of attribute name, its value is the empty string.

								                return ['name' => $name, 'value' => $value];

								            }

								            # If it is in the range 0x41 (A) to 0x5A (Z)

								            # Anything else

								            else {

								                # Append the code point with the same value as the byte at position to attribute name.

								                # (It doesn't actually matter how bytes outside the ASCII range are handled here,

								                #    since only ASCII bytes can contribute to the detection of a character encoding.)


								                // OPTIMIZATION: Also handle uppercase characters

								                $name .= strtolower($char);

								            }


								            # Advance position to the next byte and return to the previous step.

								            $char = @$s[++$pos];

								        }


								        // NOTE: The loop above is only left without a jump or return once

								        //   the input is exhausted, so this condition always holds here

								        if ($char === "") {

								            // Out of bytes

								            return [];

								        }


								        spaces:

								        #  If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),

								        #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.

								        while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])) {

								            $pos++;

								        }

								        $char = @$s[$pos];

								        if ($char === "") {

								            // Out of bytes

								            return [];

								        }

								        # If the byte at position is not 0x3D (=), abort the get an attribute algorithm.

								        # The attribute's name is the value of attribute name, its value is the empty string.

								        if ($char !== "=") {

								            return ['name' => $name, 'value' => $value];

								        }

								        # Advance position past the 0x3D (=) byte.

								        // NOTE: Only the increment matters; $char is read again below

								        $char = @$s[++$pos];


								        value:

								        # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR),

								        #   or 0x20 (SP) then advance position to the next byte, then, repeat this step.

								        while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])) {

								            $pos++;

								        }

								        $char = @$s[$pos];

								        if ($char === "") {

								            // Out of bytes

								            return [];

								        }

								        # Process the byte at position as follows:

								        # If it is 0x22 (") or 0x27 (')

								        if ($char === "'" || $char === '"') {

								            # Let b be the value of the byte at position.

								            $b = $char;

								            # Quote loop: Advance position to the next byte.

								            while (($char = @$s[++$pos]) !== "") {

								                # If the value of the byte at position is the value of b,

								                #   then advance position to the next byte and abort

								                #   the "get an attribute" algorithm.

								                # The attribute's name is the value of attribute name,

								                #   and its value is the value of attribute value.

								                if ($char === $b) {

								                    $pos++;

								                    return ['name' => $name, 'value' => $value];

								                }

								                # Otherwise, append a code point to attribute value whose

								                #   value is the same as the value of the byte at position.


								                // OPTIMIZATION: Also handle uppercase characters

								                $value .= strtolower($char);

								            }

								            // Out of bytes

								            return [];

								        }

								        # If it is 0x3E (>)

								        elseif ($char === ">") {

								            # Abort the get an attribute algorithm.

								            # The attribute's name is the value of attribute name,

								            #   its value is the empty string.

								            return ['name' => $name, 'value' => $value];

								        }

								        # Anything else

								        else {

								            # Append a code point with the same value as the byte at position to attribute value.

								            # Advance position to the next byte.


								            // OPTIMIZATION: Also handle uppercase characters

								            $value .= strtolower($char);


								            // NOTE: On a terminating byte the position is left pointing at it,

								            //   so the caller sees the whitespace or ">" next

								            while (($char = @$s[++$pos]) !== "") {

								                # Process the byte at position as follows:

								                # If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)

								                if (in_array($char, ["\x09", "\x0A", "\x0C", "\x0D", " ", ">"])) {

								                    # Abort the get an attribute algorithm.

								                    # The attribute's name is the value of attribute name

								                    #   and its value is the value of attribute value.

								                    return ['name' => $name, 'value' => $value];

								                }

								                # If it is in the range 0x41 (A) to 0x5A (Z)

								                # Anything else

								                else {

								                    # Append a code point with the same value as

								                    #   the byte at position to attribute value.

								                    $value .= strtolower($char);

								                }

								            }

								            // Out of bytes

								            return [];

								        }

								    }

								}