From 9dcc37995083ef00781b039c607f9941baebc902 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 30 Mar 2023 10:50:15 -0400 Subject: [PATCH] Allow encoding detection in DOMParser For HTML the HTML parser's machinery is used; for XML we implement our own logic for known encodings --- lib/DOMParser.php | 101 ++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/lib/DOMParser.php b/lib/DOMParser.php index 0984dff..9af7d02 100644 --- a/lib/DOMParser.php +++ b/lib/DOMParser.php @@ -9,65 +9,78 @@ namespace MensBeam\HTML; use MensBeam\Mime\MimeType; use MensBeam\Intl\Encoding; -/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML. */ +/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML */ class DOMParser { - protected const TYPES = [ - "text/html", - "text/xml", - "application/xml", - "application/xhtml+xml", - "image/svg+xml" - ]; + /** @var A UTF-8 byte order mark */ + protected const BOM_UTF8 = "\xEF\xBB\xBF"; + /** @var A UTF-16 (big-endian) byte order mark */ + protected const BOM_UTF16BE = "\xFE\xFF"; + /** @var A UTF-16 (little-endian) byte order mark */ + protected const BOM_UTF16LE = "\xFF\xFE"; - /** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`. + /** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument` * - * `$type` can be `"text/html"` (which will invoke the HTML parser), or any of `"text/xml"`, `"application/xml"`, - * `"application/xhtml+xml"`, or `"image/svg+xml"` (which will invoke the XML parser). + * `$type` can be `"text/html"` (which will invoke the HTML parser), or + * any XML type (which will invoke the XML parser). A `charset` parameter + * may be included to specify the document encoding; otherwise encoding + * will be detected from document hints. This differs from the standard + * interface which only accepts certain XML types, and requires Unicode + * characters rather than bytes as input, obviating the need for encoding + * detection * - * For the XML parser, if `$string` cannot be parsed, then the returned `DOMDocument` will contain elements describing the resulting error. + * For the XML parser, if `$string` cannot be parsed, then the returned + * `DOMDocument` will contain elements describing the resulting error * - * Note that script elements are not evaluated during parsing, and the resulting document's encoding will always be UTF-8. - * - * Values other than the above for `$type` will cause an `InvalidArgumentException` exception to be thrown. - * - * Since PHP strings are bytes, `$type` may include a `charset` parameter. If no parameter is is supplied UTF-8 is assumed. + * If no encoding is specified and none can be detected from the document, + * the default encoding is Windows-1252 for HTML and UTF-8 for XML */ public function parseFromString(string $string, string $type): \DOMDocument { // start by parsing the type $t = MimeType::parseBytes($type); - if (!in_array($t->essence, self::TYPES)) { - throw new \InvalidArgumentException("\$type must be one of ".implode(", ", self::TYPES)); - } - $charset = $t->params['charset'] ?? "UTF-8"; - $encoding = Encoding::matchLabel($charset); - if (!$encoding) { - throw new \InvalidArgumentException("Specified charset is not supported"); + if (!$t->isHtml && !$t->isXml) { + throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type")); } - $charset = $encoding['name']; // parse the string as either HTML or XML - if ($t->essence === "text/html") { - // for HTML we invoke our parser - $config = new Parser\Config; - $config->encodingFallback = "UTF-8"; - $config->encodingPrescanBytes = 0; - return Parser::parse($string, $charset, $config)->document; + if ($t->isHtml) { + // for HTML we invoke our parser which has its own handling for everything + return Parser::parse($string, $type)->document; } else { - // for XML we have to jump through a few hoops to make sure the DOMDocument doesn't make a hash of things, or try to detect encoding + // for XML we have to jump through a few hoops to deal with encoding; + // if we have a known encoding we want to make sure the XML parser + // doesn't try to do its own detection. The best way to do this is + // to add a Unicode byte order mark if the string doesn't have one $doc = new \DOMDocument(); try { - if ($charset !== "UTF-8") { - // transcode the string to UTF-8 where necessary - $decoder = Encoding::createDecoder($charset, $string, true, false); - $string = ""; - while (strlen($c = $decoder->nextChar())) { - $string .= $c; - $string .= $decoder->asciiSpanNot(""); + // first check for a byte order mark; if one exists we can go straight to parsing + if (!Encoding::sniffBOM($string)) { + // check the type for a charset parameter if there is no BOM + $charset = $t->params['charset'] ?? ""; + if ($charset) { + $encoding = Encoding::matchLabel($charset); + if (!$encoding) { + throw new \InvalidArgumentException("Specified charset is not supported"); + } + $charset = $encoding['name']; + } + if ($charset) { + // if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one + if ($charset === "UTF-8") { + $string = self::BOM_UTF8.$string; + } elseif ($charset === "UTF-16BE") { + $string = self::BOM_UTF16BE.$string; + } elseif ($charset === "UTF-16LE") { + $string = self::BOM_UTF16LE.$string; + } else { + // transcode the string to UTF-8 with a BOM where the string's encoding cannot include a BOM + $decoder = Encoding::createDecoder($charset, $string, true, false); + $string = self::BOM_UTF8; + while (strlen($c = $decoder->nextChar())) { + $string .= $c; + $string .= $decoder->asciiSpanNot(""); + } + unset($decoder); + } } - unset($decoder); - } - // add a byte-order mark if the string doesn't have one; this serves as an authoritative encoding specifier - if (substr($string, 0, 3) !== "\xEF\xBB\xBF") { - $string = "\xEF\xBB\xBF".$string; } // parse the document if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {