Browse Source

Allow encoding detection in DOMParser

For HTML the HTML parser's machinery is used; for XML we implement our
own logic for known encodings
domparser
J. King 1 year ago
parent
commit
9dcc379950
  1. 101
      lib/DOMParser.php

101
lib/DOMParser.php

@ -9,65 +9,78 @@ namespace MensBeam\HTML;
use MensBeam\Mime\MimeType;
use MensBeam\Intl\Encoding;
/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML. */
/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML */
class DOMParser {
protected const TYPES = [
"text/html",
"text/xml",
"application/xml",
"application/xhtml+xml",
"image/svg+xml"
];
/** @var string A UTF-8 byte order mark */
protected const BOM_UTF8 = "\xEF\xBB\xBF";
/** @var string A UTF-16 (big-endian) byte order mark */
protected const BOM_UTF16BE = "\xFE\xFF";
/** @var string A UTF-16 (little-endian) byte order mark */
protected const BOM_UTF16LE = "\xFF\xFE";
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`.
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`
*
* `$type` can be `"text/html"` (which will invoke the HTML parser), or any of `"text/xml"`, `"application/xml"`,
* `"application/xhtml+xml"`, or `"image/svg+xml"` (which will invoke the XML parser).
* `$type` can be `"text/html"` (which will invoke the HTML parser), or
* any XML type (which will invoke the XML parser). A `charset` parameter
* may be included to specify the document encoding; otherwise encoding
* will be detected from document hints. This differs from the standard
* interface which only accepts certain XML types, and requires Unicode
* characters rather than bytes as input, obviating the need for encoding
* detection
*
* For the XML parser, if `$string` cannot be parsed, then the returned `DOMDocument` will contain elements describing the resulting error.
* For the XML parser, if `$string` cannot be parsed, then the returned
* `DOMDocument` will contain elements describing the resulting error
*
* Note that script elements are not evaluated during parsing, and the resulting document's encoding will always be UTF-8.
*
* Values other than the above for `$type` will cause an `InvalidArgumentException` exception to be thrown.
*
* Since PHP strings are bytes, `$type` may include a `charset` parameter. If no parameter is supplied, UTF-8 is assumed.
* If no encoding is specified and none can be detected from the document,
* the default encoding is Windows-1252 for HTML and UTF-8 for XML
*/
public function parseFromString(string $string, string $type): \DOMDocument {
// start by parsing the type
$t = MimeType::parseBytes($type);
if (!in_array($t->essence, self::TYPES)) {
throw new \InvalidArgumentException("\$type must be one of ".implode(", ", self::TYPES));
}
$charset = $t->params['charset'] ?? "UTF-8";
$encoding = Encoding::matchLabel($charset);
if (!$encoding) {
throw new \InvalidArgumentException("Specified charset is not supported");
if (!$t->isHtml && !$t->isXml) {
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type"));
}
$charset = $encoding['name'];
// parse the string as either HTML or XML
if ($t->essence === "text/html") {
// for HTML we invoke our parser
$config = new Parser\Config;
$config->encodingFallback = "UTF-8";
$config->encodingPrescanBytes = 0;
return Parser::parse($string, $charset, $config)->document;
if ($t->isHtml) {
// for HTML we invoke our parser which has its own handling for everything
return Parser::parse($string, $type)->document;
} else {
// for XML we have to jump through a few hoops to make sure the DOMDocument doesn't make a hash of things, or try to detect encoding
// for XML we have to jump through a few hoops to deal with encoding;
// if we have a known encoding we want to make sure the XML parser
// doesn't try to do its own detection. The best way to do this is
// to add a Unicode byte order mark if the string doesn't have one
$doc = new \DOMDocument();
try {
if ($charset !== "UTF-8") {
// transcode the string to UTF-8 where necessary
$decoder = Encoding::createDecoder($charset, $string, true, false);
$string = "";
while (strlen($c = $decoder->nextChar())) {
$string .= $c;
$string .= $decoder->asciiSpanNot("");
// first check for a byte order mark; if one exists we can go straight to parsing
if (!Encoding::sniffBOM($string)) {
// check the type for a charset parameter if there is no BOM
$charset = $t->params['charset'] ?? "";
if ($charset) {
$encoding = Encoding::matchLabel($charset);
if (!$encoding) {
throw new \InvalidArgumentException("Specified charset is not supported");
}
$charset = $encoding['name'];
}
if ($charset) {
// if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one
if ($charset === "UTF-8") {
$string = self::BOM_UTF8.$string;
} elseif ($charset === "UTF-16BE") {
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
$string = self::BOM_UTF16LE.$string;
} else {
// transcode the string to UTF-8 with a BOM where the string's encoding cannot include a BOM
$decoder = Encoding::createDecoder($charset, $string, true, false);
$string = self::BOM_UTF8;
while (strlen($c = $decoder->nextChar())) {
$string .= $c;
$string .= $decoder->asciiSpanNot("");
}
unset($decoder);
}
}
unset($decoder);
}
// add a byte-order mark if the string doesn't have one; this serves as an authoritative encoding specifier
if (substr($string, 0, 3) !== "\xEF\xBB\xBF") {
$string = "\xEF\xBB\xBF".$string;
}
// parse the document
if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {

Loading…
Cancel
Save