Browse Source

Fall back on XML parser in cases of unsupported encodings

domparser
J. King 1 year ago
parent
commit
2b39319894
  1. 15
      lib/DOMParser.php

15
lib/DOMParser.php

@@ -37,14 +37,11 @@ class DOMParser {
public function parseFromString(string $string, string $type): \DOMDocument { public function parseFromString(string $string, string $type): \DOMDocument {
// start by parsing the type // start by parsing the type
$t = MimeType::parseBytes($type); $t = MimeType::parseBytes($type);
if (!$t->isHtml && !$t->isXml) {
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
}
// parse the string as either HTML or XML // parse the string as either HTML or XML
if ($t->isHtml) { if ($t->isHtml) {
// for HTML we invoke our parser which has its own handling for everything // for HTML we invoke our parser which has its own handling for everything
return Parser::parse($string, $type)->document; return Parser::parse($string, $type)->document;
} else { } elseif ($t->isXml) {
// for XML we have to jump through a few hoops to deal with encoding; // for XML we have to jump through a few hoops to deal with encoding;
// if we have a known encoding we want to make sure the XML parser // if we have a known encoding we want to make sure the XML parser
// doesn't try to do its own detection. The best way to do this is // doesn't try to do its own detection. The best way to do this is
@@ -57,11 +54,13 @@ class DOMParser {
$charset = $t->params['charset'] ?? ""; $charset = $t->params['charset'] ?? "";
if ($charset) { if ($charset) {
$encoding = Encoding::matchLabel($charset); $encoding = Encoding::matchLabel($charset);
if (!$encoding) { if ($encoding) {
throw new \InvalidArgumentException("Specified charset is not supported"); $charset = $encoding['name'];
} }
$charset = $encoding['name'];
} }
// if a supported encoding was parsed from the type, act
// accordingly; otherwise skip to parsing and let the
// XML parser detect encoding
if ($charset) { if ($charset) {
// if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one // if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one
if ($charset === "UTF-8") { if ($charset === "UTF-8") {
@@ -91,6 +90,8 @@ class DOMParser {
$doc->documentElement->appendChild($doc->createTextNode($e->getMessage())); $doc->documentElement->appendChild($doc->createTextNode($e->getMessage()));
} }
return $doc; return $doc;
} else {
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
} }
} }
} }
Loading…
Cancel
Save