diff --git a/lib/DOMParser.php b/lib/DOMParser.php index baad496..0984dff 100644 --- a/lib/DOMParser.php +++ b/lib/DOMParser.php @@ -50,9 +50,34 @@ class DOMParser { $config = new Parser\Config; $config->encodingFallback = "UTF-8"; $config->encodingPrescanBytes = 0; - return Parser::parse($string, $charset, $config); + return Parser::parse($string, $charset, $config)->document; } else { // for XML we have to jump through a few hoops to make sure the DOMDocument doesn't make a hash of things, or try to detect encoding + $doc = new \DOMDocument(); + try { + if ($charset !== "UTF-8") { + // transcode the string to UTF-8 where necessary + $decoder = Encoding::createDecoder($charset, $string, true, false); + $string = ""; + while (strlen($c = $decoder->nextChar())) { + $string .= $c; + $string .= $decoder->asciiSpanNot(""); + } + unset($decoder); + } + // add a byte-order mark if the string doesn't have one; this serves as an authoritative encoding specifier + if (substr($string, 0, 3) !== "\xEF\xBB\xBF") { + $string = "\xEF\xBB\xBF".$string; + } + // parse the document + if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) { + throw new \Exception(libxml_get_last_error()->message); + } + } catch (\Exception $e) { + $doc->appendChild($doc->createElementNS("http://www.mozilla.org/newlayout/xml/parsererror.xml", "parserror")); + $doc->documentElement->appendChild($doc->createTextNode($e->getMessage())); + } + return $doc; } } } \ No newline at end of file