From 03a42a23439a662f861dbfaed2039f13244ea5be Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 1 Apr 2023 09:07:11 -0400 Subject: [PATCH] Move the creation of documents to separate functions This will make document creation easier to override for e.g. enhanced DOM implementations. --- lib/DOMParser.php | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/lib/DOMParser.php b/lib/DOMParser.php index 0422c64..421f8bb 100644 --- a/lib/DOMParser.php +++ b/lib/DOMParser.php @@ -50,14 +50,13 @@ XMLDECL; // parse the string as either HTML or XML if ($t->isHtml) { // for HTML we invoke our parser which has its own handling for everything - return Parser::parse($string, $type)->document; + return $this->createDocumentHtml($string, $type); } elseif ($t->isXml) { // for XML we have to jump through a few hoops to deal with encoding; // if we have a known encoding we want to make sure the XML parser // doesn't try to do its own detection. The only way to do this is // to convert to UTF-8 where necessary and remove any XML // declaration encoding information - $doc = new \DOMDocument(); try { // first check for a byte order mark; if one exists we can go straight to parsing if (!Encoding::sniffBOM($string)) { @@ -80,6 +79,9 @@ XMLDECL; // or ISO 2022-JP never contain non-ASCII characters // before encoding information is seen) if ($charset === "UTF-16BE" || $charset === "UTF-16LE") { + // NOTE: the transcoding operation may throw an + // exception, which is why this whole operation + // is wrapped in a try block $decoder = Encoding::createDecoder($charset, $string, true, false); $string = ""; while (strlen($c = $decoder->nextChar())) { @@ -100,16 +102,25 @@ XMLDECL; } } // parse the document - if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) { - throw new \Exception(libxml_get_last_error()->message); - } + return $this->createDocumentXml($string); } catch (\Exception $e) { - $doc->appendChild($doc->createElementNS("http://www.mozilla.org/newlayout/xml/parsererror.xml", "parserror")); - $doc->documentElement->appendChild($doc->createTextNode($e->getMessage())); + $string = "".htmlspecialchars($e->getMessage(), \ENT_NOQUOTES | \ENT_SUBSTITUTE | \ENT_XML1, "UTF-8").""; + return $this->createDocumentXml($string); } - return $doc; } else { throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type"); } } + + protected function createDocumentHtml(string $string, string $type): \DOMDocument { + return Parser::parse($string, $type)->document; + } + + protected function createDocumentXml(string $string): \DOMDocument { + $document = new \DOMDocument; + if (!$document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) { + throw new \Exception(libxml_get_last_error()->message); + } + return $document; + } } \ No newline at end of file