|
|
@ -9,65 +9,78 @@ namespace MensBeam\HTML; |
|
|
|
use MensBeam\Mime\MimeType; |
|
|
|
use MensBeam\Intl\Encoding; |
|
|
|
|
|
|
|
/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML. */ |
|
|
|
/** The DOMParser interface allows authors to create new DOMDocument objects by parsing strings, as either HTML or XML */ |
|
|
|
class DOMParser { |
|
|
|
protected const TYPES = [ |
|
|
|
"text/html", |
|
|
|
"text/xml", |
|
|
|
"application/xml", |
|
|
|
"application/xhtml+xml", |
|
|
|
"image/svg+xml" |
|
|
|
]; |
|
|
|
/** @var A UTF-8 byte order mark */ |
|
|
|
protected const BOM_UTF8 = "\xEF\xBB\xBF"; |
|
|
|
/** @var A UTF-16 (big-endian) byte order mark */ |
|
|
|
protected const BOM_UTF16BE = "\xFE\xFF"; |
|
|
|
/** @var A UTF-16 (little-endian) byte order mark */ |
|
|
|
protected const BOM_UTF16LE = "\xFF\xFE"; |
|
|
|
|
|
|
|
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`. |
|
|
|
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument` |
|
|
|
* |
|
|
|
* `$type` can be `"text/html"` (which will invoke the HTML parser), or any of `"text/xml"`, `"application/xml"`, |
|
|
|
* `"application/xhtml+xml"`, or `"image/svg+xml"` (which will invoke the XML parser). |
|
|
|
* `$type` can be `"text/html"` (which will invoke the HTML parser), or |
|
|
|
* any XML type (which will invoke the XML parser). A `charset` parameter |
|
|
|
* may be included to specify the document encoding; otherwise encoding |
|
|
|
* will be detected from document hints. This differs from the standard |
|
|
|
* interface which only accepts certain XML types, and requires Unicode |
|
|
|
* characters rather than bytes as input, obviating the need for encoding |
|
|
|
* detection |
|
|
|
* |
|
|
|
* For the XML parser, if `$string` cannot be parsed, then the returned `DOMDocument` will contain elements describing the resulting error. |
|
|
|
* For the XML parser, if `$string` cannot be parsed, then the returned |
|
|
|
* `DOMDocument` will contain elements describing the resulting error |
|
|
|
* |
|
|
|
* Note that script elements are not evaluated during parsing, and the resulting document's encoding will always be UTF-8. |
|
|
|
* |
|
|
|
* Values other than the above for `$type` will cause an `InvalidArgumentException` exception to be thrown. |
|
|
|
* |
|
|
|
* Since PHP strings are bytes, `$type` may include a `charset` parameter. If no parameter is supplied UTF-8 is assumed. |
|
|
|
* If no encoding is specified and none can be detected from the document, |
|
|
|
* the default encoding is Windows-1252 for HTML and UTF-8 for XML |
|
|
|
*/ |
|
|
|
public function parseFromString(string $string, string $type): \DOMDocument { |
|
|
|
// start by parsing the type |
|
|
|
$t = MimeType::parseBytes($type); |
|
|
|
if (!in_array($t->essence, self::TYPES)) { |
|
|
|
throw new \InvalidArgumentException("\$type must be one of ".implode(", ", self::TYPES)); |
|
|
|
} |
|
|
|
$charset = $t->params['charset'] ?? "UTF-8"; |
|
|
|
$encoding = Encoding::matchLabel($charset); |
|
|
|
if (!$encoding) { |
|
|
|
throw new \InvalidArgumentException("Specified charset is not supported"); |
|
|
|
if (!$t->isHtml && !$t->isXml) { |
|
|
|
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type")); |
|
|
|
} |
|
|
|
$charset = $encoding['name']; |
|
|
|
// parse the string as either HTML or XML |
|
|
|
if ($t->essence === "text/html") { |
|
|
|
// for HTML we invoke our parser |
|
|
|
$config = new Parser\Config; |
|
|
|
$config->encodingFallback = "UTF-8"; |
|
|
|
$config->encodingPrescanBytes = 0; |
|
|
|
return Parser::parse($string, $charset, $config)->document; |
|
|
|
if ($t->isHtml) { |
|
|
|
// for HTML we invoke our parser which has its own handling for everything |
|
|
|
return Parser::parse($string, $type)->document; |
|
|
|
} else { |
|
|
|
// for XML we have to jump through a few hoops to make sure the DOMDocument doesn't make a hash of things, or try to detect encoding |
|
|
|
// for XML we have to jump through a few hoops to deal with encoding; |
|
|
|
// if we have a known encoding we want to make sure the XML parser |
|
|
|
// doesn't try to do its own detection. The best way to do this is |
|
|
|
// to add a Unicode byte order mark if the string doesn't have one |
|
|
|
$doc = new \DOMDocument(); |
|
|
|
try { |
|
|
|
if ($charset !== "UTF-8") { |
|
|
|
// transcode the string to UTF-8 where necessary |
|
|
|
$decoder = Encoding::createDecoder($charset, $string, true, false); |
|
|
|
$string = ""; |
|
|
|
while (strlen($c = $decoder->nextChar())) { |
|
|
|
$string .= $c; |
|
|
|
$string .= $decoder->asciiSpanNot(""); |
|
|
|
// first check for a byte order mark; if one exists we can go straight to parsing |
|
|
|
if (!Encoding::sniffBOM($string)) { |
|
|
|
// check the type for a charset parameter if there is no BOM |
|
|
|
$charset = $t->params['charset'] ?? ""; |
|
|
|
if ($charset) { |
|
|
|
$encoding = Encoding::matchLabel($charset); |
|
|
|
if (!$encoding) { |
|
|
|
throw new \InvalidArgumentException("Specified charset is not supported"); |
|
|
|
} |
|
|
|
$charset = $encoding['name']; |
|
|
|
} |
|
|
|
if ($charset) { |
|
|
|
// if the string is known to be UTF-8 or UTF-16 according to the type but has no BOM, add one |
|
|
|
if ($charset === "UTF-8") { |
|
|
|
$string = self::BOM_UTF8.$string; |
|
|
|
} elseif ($charset === "UTF-16BE") { |
|
|
|
$string = self::BOM_UTF16BE.$string; |
|
|
|
} elseif ($charset === "UTF-16LE") { |
|
|
|
$string = self::BOM_UTF16LE.$string; |
|
|
|
} else { |
|
|
|
// transcode the string to UTF-8 with a BOM where the string's encoding cannot include a BOM |
|
|
|
$decoder = Encoding::createDecoder($charset, $string, true, false); |
|
|
|
$string = self::BOM_UTF8; |
|
|
|
while (strlen($c = $decoder->nextChar())) { |
|
|
|
$string .= $c; |
|
|
|
$string .= $decoder->asciiSpanNot(""); |
|
|
|
} |
|
|
|
unset($decoder); |
|
|
|
} |
|
|
|
} |
|
|
|
unset($decoder); |
|
|
|
} |
|
|
|
// add a byte-order mark if the string doesn't have one; this serves as an authoritative encoding specifier |
|
|
|
if (substr($string, 0, 3) !== "\xEF\xBB\xBF") { |
|
|
|
$string = "\xEF\xBB\xBF".$string; |
|
|
|
} |
|
|
|
// parse the document |
|
|
|
if (!$doc->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) { |
|
|
|