@ -24,9 +24,35 @@ class DOMParser {
(\s+version=(?:"1\.[0-9]+"|'1\.[0-9]+'))
(?:\s+encoding=("[A-Za-z][A-Za-z0-9\._\-]*"|'[A-Za-z][A-Za-z0-9\._\-]*'))?
(\s+standalone=(?:"yes"|"no"|'yes'|'no'))?
( \s*) \?>
\s*\?>
/sx
XMLDECL;
/**
 * Standard encoding labels which DOMDocument cannot be trusted with
 *
 * Each entry is a label that DOMDocument either does not know at all or
 * does not map to the correct encoding. The list is a worst case, taken
 * from PHP 5.6 on Windows; some encodings which are completely
 * unsupported have been excluded.
 *
 * @var array
 */
const ENCODING_NAUGHTY_LIST = [
    'unicode-1-1-utf-8', 'unicode11utf8', 'unicode20utf8', 'x-unicode20utf8',
    'iso88592', 'iso88593', 'iso88594', 'iso88595',
    'csiso88596e', 'csiso88596i', 'iso-8859-6-e', 'iso-8859-6-i',
    'iso88596', 'iso88597', 'sun_eu_greek', 'csiso88598e',
    'iso-8859-8-e', 'iso88598', 'visual', 'csiso88598i',
    'iso-8859-8-i', 'logical', 'iso885910', 'iso885913',
    'iso885914', 'csisolatin9', 'iso885915', 'l9',
    'koi', 'koi8', 'koi8_r', 'x-mac-roman',
    'dos-874', 'iso-8859-11', 'iso8859-11', 'iso885911',
    'tis-620', 'x-cp1250', 'x-cp1251', 'ansi_x3.4-1968',
    'ascii', 'cp819', 'csisolatin1', 'ibm819',
    'iso-8859-1', 'iso-ir-100', 'iso8859-1', 'iso88591',
    'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1',
    'us-ascii', 'x-cp1252', 'x-cp1253', 'iso88599',
    'x-cp1254', 'x-cp1255', 'x-cp1256', 'x-cp1257',
    'cp1258', 'windows-1258', 'x-mac-ukrainian', 'chinese',
    'csgb2312', 'csiso58gb231280', 'gb2312', 'gb_2312',
    'gb_2312-80', 'gbk', 'iso-ir-58', 'big5',
    'cn-big5', 'csbig5', 'x-x-big5', 'x-euc-jp',
    'ms932', 'windows-31j', 'x-sjis', 'cseuckr',
    'euc-kr', 'replacement',
];
/**
 * Canonical encoding names DOMDocument does not understand, each keyed
 * to an alias label which it does understand
 *
 * @var array
 */
const ENCODING_ALIAS_MAP = [
    'windows-1258' => 'x-cp1258',
    'GBK'          => 'x-gbk',
    'Big5'         => 'big5-hkscs',
    'EUC-KR'       => 'korean',
];
/** Parses `$string` using either the HTML or XML parser, according to `$type`, and returns the resulting `DOMDocument`
*
@ -59,50 +85,50 @@ XMLDECL;
// can add BOMs to UTF-16 documents, but for other encodings we
// must parse XML declarations and validate that any encoding
// declaration is correct and change it if it is incorrect
// this process is further complicated by libxml not understanding
// all labels from the Encoding specification (which we try to
// honour since it can be assumed to be a best practice), so we
// must also rewrite some encoding declarations
try {
// first check for a byte order mark; if one exists we can go straight to parsing
if (!Encoding::sniffBOM($string)) {
// check the type for a charset parameter if there is no BOM
$charset = $t->params['charset'] ?? "";
if ($charset) {
if ($encoding = Encoding::matchLabel($charset)) {
$charset = $encoding['name'];
}
// otherwise determine the embedded encoding of the document
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
$match[2] = ($match[2] ?? "") ?: '"utf-8"'; // declaration without encoding is UTF-8
$xmlDeclaration = $match[0];
$xmlVersion = $match[1];
$xmlEncoding = substr($match[2], 1, strlen($match[2]) - 2);
$xmlStandalone = $match[3] ?? "";
$docEnc = Encoding::matchLabel($xmlEncoding);
} else {
$xmlDeclaration = "";
$xmlVersion = " version=\"1.0\"";
$xmlEncoding = "";
$xmlStandalone = "";
$docEnc = Encoding::matchLabel("utf-8");
}
// if a supported encoding was parsed from the type, act
// accordingly; otherwise skip to parsing and let the
// XML parser detect encoding
if ($charset === "UTF-16BE") {
// if the string is UTF-16BE, adding a BOM is sufficient
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
$string = self::BOM_UTF16LE.$string;
} elseif ($charset) {
// for ASCII-compatible encodings look for an XML declaration
if (preg_match(self::XML_DECLARATION_PATTERN, $string, $match)) {
// if an existing encoding declaration is found,
// keep it only if it matches; if no encoding
// declaration is found but the encoding is UTF-8
// this is also acceptable
$keep = false;
if ($match[2]) {
$candidate = substr($match[2], 1, strlen($match[2]) - 2);
if ($encoding = Encoding::matchLabel($candidate)) {
if ($charset === $encoding['name']) {
$keep = true;
}
}
} elseif ($charset === "UTF-8") {
$keep = true;
}
// substitute the encoding declaration where necessary
if (!$keep) {
$string = "<? xml ". $match[1] ." encoding = \ " $charset\ "". $match[3] . $match[4] ." ?> ".substr($string, strlen($match[0]));
}
} elseif ($charset !== "UTF-8") {
// add a declaration if none is found and the encoding is not UTF-8
$string = "<? xml version = \ "1.0 \" encoding= \" $charset\ " ?> ".$string;
// next check the type for a charset parameter if there is one
$typeEnc = Encoding::matchLabel($t->params['charset'] ?? "");
// if the document encoding differs from the type encoding
// or the document encoding is not recognized by libxml,
// we need to mangle the document before parsing
if (($typeEnc & & $docEnc & & $docEnc['name'] !== $typeEnc['name']) || ($docEnc & & in_array($docEnc['label'], self::ENCODING_NAUGHTY_LIST)) || (!$docEnc & & !$typeEnc)) {
$charset = ($typeEnc ?? $docEnc)['name'] ?? "UTF-8";
// some canonical names are not recognized by libxml, so we must use other labels
$charset = self::ENCODING_ALIAS_MAP[$charset] ?? $charset;
if ($charset === "UTF-8") {
// if the string is UTF-8, adding a BOM is sufficient
$string = self::BOM_UTF8.$string;
} elseif ($charset === "UTF-16BE") {
// if the string is UTF-16BE, adding a BOM is sufficient
$string = self::BOM_UTF16BE.$string;
} elseif ($charset === "UTF-16LE") {
// if the string is UTF-16LE, adding a BOM is sufficient
$string = self::BOM_UTF16LE.$string;
} elseif ($charset) {
// otherwise substitute the encoding declaration if any
$string = "<? xml ". $xmlVersion ." encoding = \ " $charset\ "". $xmlStandalone ." ?> ".substr($string, strlen($xmlDeclaration));
}
}
}