"html", self::MATHML_NAMESPACE => "math", self::SVG_NAMESPACE => "svg", self::XLINK_NAMESPACE => "xlink", self::XML_NAMESPACE => "xml", self::XMLNS_NAMESPACE => "xmlns", ]; /** Parses a string to produce a document object * * @param string $data The string to parse. This may be in any valid encoding * @param string|null $encodingOrContentType The document encoding, or HTTP Content-Type header value, if known. If no provided encoding detection will be attempted * @param \MensBeam\HTML\Parser\Config|null $config The configuration parameters to use, if any */ public static function parse(string $data, ?string $encodingOrContentType = null, ?Config $config = null): Output { // parse the document return static::parseDocumentOrFragment($data, $encodingOrContentType, null, null, $config ?? new Config); } public static function parseFragment(\DOMElement $fragmentContext, ?int $fragmentQuirks, string $data, ?string $encodingOrContentType = null, ?Config $config = null): \DOMDocumentFragment { // parse the fragment into a temporary document $out = self::parseDocumentOrFragment($data, $encodingOrContentType, $fragmentContext, $fragmentQuirks, $config ?? new Config); $document = $out->document; // extract the nodes from the temporary document into a fragment belonging to the context element's document $fragment = $fragmentContext->ownerDocument->createDocumentFragment(); foreach ($document->documentElement->childNodes as $node) { $node = $fragment->ownerDocument->importNode($node, true); $fragment->appendChild($node); } return $fragment; } protected static function parseDocumentOrFragment(string $data, ?string $encodingOrContentType, ?\DOMElement $fragmentContext, ?int $fragmentQuirks, Config $config): Output { // check the document class if (isset($config->documentClass)) { try { $document = new $config->documentClass; } catch (\Throwable $e) { throw new Exception(Exception::FAILED_CREATING_DOCUMENT, [$config->documentClass], $e); } if (!$document instanceof \DOMDocument) { throw new Exception(Exception::INVALID_DOCUMENT_CLASS, [get_class($document)]); } } else { $document = new \DOMDocument(); } // sort out other needed configuration $htmlNamespace = ($config->htmlNamespace) ? self::HTML_NAMESPACE : null; // Initialize the various classes needed for parsing $errorHandler = $config->errorCollection ? new ParseError : null; $decoder = new Data($data, $encodingOrContentType, $errorHandler, $config); $stack = new OpenElementsStack($htmlNamespace, $fragmentContext); $tokenizer = new Tokenizer($decoder, $stack, $errorHandler); $tokenList = $tokenizer->tokenize(); $treeConstructor = new TreeConstructor($document, $decoder, $tokenizer, $tokenList, $errorHandler, $stack, new TemplateInsertionModesStack, $fragmentContext, $fragmentQuirks, $config); try { $treeConstructor->constructTree(); } catch (EncodingChangeException $e) { // We are supposed to reparse with a new encoding // Clear out the document if ($document->doctype) { $document->removeChild($document->doctype); } while ($document->hasChildNodes()) { $document->removeChild($document->firstChild); } // save the target encoding $encoding = $decoder->encoding; // Destroy our existing objects unset($errorHandler, $decoder, $stack, $tokenizer, $tokenList, $treeConstructor); // Parse a second time return static::parseDocumentOrFragment($data, $encoding, $fragmentContext, $fragmentQuirks, $config); } // prepare the output $out = new Output; $out->document = $document; $out->encoding = $decoder->encoding; $out->quirksMode = $treeConstructor->quirksMode; if ($errorHandler) { $out->errors = $errorHandler->errors; } return $out; } public static function fetchFile(string $file, ?string $encodingOrContentType = null): ?array { $f = fopen($file, "r"); if (!$f) { return null; } $data = stream_get_contents($f); $encoding = Charset::fromCharset((string) $encodingOrContentType) ?? Charset::fromTransport((string) $encodingOrContentType); if (!$encoding) { $meta = stream_get_meta_data($f); if ($meta['wrapper_type'] === "http") { // Try to find a Content-Type header-field foreach ($meta['wrapper_data'] as $h) { $h = explode(":", $h, 2); if (count($h) === 2) { if (preg_match("/^\s*Content-Type\s*$/i", $h[0])) { // Try to get an encoding from it $encoding = Charset::fromTransport($h[1]); break; } } } } } return [$data, $encoding]; } }