You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

147 lines
6.6 KiB

<?php
/** @license MIT
* Copyright 2023 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam;
use MensBeam\HTML\Parser as HTMLParser;
use MensBeam\Microformats\Parser as MfParser;
use MensBeam\Microformats\Url;
/** A generic parser for microformats
*
* It implements Microformats v2 as well as backwards-compatible processing of
* so-called "classic" or "backcompat" Microformats. Some of its functionality
* is optional. Where an $options array is a possible parameter, the following
* keys are understood:
*
* - `dateNormalization` (bool) Whether to perform date and time normalization
* throughout parsing rather than only in value-class parsing where it is
* required by the specification. True by default
* - `impliedTz` (bool) Whether to allow an implied datetime value to supply an
* implied timezone to datetimes without a timezone
* - `lang` (bool) Whether to include language information in microformat and
* rich-text structures. True by default
* - `thoroughTrim` (bool) Whether to use the more thorough whitespace-trimming
* algorithm proposed for future standardization rather than the "classic",
* simpler whitespace-trimming algorithm mandated by the parsing specification.
* True by default.
*
* Currently all input is assumed to be HTML, but processing of generic XML
* data may be supported in future.
*/
class Microformats {
/** Parses a resource at a URL for microformats
*
* If retrieving the resource fails `null` is returned.
*
* @param string $file The resource to retrieve and parse
* @param array $options Options for the parser; please see the class documentetation for details
*/
public static function fromUrl(string $url, array $options = []): ?array {
$stream = fopen($url, "r");
if ($stream) {
$locationAcceptable = true;
$location = null;
$type = null;
$data = stream_get_contents($stream);
if ($data !== false) {
$meta = stream_get_meta_data($stream);
if ($meta && $meta['wrapper_type'] === "http") {
foreach ($meta['wrapper_data'] ?? [] as $h) {
if (preg_match('/^HTTP\//i', $h)) {
$type = null;
$locationAcceptable = true;
} elseif (preg_match('/^Location\s*:\s*(.*)/is', $h, $match) && $locationAcceptable) {
$location = (string) URL::fromString($match[1], $location ?? $url);
$locationAcceptable = false;
} elseif (preg_match('/^Content-Type\s*:\s*(.*)/is', $h, $match) && $type === null) {
$type = $match[1];
}
}
}
return static::fromString($data, $type ?? "", $location ?? $url, $options);
}
}
return null;
}
/** Parses a file for microformats
*
* If reading the file fails `null` is returned.
*
* While fopen wrappers can be used to open remote resources over HTTP, no
* effort is made to support this specially by reading the `Content-Type`
* header or deducing the final URL. The `Microformats::fromUrl` method
* should be used for this purpose instead.
*
* @param string $file The file to read and parse
* @param string $contentType The HTTP Content-Type of the file if known, optionally with parameters
* @param string $url The effective URL (after redirections) of the file if known
* @param array $options Options for the parser; please see the class documentetation for details
*/
public static function fromFile(string $file, string $contentType, string $url, array $options = []): ?array {
$string = file_get_contents($file);
if ($string === false) {
return null;
}
return static::fromString($string, $contentType, $url, $options);
}
/** Parses a string for microformats
*
* @param string $input The string to parse
* @param string $contentType The HTTP Content-Type of the string if known, optionally with parameters
* @param string $url The effective URL (after redirections) of the string if known
* @param array $options Options for the parser; please see the class documentetation for details
*/
public static function fromString(string $input, string $contentType, string $url, array $options = []): array {
$parsed = HTMLParser::parse($input, $contentType);
return static::fromHtmlElement($parsed->document->documentElement, $url, $options);
}
/** Parses an HTML element for microformats
*
* @param \DOMElement $input The element to examine. Siblings and ancestors of this element will be ignored
* @param string $url The effective URL (after redirections) of the document if known
* @param array $options Options for the parser; please see the class documentetation for details
*/
public static function fromHtmlElement(\DOMElement $input, string $url, array $options = []): array {
return (new MfParser)->parseHtmlElement($input, $url, $options);
}
/** Serializes a Microformats structure to JSON.
*
* This is necessary to serialize empty hash tables (JSON objects)
* correctly. It cannot cover all possible cases of manipulation, but
* does cover cases which normally occur with data in the wild.
*
* @param array $data The Microformats structure to serialize
* @param int $flags [optional] Bitmask consisting of JSON_HEX_QUOT, JSON_HEX_TAG, JSON_HEX_AMP, JSON_HEX_APOS, JSON_NUMERIC_CHECK, JSON_PRETTY_PRINT, JSON_UNESCAPED_SLASHES, JSON_FORCE_OBJECT, JSON_UNESCAPED_UNICODE. JSON_THROW_ON_ERROR The behaviour of these constants is described on the JSON constants page
* @param int $depth [optional] Set the maximum depth. Must be greater than zero.
*/
public static function toJson(array $data, int $flags = 0, int $depth = 512): string {
$walk = function(&$arr) use(&$walk) {
foreach ($arr as $k => &$v) {
if (is_array($v)) {
if ($k === "properties" && !$v) {
$v = new \stdClass;
} else {
$walk($v);
}
}
}
};
if (!$data['rels']) {
$data['rels'] = new \stdClass;
}
if (!$data['rel-urls']) {
$data['rel-urls'] = new \stdClass;
}
$walk($data['items']);
return json_encode($data, $flags, $depth);
}
}