Browse Source

Annotate remaining functions

master
J. King 1 year ago
parent
commit
a41f757e19
  1. 140
      lib/Parser.php

140
lib/Parser.php

@ -554,6 +554,17 @@ class Parser {
return array_values($out); return array_values($out);
} }
/** Parses an element for microformat information
*
* Returns a microformat "item" structure, possibly with child and
* descendant microformat structures.
*
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats
*
* @param \DOMElement $root The root element of the microformat
* @param array $types The previously determined v2 microformat types of the element
* @param bool $backcompat Whether this element is a v1 microformat and backward-compatible processing should be used
*/
protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array { protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array {
# keep track of whether the root class name(s) was from backcompat # keep track of whether the root class name(s) was from backcompat
// this is a parameter to this function // this is a parameter to this function
@ -817,6 +828,20 @@ class Parser {
return $out; return $out;
} }
/** Retrieves the value of a single microformat property from an element
*
* The value returned is determined from the prefix and the structure of
* the element rather than the property name.
*
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties
*
* @param \DOMElement $node The element to retrieve a value from
* @param string $prefix The property prefix (`p`, `dt`, `u`, or `e`)
* @param array $backcompatTypes The set of microformat types currently in scope, if performing backcompat processing (an empty array otherwise)
* @param string|null $impliedDate A previously-seen date value from which we can imply a date if only a time is present on the element
* @param bool $isChild Whether the subject element is itself a child microformat. This affects whether the "Value Class Pattern" applies
* @return string|array
*/
protected function parseProperty(\DOMElement $node, string $prefix, array $backcompatTypes, ?string $impliedDate, bool $isChild) { protected function parseProperty(\DOMElement $node, string $prefix, array $backcompatTypes, ?string $impliedDate, bool $isChild) {
switch ($prefix) { switch ($prefix) {
case "p": case "p":
@ -940,6 +965,20 @@ class Parser {
} }
} }
/** Retrieves a "Value Class Pattern" value from an element
*
* The Value Class Pattern is a means of marking only certain spans of text
* as relevant information, allowing for naturally-flowing text and
* machine-readable information to co-exist.
*
* @see https://microformats.org/wiki/value-class-pattern#Basic_Parsing
*
* @param \DOMElement $root The subject element
* @param string $prefix The property prefix (`p`, `dt`, `u`, or `e`)
* @param array $backcompatTypes The set of microformat types currently in scope, if performing backcompat processing (an empty array otherwise)
* @param string|null $impliedDate A previously-seen date value from which we can imply a date if only a time is present on the element
* @return string|array|null
*/
protected function getValueClassPattern(\DOMElement $root, string $prefix, array $backcompatTypes, ?string $impliedDate = null) { protected function getValueClassPattern(\DOMElement $root, string $prefix, array $backcompatTypes, ?string $impliedDate = null) {
$out = []; $out = [];
$skipChildren = false; $skipChildren = false;
@ -1031,6 +1070,17 @@ class Parser {
} }
} }
/** Retrieves structured data from an HTML `img` element
*
* If the element has an `alt` attribute an array containing both `alt` and
* `src` keys is returned. Otherwise a string with the value of `src` is
* returned instead.
*
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_img_element_for_src_and_alt
*
* @param \DOMElement $node The `img` element to examine
* @return array|string
*/
protected function parseImg(\DOMElement $node) { protected function parseImg(\DOMElement $node) {
# To parse an img element for src and alt attributes: # To parse an img element for src and alt attributes:
if ($node->localName === "img" && $node->hasAttribute("alt")) { if ($node->localName === "img" && $node->hasAttribute("alt")) {
@ -1048,6 +1098,22 @@ class Parser {
} }
} }
/** Validates whether a string is a date, and returns its parts
*
* The return value is an array which can contain zero or more of the
* following keys:
*
* - `date`
* - `time`
* - `zone`
*
* Absence of all keys indicates an invalid string. The three parts may
* appear in any combination except `date` and `zone` without `time`.
*
* @see https://microformats.org/wiki/value-class-pattern#Date_and_time_parsing
*
* @param string $input The string to test for validity
*/
protected function parseDatePart(string $input): array { protected function parseDatePart(string $input): array {
// do a first-pass normalization on the input; this normalizes am/pm and normalizes and trims whitespace // do a first-pass normalization on the input; this normalizes am/pm and normalizes and trims whitespace
$input = trim(preg_replace(['/([ap])\.m\.$/', '/\s+/'], ["$1m", " "], strtr($input, "APM", "apm"))); $input = trim(preg_replace(['/([ap])\.m\.$/', '/\s+/'], ["$1m", " "], strtr($input, "APM", "apm")));
@ -1130,6 +1196,13 @@ class Parser {
return []; return [];
} }
/** Tests a string for validity against one or more date-format strings
*
* The returned value will be the first valid DateTimeImmutable value, if any.
*
* @param string $input The string to validate
* @param string $format One or more date formats to validate against
*/
protected function testDate(string $input, string ...$format): ?\DateTimeImmutable { protected function testDate(string $input, string ...$format): ?\DateTimeImmutable {
foreach ($format as $f) { foreach ($format as $f) {
$out = \DateTimeImmutable::createFromFormat("!$f", $input, new \DateTimeZone("UTC")); $out = \DateTimeImmutable::createFromFormat("!$f", $input, new \DateTimeZone("UTC"));
@ -1140,6 +1213,11 @@ class Parser {
return null; return null;
} }
/** Concatenates date parts together, optionally with an implied date
*
* @param array $parts The date parts, `date`, `time`, and `zone`
* @param string|null $implied An optional implied date to use if the date is absent from the input
*/
protected function stitchDate(array $parts, ?string $implied): ?string { protected function stitchDate(array $parts, ?string $implied): ?string {
if (sizeof($parts) === 3) { if (sizeof($parts) === 3) {
return $parts['date']." ".$parts['time'].$parts['zone']; return $parts['date']." ".$parts['time'].$parts['zone'];
@ -1161,6 +1239,11 @@ class Parser {
return null; return null;
} }
/** Resolves a URL against a base URL and normalizes the result
*
* @param string $url The URL to resolve and normalize
* @param string|null $baseUrl The base URL to resolve against. If this argument is absent the document base will be used
*/
protected function normalizeUrl(string $url, string $baseUrl = null): string { protected function normalizeUrl(string $url, string $baseUrl = null): string {
// TODO: Implement better URL parser // TODO: Implement better URL parser
try { try {
@ -1170,9 +1253,19 @@ class Parser {
} }
} }
/** Retrieves the trimmed plain-text content of an HTML element
*
* Depending on user options the traditional "simple" algorithm may be used
* in place of the more recent "thorough" algorithm.
*
* @see https://microformats.org/wiki/textcontent-parsing
*
* @param \DOMElement $node The element whose text is to be retrieved
* @param string $prefix The prefix of the microformat property the text is to be used for. This is only relevant for the "simple" algorithm
*/
protected function getCleanText(\DOMElement $node, string $prefix): string { protected function getCleanText(\DOMElement $node, string $prefix): string {
if ($this->options['simpleTrim']) { if ($this->options['simpleTrim']) {
return $this->getCleanTextBasic($node, $prefix); return $this->getCleanTextSimple($node, $prefix);
} else { } else {
// https://microformats.org/wiki/textcontent-parsing // https://microformats.org/wiki/textcontent-parsing
# Plain text of element # Plain text of element
@ -1184,13 +1277,19 @@ class Parser {
# Strip leading and trailing ASCII whitespace from output # Strip leading and trailing ASCII whitespace from output
$output = trim($output); $output = trim($output);
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point # Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point
$output = preg_replace('/ {2,}/', " ", $output); $output = preg_replace('/\s{2,}/m', " ", $output);
# Return output # Return output
return $output; return $output;
} }
} }
protected function getCleanTextThorough(\DOMElement $node, string $prefix): string { /** Part of the algorithm to retrieve the trimmed plain-text content of an HTML element
*
* @see https://microformats.org/wiki/textcontent-parsing#Element_to_string
*
* @param \DOMElement $node The element whose text is to be retrieved
*/
protected function getCleanTextThorough(\DOMElement $node): string {
# Element to string # Element to string
# To get the string value for an Element input: # To get the string value for an Element input:
# Let output be an empty list # Let output be an empty list
@ -1251,13 +1350,13 @@ class Parser {
# Let value be the result of running this algorithm on child # Let value be the result of running this algorithm on child
# Prepend a single U+000A LF code point to value # Prepend a single U+000A LF code point to value
# Append value to output # Append value to output
$output[] = "\n".$this->getCleanTextThorough($n, $prefix); $output[] = "\n".$this->getCleanTextThorough($n);
break; break;
default: default:
# Any other value # Any other value
# Let value be the result of running this algorithm on child # Let value be the result of running this algorithm on child
# Append value to output # Append value to output
$output[] = $this->getCleanTextThorough($n, $prefix); $output[] = $this->getCleanTextThorough($n);
break; break;
} }
} else { } else {
@ -1269,7 +1368,14 @@ class Parser {
return implode("", $output); return implode("", $output);
} }
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string { /** Part of the algorithm to retrieve the trimmed plain-text content of an HTML element
*
* This is the traditional "simple" algorithm.
*
* @param \DOMElement $node The element whose text is to be retrieved
* @param string $prefix The prefix of the microformat property the text is to be used for
*/
protected function getCleanTextSimple(\DOMElement $node, string $prefix): string {
# the textContent of the element after: # the textContent of the element after:
$copy = $node->cloneNode(true); $copy = $node->cloneNode(true);
# dropping any nested <script> & <style> elements; # dropping any nested <script> & <style> elements;
@ -1305,6 +1411,12 @@ class Parser {
return trim($copy->textContent); return trim($copy->textContent);
} }
/** Retrieves and resolves the base URL of an HTML document's `<base>`
* element, if any
*
* @param \DOMElement $root Any element within the document to check
* @param string $base The HTTP-level base URL, if available
*/
protected function getBaseUrl(\DOMElement $root, string $base): string { protected function getBaseUrl(\DOMElement $root, string $base): string {
$set = $root->ownerDocument->getElementsByTagName("base"); $set = $root->ownerDocument->getElementsByTagName("base");
if ($set->length) { if ($set->length) {
@ -1313,6 +1425,12 @@ class Parser {
return $base; return $base;
} }
/** Finds the nearest HTML language information for an element
*
* No validation or normalization is performed on the returned information.
*
* @param \DOMElement $node The subject element
*/
protected function getLang(\DOMElement $node): ?string { protected function getLang(\DOMElement $node): ?string {
while ($node && !($node instanceof \DOMElement && $node->hasAttribute("lang"))) { while ($node && !($node instanceof \DOMElement && $node->hasAttribute("lang"))) {
$node = $node->parentNode; $node = $node->parentNode;
@ -1326,7 +1444,7 @@ class Parser {
/** Finds the next element in tree order after $node, if any /** Finds the next element in tree order after $node, if any
* *
* @param \DOMNode $node The context node * @param \DOMNode $node The context node
* @param \DOMElement $root The element to consider the contextual root of the tree * @param \DOMElement $root The element to consider the contextual root of the tree; nodes outside this element will not be examined
* @param bool $considerChildren Whether or not child nodes are valid next nodes * @param bool $considerChildren Whether or not child nodes are valid next nodes
*/ */
protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement { protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement {
@ -1354,7 +1472,13 @@ class Parser {
return $next; return $next;
} }
protected function normalizeOptions(array $options) { /** Normalizes an array of options
*
* Default values are filled in and unknown options removed
*
* @param array $options The options array to normalize
*/
protected function normalizeOptions(array $options): array {
return [ return [
'impliedTz' => (bool) ($options['impliedTz'] ?? false), 'impliedTz' => (bool) ($options['impliedTz'] ?? false),
'lang' => (bool) ($options['lang'] ?? false), 'lang' => (bool) ($options['lang'] ?? false),

Loading…
Cancel
Save