|
@ -554,6 +554,17 @@ class Parser { |
|
|
return array_values($out); |
|
|
return array_values($out); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Parses an element for microformat information |
|
|
|
|
|
* |
|
|
|
|
|
* Returns a microformat "item" structure, possibly with child and |
|
|
|
|
|
* descendent amicroformat structures. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_class_microformats |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $root The root element of the microformat |
|
|
|
|
|
* @param array $types The previously determined v2 microformat types of the element |
|
|
|
|
|
* @param bool $backcompat Whether this element is a v1 microformat and backward-compatible processing should be used |
|
|
|
|
|
*/ |
|
|
protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array { |
|
|
protected function parseMicroformat(\DOMElement $root, array $types, bool $backcompat): array { |
|
|
# keep track of whether the root class name(s) was from backcompat |
|
|
# keep track of whether the root class name(s) was from backcompat |
|
|
// this is a parameter to this function |
|
|
// this is a parameter to this function |
|
@ -817,6 +828,20 @@ class Parser { |
|
|
return $out; |
|
|
return $out; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Retrieves the value of a single microformat property from an element |
|
|
|
|
|
* |
|
|
|
|
|
* The value returned is determined from the prefix and the structure of |
|
|
|
|
|
* the element rather than the property name. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_element_for_properties |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The element to retrieve a value from |
|
|
|
|
|
* @param string $prefix The property prefix (`p`, `dt`, `u`, or `e`) |
|
|
|
|
|
* @param array $backcompatTypes The set of microformat types currently in scope, if performing backcompat processing (and empty array otherwise) |
|
|
|
|
|
* @param string|null $impliedDate A previously-seen date value from which we can imply a date if only a time is present on the element |
|
|
|
|
|
* @param bool $isChild Whether the subject element is itself a child microformat. This affect whether the "Value Class Pattern" applies |
|
|
|
|
|
* @return string|array |
|
|
|
|
|
*/ |
|
|
protected function parseProperty(\DOMElement $node, string $prefix, array $backcompatTypes, ?string $impliedDate, bool $isChild) { |
|
|
protected function parseProperty(\DOMElement $node, string $prefix, array $backcompatTypes, ?string $impliedDate, bool $isChild) { |
|
|
switch ($prefix) { |
|
|
switch ($prefix) { |
|
|
case "p": |
|
|
case "p": |
|
@ -940,6 +965,20 @@ class Parser { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Retrieves a "Value Class Pattern" value from an element |
|
|
|
|
|
* |
|
|
|
|
|
* The Value Class Pattern is a means of marking only certain spans of text |
|
|
|
|
|
* as relevant information, allowing for naturally-flowing text and |
|
|
|
|
|
* machine-readable information to co-exist. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/value-class-pattern#Basic_Parsing |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The subject element |
|
|
|
|
|
* @param string $prefix The property prefix (`p`, `dt`, `u`, or `e`) |
|
|
|
|
|
* @param array $backcompatTypes The set of microformat types currently in scope, if performing backcompat processing (and empty array otherwise) |
|
|
|
|
|
* @param string|null $impliedDate A previously-seen date value from which we can imply a date if only a time is present on the element |
|
|
|
|
|
* @return string|array|null |
|
|
|
|
|
*/ |
|
|
protected function getValueClassPattern(\DOMElement $root, string $prefix, array $backcompatTypes, ?string $impliedDate = null) { |
|
|
protected function getValueClassPattern(\DOMElement $root, string $prefix, array $backcompatTypes, ?string $impliedDate = null) { |
|
|
$out = []; |
|
|
$out = []; |
|
|
$skipChildren = false; |
|
|
$skipChildren = false; |
|
@ -1031,6 +1070,17 @@ class Parser { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Retrieves structured data from an HTML `img` element |
|
|
|
|
|
* |
|
|
|
|
|
* If the element has an `alt` attribute an array containing both `alt` and |
|
|
|
|
|
* `src` keys is returned. Otherwise a string with the value of `src` is |
|
|
|
|
|
* returned instead. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/microformats2-parsing#parse_an_img_element_for_src_and_alt |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The `img` element to examine |
|
|
|
|
|
* @return array|string |
|
|
|
|
|
*/ |
|
|
protected function parseImg(\DOMElement $node) { |
|
|
protected function parseImg(\DOMElement $node) { |
|
|
# To parse an img element for src and alt attributes: |
|
|
# To parse an img element for src and alt attributes: |
|
|
if ($node->localName === "img" && $node->hasAttribute("alt")) { |
|
|
if ($node->localName === "img" && $node->hasAttribute("alt")) { |
|
@ -1048,6 +1098,22 @@ class Parser { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Validates whether a string is a date, and returns its parts |
|
|
|
|
|
* |
|
|
|
|
|
* The return value is an array which can contain zero or more of the |
|
|
|
|
|
* follwing keys: |
|
|
|
|
|
* |
|
|
|
|
|
* - `date` |
|
|
|
|
|
* - `time` |
|
|
|
|
|
* - `zone` |
|
|
|
|
|
* |
|
|
|
|
|
* Absence of all keys indicates an invalid string. The three parts may |
|
|
|
|
|
* appear in any combination except `date` and `zone` without `time`. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/value-class-pattern#Date_and_time_parsing |
|
|
|
|
|
* |
|
|
|
|
|
* @param string $input The string to test for validity |
|
|
|
|
|
*/ |
|
|
protected function parseDatePart(string $input): array { |
|
|
protected function parseDatePart(string $input): array { |
|
|
// do a first-pass normalization on the input; this normalizes am/pm and normalizes and trims whitespace |
|
|
// do a first-pass normalization on the input; this normalizes am/pm and normalizes and trims whitespace |
|
|
$input = trim(preg_replace(['/([ap])\.m\.$/', '/\s+/'], ["$1m", " "], strtr($input, "APM", "apm"))); |
|
|
$input = trim(preg_replace(['/([ap])\.m\.$/', '/\s+/'], ["$1m", " "], strtr($input, "APM", "apm"))); |
|
@ -1130,6 +1196,13 @@ class Parser { |
|
|
return []; |
|
|
return []; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Tests a string for validity against one or more date-format strings |
|
|
|
|
|
* |
|
|
|
|
|
* The returned value will be the first valid DateTimeImmutable value, if any. |
|
|
|
|
|
* |
|
|
|
|
|
* @param string $input The string to validate |
|
|
|
|
|
* @param string $format One or more date formats to validate against |
|
|
|
|
|
*/ |
|
|
protected function testDate(string $input, string ...$format): ?\DateTimeImmutable { |
|
|
protected function testDate(string $input, string ...$format): ?\DateTimeImmutable { |
|
|
foreach ($format as $f) { |
|
|
foreach ($format as $f) { |
|
|
$out = \DateTimeImmutable::createFromFormat("!$f", $input, new \DateTimeZone("UTC")); |
|
|
$out = \DateTimeImmutable::createFromFormat("!$f", $input, new \DateTimeZone("UTC")); |
|
@ -1140,6 +1213,11 @@ class Parser { |
|
|
return null; |
|
|
return null; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Concatenates date parts together, optionally with an implied date |
|
|
|
|
|
* |
|
|
|
|
|
* @param array $parts The date parts, `date`, `time`, and `zone` |
|
|
|
|
|
* @param string|null $implied An optional implied date to use if the date is absent from the input |
|
|
|
|
|
*/ |
|
|
protected function stitchDate(array $parts, ?string $implied): ?string { |
|
|
protected function stitchDate(array $parts, ?string $implied): ?string { |
|
|
if (sizeof($parts) === 3) { |
|
|
if (sizeof($parts) === 3) { |
|
|
return $parts['date']." ".$parts['time'].$parts['zone']; |
|
|
return $parts['date']." ".$parts['time'].$parts['zone']; |
|
@ -1161,6 +1239,11 @@ class Parser { |
|
|
return null; |
|
|
return null; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Resolves a URL against a base URL and normalizes the result |
|
|
|
|
|
* |
|
|
|
|
|
* @param string $url The URL to resolve and normalize |
|
|
|
|
|
* @param string|null $baseUrl The base URL to resolve against. If this argument is absent the document base will be used |
|
|
|
|
|
*/ |
|
|
protected function normalizeUrl(string $url, string $baseUrl = null): string { |
|
|
protected function normalizeUrl(string $url, string $baseUrl = null): string { |
|
|
// TODO: Implement better URL parser |
|
|
// TODO: Implement better URL parser |
|
|
try { |
|
|
try { |
|
@ -1170,9 +1253,19 @@ class Parser { |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Retrieves the trimmed plain-text content of an HTML element |
|
|
|
|
|
* |
|
|
|
|
|
* Depending on user options the traditional "simple" algorithm may be used |
|
|
|
|
|
* in place of the more recent "thorough" algorithm. |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/textcontent-parsing |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The element whose text is to be retrieved |
|
|
|
|
|
* @param string $prefix The prefix of the microformat property the text is to be used for. This is only relevant for the "simple" algorithm |
|
|
|
|
|
*/ |
|
|
protected function getCleanText(\DOMElement $node, string $prefix): string { |
|
|
protected function getCleanText(\DOMElement $node, string $prefix): string { |
|
|
if ($this->options['simpleTrim']) { |
|
|
if ($this->options['simpleTrim']) { |
|
|
return $this->getCleanTextBasic($node, $prefix); |
|
|
return $this->getCleanTextSimple($node, $prefix); |
|
|
} else { |
|
|
} else { |
|
|
// https://microformats.org/wiki/textcontent-parsing |
|
|
// https://microformats.org/wiki/textcontent-parsing |
|
|
# Plain text of element |
|
|
# Plain text of element |
|
@ -1184,13 +1277,19 @@ class Parser { |
|
|
# Strip leading and trailing ASCII whitespace from output |
|
|
# Strip leading and trailing ASCII whitespace from output |
|
|
$output = trim($output); |
|
|
$output = trim($output); |
|
|
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point |
|
|
# Replace any sequence of one or more consecutive U+0020 SPACE code points in output with a single U+0020 SPACE code point |
|
|
$output = preg_replace('/ {2,}/', " ", $output); |
|
|
$output = preg_replace('/\s{2,}/m', " ", $output); |
|
|
# Return output |
|
|
# Return output |
|
|
return $output; |
|
|
return $output; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
protected function getCleanTextThorough(\DOMElement $node, string $prefix): string { |
|
|
/** Part of the algorithm to retrieve the trimmed plain-text content of an HTML element |
|
|
|
|
|
* |
|
|
|
|
|
* @see https://microformats.org/wiki/textcontent-parsing#Element_to_string |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The element whose text is to be retrieved |
|
|
|
|
|
*/ |
|
|
|
|
|
protected function getCleanTextThorough(\DOMElement $node): string { |
|
|
# Element to string |
|
|
# Element to string |
|
|
# To get the string value for an Element input: |
|
|
# To get the string value for an Element input: |
|
|
# Let output be an empty list |
|
|
# Let output be an empty list |
|
@ -1251,13 +1350,13 @@ class Parser { |
|
|
# Let value be the result of running this algorithm on child |
|
|
# Let value be the result of running this algorithm on child |
|
|
# Prepend a single U+000A LF code point to value |
|
|
# Prepend a single U+000A LF code point to value |
|
|
# Append value to output |
|
|
# Append value to output |
|
|
$output[] = "\n".$this->getCleanTextThorough($n, $prefix); |
|
|
$output[] = "\n".$this->getCleanTextThorough($n); |
|
|
break; |
|
|
break; |
|
|
default: |
|
|
default: |
|
|
# Any other value |
|
|
# Any other value |
|
|
# Let value be the result of running this algorithm on child |
|
|
# Let value be the result of running this algorithm on child |
|
|
# Append value to output |
|
|
# Append value to output |
|
|
$output[] = $this->getCleanTextThorough($n, $prefix); |
|
|
$output[] = $this->getCleanTextThorough($n); |
|
|
break; |
|
|
break; |
|
|
} |
|
|
} |
|
|
} else { |
|
|
} else { |
|
@ -1269,7 +1368,14 @@ class Parser { |
|
|
return implode("", $output); |
|
|
return implode("", $output); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
protected function getCleanTextBasic(\DOMElement $node, string $prefix): string { |
|
|
/** Part of the algorithm to retrieve the trimmed plain-text content of an HTML element |
|
|
|
|
|
* |
|
|
|
|
|
* This is the traditional "simple" algorithm. |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The element whose text is to be retrieved |
|
|
|
|
|
* @param string $prefix The prefix of the microformat property the text is to be used for |
|
|
|
|
|
*/ |
|
|
|
|
|
protected function getCleanTextSimple(\DOMElement $node, string $prefix): string { |
|
|
# the textContent of the element after: |
|
|
# the textContent of the element after: |
|
|
$copy = $node->cloneNode(true); |
|
|
$copy = $node->cloneNode(true); |
|
|
# dropping any nested <script> & <style> elements; |
|
|
# dropping any nested <script> & <style> elements; |
|
@ -1305,6 +1411,12 @@ class Parser { |
|
|
return trim($copy->textContent); |
|
|
return trim($copy->textContent); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Retrieves and resolves the base URL of an HTML document's `<base>` |
|
|
|
|
|
* element, if any |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $root Any element within the document to check |
|
|
|
|
|
* @param string $base The HTTP-level base URL, if available |
|
|
|
|
|
*/ |
|
|
protected function getBaseUrl(\DOMElement $root, string $base): string { |
|
|
protected function getBaseUrl(\DOMElement $root, string $base): string { |
|
|
$set = $root->ownerDocument->getElementsByTagName("base"); |
|
|
$set = $root->ownerDocument->getElementsByTagName("base"); |
|
|
if ($set->length) { |
|
|
if ($set->length) { |
|
@ -1313,6 +1425,12 @@ class Parser { |
|
|
return $base; |
|
|
return $base; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/** Finds the nearest HTML language information for an element |
|
|
|
|
|
* |
|
|
|
|
|
* No validation or normalization is performed on the returned information. |
|
|
|
|
|
* |
|
|
|
|
|
* @param \DOMElement $node The subject element |
|
|
|
|
|
*/ |
|
|
protected function getLang(\DOMElement $node): ?string { |
|
|
protected function getLang(\DOMElement $node): ?string { |
|
|
while ($node && !($node instanceof \DOMElement && $node->hasAttribute("lang"))) { |
|
|
while ($node && !($node instanceof \DOMElement && $node->hasAttribute("lang"))) { |
|
|
$node = $node->parentNode; |
|
|
$node = $node->parentNode; |
|
@ -1326,7 +1444,7 @@ class Parser { |
|
|
/** Finds the next element in tree order after $node, if any |
|
|
/** Finds the next element in tree order after $node, if any |
|
|
* |
|
|
* |
|
|
* @param \DOMNode $node The context node |
|
|
* @param \DOMNode $node The context node |
|
|
* @param \DOMElement $root The element to consider the contextual root of the tree |
|
|
* @param \DOMElement $root The element to consider the contextual root of the tree; nodes outside this element will not be examined |
|
|
* @param bool $considerChildren Whether or not child nodes are valid next nodes |
|
|
* @param bool $considerChildren Whether or not child nodes are valid next nodes |
|
|
*/ |
|
|
*/ |
|
|
protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement { |
|
|
protected function nextElement(\DOMElement $node, \DOMElement $root, bool $considerChildren): ?\DOMElement { |
|
@ -1354,7 +1472,13 @@ class Parser { |
|
|
return $next; |
|
|
return $next; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
protected function normalizeOptions(array $options) { |
|
|
/** Normalizes an array of options |
|
|
|
|
|
* |
|
|
|
|
|
* Default values are filled in and unknown options removed |
|
|
|
|
|
* |
|
|
|
|
|
* @param array $options The options array to normalize |
|
|
|
|
|
*/ |
|
|
|
|
|
protected function normalizeOptions(array $options): array { |
|
|
return [ |
|
|
return [ |
|
|
'impliedTz' => (bool) ($options['impliedTz'] ?? false), |
|
|
'impliedTz' => (bool) ($options['impliedTz'] ?? false), |
|
|
'lang' => (bool) ($options['lang'] ?? false), |
|
|
'lang' => (bool) ($options['lang'] ?? false), |
|
|