From 7011bc0c4d5b7c195388cc1a7989f5253a4bd3f6 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 23 Sep 2020 10:01:59 -0400 Subject: [PATCH] Skeleton of h-feed parser This is probably a dead end; the reference microformats parser should probably ultimately be used instead --- lib/Exception.php | 4 + lib/Parser/HTML/Construct.php | 44 +++++++++ lib/Parser/HTML/Feed.php | 124 ++++++++++++++++++++++++++ lib/Parser/XML/Construct.php | 2 +- lib/Parser/XML/XPath.php | 4 +- tests/cases/Parser/HTML/failures.yaml | 17 ++++ tests/cases/Parser/HTML/feed.yaml | 53 +++++++++++ tests/cases/Parser/HTMLTest.php | 28 ++++++ 8 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 lib/Parser/HTML/Construct.php create mode 100644 lib/Parser/HTML/Feed.php create mode 100644 tests/cases/Parser/HTML/failures.yaml create mode 100644 tests/cases/Parser/HTML/feed.yaml create mode 100644 tests/cases/Parser/HTMLTest.php diff --git a/lib/Exception.php b/lib/Exception.php index 7a1d222..ce2d764 100644 --- a/lib/Exception.php +++ b/lib/Exception.php @@ -12,10 +12,14 @@ abstract class Exception extends \Exception { 'notSupportedType' => [0x1101, "Document type is not supported"], 'notJSONType' => [0x1111, "Document Content-Type is not either that of JSON Feed or generic JSON"], 'notXMLType' => [0x1111, "Document Content-Type is not that of an XML newsfeed"], + 'notHTMLType' => [0x1111, "Document Content-Type is not that of an HTML newsfeed"], 'notJSON' => [0x1112, "Document is not valid JSON"], 'notXML' => [0x1112, "Document is not well-formed XML"], + 'notHTML' => [0x1112, "Document is not well-formed HTML"], // this condition should be impossible + 'notXHTML' => [0x1112, "Document is well-formed XML, but is not XHTML"], 'notJSONFeed' => [0x1113, "Document is not a JSON Feed document"], 'notXMLFeed' => [0x1113, "Document is not a newsfeed"], + 'notHTMLFeed' => [0x1113, "Document does not contain a newsfeed"], // Fetching: 0x1200 'httpStatus400' => [0x1201, "Client request was not 
acceptable to the server (code 400)"], 'httpStatus401' => [0x1202, "Supplied credentials are insufficient to access the resource (code 401)"], diff --git a/lib/Parser/HTML/Construct.php b/lib/Parser/HTML/Construct.php new file mode 100644 index 0000000..04d6b10 --- /dev/null +++ b/lib/Parser/HTML/Construct.php @@ -0,0 +1,44 @@ +xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject); + assert($el !== false, "Invalid XPath query: \"$query\""); + return $el->length ? $el->item(0) : null; + } + + public function getLang(): ?string { + // walk up the tree looking for the nearest language tag, preferring XML over HTML when appropriate; the walk stops once the parent is no longer an element (the document node has no attribute accessors), in which case null is returned + $el = $this->subject; + $out = null; + $xhtml = (bool) $el->ownerDocument->documentElement->namespaceURI; + do { + if ($xhtml) { + $out = trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang") ?? ""); + $out = strlen($out) ? $out : null; + } + if (is_null($out)) { + $out = trim($el->getAttribute("lang") ?? ""); + $out = strlen($out) ? $out : null; + } + $el = $el->parentNode; + } while (is_null($out) && $el instanceof \DOMElement); + return $out; + } +} diff --git a/lib/Parser/HTML/Feed.php b/lib/Parser/HTML/Feed.php new file mode 100644 index 0000000..1ee79c8 --- /dev/null +++ b/lib/Parser/HTML/Feed.php @@ -0,0 +1,124 @@ +contentType) ?? 
""; + if ($type && !in_array($type->essence, self::MIME_TYPES)) { + throw new Exception("notHTMLType"); + } + if ($type && $type->essence === "application/xhtml+xml") { + $this->document = new \DOMDocument; + if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) { + // ignore XML parsing errors; we will reparse as HTML in this case + $this->document = null; + } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) { + throw new Exception("notXHTML"); + } + } + if (!$this->document) { + $this->document = new \DOMDocument; + if (!$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) { + throw new Exception("notHTML"); // @codeCoverageIgnore + } + } + $this->document->documentURI = $this->url; + $this->xpath = new \DOMXPath($this->document); + $this->subject = $this->fetchElement("h-feed", "//*"); + if (!$this->subject) { + throw new Exception("notHTMLFeed"); + } + $feed->meta->url = $this->url; + $feed->format = "h-feed"; + $feed->version = "1"; + return $feed; + } + + /** {@inheritDoc} + * + * h-feeds do not have IDs, so this is always null. 
+ */ + public function getId(): ?string { + return null; + } + + public function getUrl(): ?Url { + return null; + } + + public function getTitle(): ?Text { + return null; + } + + public function getLink(): ?Url { + return null; + } + + public function getSummary(): ?Text { + return null; + } + + public function getDateModified(): ?Date { + return null; + } + + public function getIcon(): ?Url { + return null; + } + + public function getImage(): ?Url { + return null; + } + + public function getCategories(): CategoryCollection { + return new CategoryCollection; + } + + public function getPeople(): PersonCollection { + return new PersonCollection; + } + + public function getEntries(FeedStruct $feed): array { + return []; + } + + public function getSchedule(): Schedule { + return new Schedule; + } +} \ No newline at end of file diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php index cc7c9d6..320689c 100644 --- a/lib/Parser/XML/Construct.php +++ b/lib/Parser/XML/Construct.php @@ -293,7 +293,7 @@ abstract class Construct { } break; case "application/xhtml+xml": - if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("xhtml:div", $node))) { + if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("html:div", $node))) { $out->xhtml = $xhtml->ownerDocument->saveXML($xhtml); $out->xhtmlBase = strlen($xhtml->baseURI) ? 
$xhtml->baseURI : null; } diff --git a/lib/Parser/XML/XPath.php b/lib/Parser/XML/XPath.php index 5a9ac5f..a3314fd 100644 --- a/lib/Parser/XML/XPath.php +++ b/lib/Parser/XML/XPath.php @@ -19,12 +19,13 @@ class XPath extends \DOMXpath { 'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss 'rss1file' => "http://purl.oclc.org/net/rss_2.0/enc#", // RSS 1.0 enclosures https://foz.home.xs4all.nl/mod_enclosure.html 'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/ - 'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/ + 'html' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/ 'apple' => "http://www.itunes.com/dtds/podcast-1.0.dtd", // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390 'gplay' => "http://www.google.com/schemas/play-podcasts/1.0", // Google Play podcasts https://support.google.com/googleplay/podcasts/answer/6260341 ]; public $rss2 = false; + public $html = false; /** Returns an XPath processor with various necessary namespace prefixes defined */ public function __construct(\DOMDocument $doc) { @@ -37,6 +38,7 @@ class XPath extends \DOMXpath { /** {@inheritDoc} */ public function query($expression, $contextnode = null, $registerNS = true) { $expression = $this->rss2 ? str_replace("rss2:", "", $expression) : $expression; + $expression = $this->html ? 
str_replace("html:", "", $expression) : $expression; return parent::query($expression, $contextnode, $registerNS); } } diff --git a/tests/cases/Parser/HTML/failures.yaml b/tests/cases/Parser/HTML/failures.yaml new file mode 100644 index 0000000..1fc0f18 --- /dev/null +++ b/tests/cases/Parser/HTML/failures.yaml @@ -0,0 +1,17 @@ +Content-Type mismatch: + type: text/xml + input: > + + exception: notHTMLType + +Not a feed: + type: text/html + input: > + + exception: notHTMLFeed + +Not an XHTML document: + type: application/xhtml+xml + input: > + + exception: notXHTML diff --git a/tests/cases/Parser/HTML/feed.yaml b/tests/cases/Parser/HTML/feed.yaml new file mode 100644 index 0000000..127f20e --- /dev/null +++ b/tests/cases/Parser/HTML/feed.yaml @@ -0,0 +1,53 @@ +Basic example 1: + input: > + + output: + format: 'h-feed' + version: '1' + +Basic example 2: + input: > + + output: + format: 'h-feed' + version: '1' + +Basic example 3: + input: > + + output: + format: 'h-feed' + version: '1' + +Basic example 4: + input: > + + output: + format: 'h-feed' + version: '1' + + +Basic example 5: + input: > + + + + output: + format: 'h-feed' + version: '1' + +Basic XHTML example: + type: application/xhtml+xml + input: > + + output: + format: 'h-feed' + version: '1' + +Basic XHTML fallback example: + type: application/xhtml+xml + input: > + + output: + format: 'h-feed' + version: '1' \ No newline at end of file diff --git a/tests/cases/Parser/HTMLTest.php b/tests/cases/Parser/HTMLTest.php new file mode 100644 index 0000000..7c0a517 --- /dev/null +++ b/tests/cases/Parser/HTMLTest.php @@ -0,0 +1,28 @@ + + */ +class HTMLTest extends AbstractParserTestCase { + /** @dataProvider provideHTML */ + public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void { + $p = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url); + if ($exp instanceof \Exception) { + $this->expectExceptionObject($exp); + $p->parse(); + } else { + $act = $p->parse(); + 
$this->assertEquals($exp, $act); + } + } + + public function provideHTML(): iterable { + return $this->provideParserTests(__DIR__."/HTML/*.yaml"); + } +}