From 7011bc0c4d5b7c195388cc1a7989f5253a4bd3f6 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Wed, 23 Sep 2020 10:01:59 -0400
Subject: [PATCH] Skeleton of h-feed parser

This is probably a dead end; the reference microformats parser should
probably ultimately be used instead.
---
 lib/Exception.php                     |   4 +
 lib/Parser/HTML/Construct.php         |  44 +++++++++
 lib/Parser/HTML/Feed.php              | 124 ++++++++++++++++++++++++++
 lib/Parser/XML/Construct.php          |   2 +-
 lib/Parser/XML/XPath.php              |   4 +-
 tests/cases/Parser/HTML/failures.yaml |  17 ++++
 tests/cases/Parser/HTML/feed.yaml     |  53 +++++++++++
 tests/cases/Parser/HTMLTest.php       |  28 ++++++
 8 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 lib/Parser/HTML/Construct.php
 create mode 100644 lib/Parser/HTML/Feed.php
 create mode 100644 tests/cases/Parser/HTML/failures.yaml
 create mode 100644 tests/cases/Parser/HTML/feed.yaml
 create mode 100644 tests/cases/Parser/HTMLTest.php

diff --git a/lib/Exception.php b/lib/Exception.php
index 7a1d222..ce2d764 100644
--- a/lib/Exception.php
+++ b/lib/Exception.php
@@ -12,10 +12,14 @@ abstract class Exception extends \Exception {
         'notSupportedType'          => [0x1101, "Document type is not supported"],
         'notJSONType'               => [0x1111, "Document Content-Type is not either that of JSON Feed or generic JSON"],
         'notXMLType'                => [0x1111, "Document Content-Type is not that of an XML newsfeed"],
+        'notHTMLType'               => [0x1111, "Document Content-Type is not that of an HTML newsfeed"],
         'notJSON'                   => [0x1112, "Document is not valid JSON"],
         'notXML'                    => [0x1112, "Document is not well-formed XML"],
+        'notHTML'                   => [0x1112, "Document is not well-formed HTML"], // this condition should be impossible
+        'notXHTML'                  => [0x1112, "Document is well-formed XML, but is not XHTML"],
         'notJSONFeed'               => [0x1113, "Document is not a JSON Feed document"],
         'notXMLFeed'                => [0x1113, "Document is not a newsfeed"],
+        'notHTMLFeed'               => [0x1113, "Document does not contain a newsfeed"],
         // Fetching: 0x1200
         'httpStatus400'             => [0x1201, "Client request was not acceptable to the server (code 400)"],
         'httpStatus401'             => [0x1202, "Supplied credentials are insufficient to access the resource (code 401)"],
diff --git a/lib/Parser/HTML/Construct.php b/lib/Parser/HTML/Construct.php
new file mode 100644
index 0000000..04d6b10
--- /dev/null
+++ b/lib/Parser/HTML/Construct.php
@@ -0,0 +1,44 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Lax\Parser\HTML;
+
+abstract class Construct {
+    use \MensBeam\Lax\Parser\Construct;
+
+    /** @var \DOMDocument The document being parsed */
+    protected $document;
+    /** @var \DOMXPath The XPath processor used to query the document */
+    protected $xpath;
+    /** @var \DOMElement The element used as query context when none is supplied */
+    protected $subject;
+
+    /** Retrieves the first element selected by an XPath query which also bears the given class name; returns null if nothing matches */
+    protected function fetchElement(string $class, string $query = "/*", ?\DOMNode $context = null): ?\DOMElement {
+        // the class attribute is space-padded before searching so that only whole class names match
+        $el = $this->xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject);
+        assert($el !== false, "Invalid XPath query: \"$query\"");
+        return $el->length ? $el->item(0) : null;
+    }
+
+    /** Returns the language tag of the nearest element (the subject or one of its ancestors) which declares one, or null if none does */
+    public function getLang(): ?string {
+        // a namespaced root element denotes XHTML, where xml:lang is preferred over the plain lang attribute
+        $xhtml = (bool) $this->subject->ownerDocument->documentElement->namespaceURI;
+        // only elements carry attributes: the walk must end before reaching the DOMDocument at the top of the tree
+        for ($el = $this->subject; $el instanceof \DOMElement; $el = $el->parentNode) {
+            $out = $xhtml ? trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang") ?? "") : "";
+            if (!strlen($out)) {
+                $out = trim($el->getAttribute("lang") ?? "");
+            }
+            if (strlen($out)) {
+                return $out;
+            }
+        }
+        // no element in the chain declares a language
+        return null;
+    }
+}
diff --git a/lib/Parser/HTML/Feed.php b/lib/Parser/HTML/Feed.php
new file mode 100644
index 0000000..1ee79c8
--- /dev/null
+++ b/lib/Parser/HTML/Feed.php
@@ -0,0 +1,124 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Lax\Parser\HTML;
+
+use MensBeam\Lax\Date;
+use MensBeam\Lax\Text;
+use MensBeam\Lax\Url;
+use MensBeam\Lax\MimeType;
+use MensBeam\Lax\Schedule;
+use MensBeam\Lax\Feed as FeedStruct;
+use MensBeam\Lax\Parser\Exception;
+use MensBeam\Lax\Parser\XML\XPath;
+use MensBeam\Lax\Category\Collection as CategoryCollection;
+use MensBeam\Lax\Person\Collection as PersonCollection;
+
+/** Parser for h-feed microformats in HTML and XHTML documents; apart from initialization, every getter is still a placeholder returning an empty value */
+class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
+    use \MensBeam\Lax\Parser\AbstractFeed;
+
+    // NOTE(review): LIBXML_NOENT turns on entity substitution, which the PHP manual warns may facilitate XXE attacks; confirm it is needed for untrusted input
+    protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | \LIBXML_NOWARNING;
+    /** Media type essences which this parser accepts */
+    public const MIME_TYPES = ["text/html", "application/xhtml+xml"];
+
+    /** @var string The raw text of the document to parse */
+    protected $data;
+    /** @var string The Content-Type the document was served with */
+    protected $contentType;
+    /** @var string The URL of the document; used as its document URI and as the feed's metadata URL */
+    protected $url;
+    /** @var \DOMElement */
+    protected $subject;
+    /** @var \DOMXpath */
+    protected $xpath;
+
+    /** Performs initialization of the instance: parses the document, locates the h-feed element, and fills in the feed's format information */
+    protected function init(FeedStruct $feed): FeedStruct {
+        $type = MimeType::parse($this->contentType) ?? ""; // falsy when no media type could be parsed, which skips both checks below
+        if ($type && !in_array($type->essence, self::MIME_TYPES, true)) {
+            throw new Exception("notHTMLType");
+        }
+        if ($type && $type->essence === "application/xhtml+xml") {
+            $this->document = new \DOMDocument;
+            if ($this->data === "" || !$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
+                // ignore XML parsing errors (and empty input, which PHP 8 rejects with a ValueError); we will reparse as HTML in this case
+                $this->document = null;
+            } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) {
+                throw new Exception("notXHTML");
+            }
+        }
+        if (!$this->document) {
+            $this->document = new \DOMDocument;
+            if ($this->data === "" || !$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) { // empty input is reported as a parser exception rather than a ValueError
+                throw new Exception("notHTML"); // @codeCoverageIgnore
+            }
+        }
+        $this->document->documentURI = $this->url;
+        $this->xpath = new \DOMXPath($this->document);
+        $this->subject = $this->fetchElement("h-feed", "//*"); // any element in the document may be the feed root
+        if (!$this->subject) {
+            throw new Exception("notHTMLFeed");
+        }
+        $feed->meta->url = $this->url;
+        $feed->format = "h-feed";
+        $feed->version = "1";
+        return $feed;
+    }
+
+    /** {@inheritDoc}
+     *
+     * h-feeds do not have IDs, so this is always null.
+     */
+    public function getId(): ?string {
+        return null;
+    }
+
+    public function getUrl(): ?Url {
+        return null;
+    }
+
+    public function getTitle(): ?Text {
+        return null;
+    }
+
+    public function getLink(): ?Url {
+        return null;
+    }
+
+    public function getSummary(): ?Text {
+        return null;
+    }
+
+    public function getDateModified(): ?Date {
+        return null;
+    }
+
+    public function getIcon(): ?Url {
+        return null;
+    }
+
+    public function getImage(): ?Url {
+        return null;
+    }
+
+    public function getCategories(): CategoryCollection {
+        return new CategoryCollection;
+    }
+
+    public function getPeople(): PersonCollection {
+        return new PersonCollection;
+    }
+
+    public function getEntries(FeedStruct $feed): array {
+        return [];
+    }
+
+    public function getSchedule(): Schedule {
+        return new Schedule;
+    }
+}
\ No newline at end of file
diff --git a/lib/Parser/XML/Construct.php b/lib/Parser/XML/Construct.php
index cc7c9d6..320689c 100644
--- a/lib/Parser/XML/Construct.php
+++ b/lib/Parser/XML/Construct.php
@@ -293,7 +293,7 @@ abstract class Construct {
                     }
                     break;
                 case "application/xhtml+xml":
-                    if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("xhtml:div", $node))) {
+                    if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("html:div", $node))) {
                         $out->xhtml = $xhtml->ownerDocument->saveXML($xhtml);
                         $out->xhtmlBase = strlen($xhtml->baseURI) ? $xhtml->baseURI : null;
                     }
diff --git a/lib/Parser/XML/XPath.php b/lib/Parser/XML/XPath.php
index 5a9ac5f..a3314fd 100644
--- a/lib/Parser/XML/XPath.php
+++ b/lib/Parser/XML/XPath.php
@@ -19,12 +19,13 @@ class XPath extends \DOMXpath {
         'media'    => "http://search.yahoo.com/mrss/",                     // Embedded media extension                 http://www.rssboard.org/media-rss
         'rss1file' => "http://purl.oclc.org/net/rss_2.0/enc#",             // RSS 1.0 enclosures                       https://foz.home.xs4all.nl/mod_enclosure.html
         'rdf'      => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",       // Resource Description Framework           https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/
-        'xhtml'    => "http://www.w3.org/1999/xhtml",                      // XHTML                                    https://html.spec.whatwg.org/
+        'html'     => "http://www.w3.org/1999/xhtml",                      // XHTML                                    https://html.spec.whatwg.org/
         'apple'    => "http://www.itunes.com/dtds/podcast-1.0.dtd",        // iTunes podcasts                          https://help.apple.com/itc/podcasts_connect/#/itcb54353390
         'gplay'    => "http://www.google.com/schemas/play-podcasts/1.0",   // Google Play podcasts                     https://support.google.com/googleplay/podcasts/answer/6260341
     ];
 
     public $rss2 = false;
+    public $html = false;
 
     /** Returns an XPath processor with various necessary namespace prefixes defined */
     public function __construct(\DOMDocument $doc) {
@@ -37,6 +38,7 @@ class XPath extends \DOMXpath {
     /** {@inheritDoc} */
     public function query($expression, $contextnode = null, $registerNS = true) {
         $expression = $this->rss2 ? str_replace("rss2:", "", $expression) : $expression;
+        $expression = $this->html ? str_replace("html:", "", $expression) : $expression;
         return parent::query($expression, $contextnode, $registerNS);
     }
 }
diff --git a/tests/cases/Parser/HTML/failures.yaml b/tests/cases/Parser/HTML/failures.yaml
new file mode 100644
index 0000000..1fc0f18
--- /dev/null
+++ b/tests/cases/Parser/HTML/failures.yaml
@@ -0,0 +1,17 @@
+Content-Type mismatch:
+    type: text/xml
+    input: >
+        <html class="h-feed"></html>
+    exception: notHTMLType
+
+Not a feed:
+    type: text/html
+    input: >
+        <html></html>
+    exception: notHTMLFeed
+
+Not an XHTML document:
+    type: application/xhtml+xml
+    input: >
+        <html></html>
+    exception: notXHTML
diff --git a/tests/cases/Parser/HTML/feed.yaml b/tests/cases/Parser/HTML/feed.yaml
new file mode 100644
index 0000000..127f20e
--- /dev/null
+++ b/tests/cases/Parser/HTML/feed.yaml
@@ -0,0 +1,53 @@
+Basic example 1:
+    input: >
+        <html class="h-feed"></html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+Basic example 2:
+    input: >
+        <html class="home h-feed"></html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+Basic example 3:
+    input: >
+        <html class="h-feed main"></html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+Basic example 4:
+    input: >
+        <html class="home h-feed main"></html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+
+Basic example 5:
+    input: >
+        <html class="main">
+            <body class="h-feed"></body>
+        </html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+Basic XHTML example:
+    type: application/xhtml+xml 
+    input: >
+        <html xmlns="http://www.w3.org/1999/xhtml" class="h-feed"></html>
+    output:
+        format: 'h-feed'
+        version: '1'
+
+Basic XHTML fallback example:
+    type: application/xhtml+xml 
+    input: >
+        <html class="h-feed">
+    output:
+        format: 'h-feed'
+        version: '1'
\ No newline at end of file
diff --git a/tests/cases/Parser/HTMLTest.php b/tests/cases/Parser/HTMLTest.php
new file mode 100644
index 0000000..7c0a517
--- /dev/null
+++ b/tests/cases/Parser/HTMLTest.php
@@ -0,0 +1,28 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Lax\TestCase\Parser;
+
+/**
+ * @covers MensBeam\Lax\Parser\HTML\Feed<extended>
+ */
+class HTMLTest extends AbstractParserTestCase {
+    public function provideHTML(): iterable {
+        return $this->provideParserTests(__DIR__."/HTML/*.yaml");
+    }
+
+    /** @dataProvider provideHTML */
+    public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void {
+        $parser = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url);
+        if (!($exp instanceof \Exception)) {
+            $this->assertEquals($exp, $parser->parse());
+            return;
+        }
+        // failure case: the expectation must be registered before parsing is attempted
+        $this->expectExceptionObject($exp);
+        $parser->parse();
+    }
+}