Skeleton of h-feed parser

This is probably a dead end; the reference microformats parser should
probably ultimately be used instead.
This commit is contained in:
J. King 2020-09-23 10:01:59 -04:00
parent f4f46c57a5
commit 7011bc0c4d
8 changed files with 274 additions and 2 deletions

View file

@ -12,10 +12,14 @@ abstract class Exception extends \Exception {
'notSupportedType' => [0x1101, "Document type is not supported"],
'notJSONType' => [0x1111, "Document Content-Type is not either that of JSON Feed or generic JSON"],
'notXMLType' => [0x1111, "Document Content-Type is not that of an XML newsfeed"],
'notHTMLType' => [0x1111, "Document Content-Type is not that of an HTML newsfeed"],
'notJSON' => [0x1112, "Document is not valid JSON"],
'notXML' => [0x1112, "Document is not well-formed XML"],
'notHTML' => [0x1112, "Document is not well-formed HTML"], // this condition should be impossible
'notXHTML' => [0x1112, "Document is well-formed XML, but is not XHTML"],
'notJSONFeed' => [0x1113, "Document is not a JSON Feed document"],
'notXMLFeed' => [0x1113, "Document is not a newsfeed"],
'notHTMLFeed' => [0x1113, "Document does not contain a newsfeed"],
// Fetching: 0x1200
'httpStatus400' => [0x1201, "Client request was not acceptable to the server (code 400)"],
'httpStatus401' => [0x1202, "Supplied credentials are insufficient to access the resource (code 401)"],

View file

@ -0,0 +1,44 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\Parser\HTML;
/**
 * Base class for parser constructs which read their data out of an HTML
 * (or XHTML) DOM tree.
 *
 * It holds the parsed document, an XPath processor and the "subject" element
 * which serves as the default starting point for element and language look-ups.
 */
abstract class Construct {
    use \MensBeam\Lax\Parser\Construct;

    /** @var \DOMDocument */
    protected $document;
    /** @var \DOMXPath */
    protected $xpath;
    /** @var \DOMElement */
    protected $subject;

    /** Retrieves an element node based on class name and optionally XPath query
     *
     * @param string $class The class name to look for among the whitespace-separated tokens of the "class" attribute. It is interpolated verbatim into a single-quoted XPath string, so it must not contain an apostrophe
     * @param string $query The XPath expression selecting candidate elements; the class-name predicate is appended to it
     * @param \DOMNode|null $context The node to evaluate the query against; the subject element is used when this is null
     * @return \DOMElement|null The first matching element, or null if nothing matched
     */
    protected function fetchElement(string $class, string $query = "/*", ?\DOMNode $context = null): ?\DOMElement {
        // padding the normalized class list with spaces lets a whole token be matched with a simple substring test
        $el = $this->xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject);
        assert($el !== false, "Invalid XPath query: \"$query\"");
        return $el->length ? $el->item(0) : null;
    }

    /** Returns the language tag in effect for the subject element
     *
     * The subject and then each of its ancestor elements are examined in
     * turn; the first non-empty value found is returned. When the document
     * element has a namespace, "xml:lang" is consulted before "lang".
     *
     * @return string|null The trimmed language tag, or null if neither the subject nor any ancestor element declares one
     */
    public function getLang(): ?string {
        // walk up the tree looking for the nearest language tag, preferring XML over HTML when appropriate
        $el = $this->subject;
        $out = null;
        $xhtml = (bool) $el->ownerDocument->documentElement->namespaceURI;
        do {
            if ($xhtml) {
                $out = trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            if (is_null($out)) {
                $out = trim($el->getAttribute("lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            $el = $el->parentNode;
            // the parent of the document element is the document node itself, which
            // has no attribute accessors; stop as soon as we leave the element tree
        } while (is_null($out) && $el instanceof \DOMElement);
        return $out;
    }
}

124
lib/Parser/HTML/Feed.php Normal file
View file

@ -0,0 +1,124 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\Parser\HTML;
use MensBeam\Lax\Date;
use MensBeam\Lax\Text;
use MensBeam\Lax\Url;
use MensBeam\Lax\MimeType;
use MensBeam\Lax\Schedule;
use MensBeam\Lax\Feed as FeedStruct;
use MensBeam\Lax\Parser\Exception;
use MensBeam\Lax\Parser\XML\XPath;
use MensBeam\Lax\Category\Collection as CategoryCollection;
use MensBeam\Lax\Person\Collection as PersonCollection;
/**
 * Parser for h-feed microformat newsfeeds embedded in HTML or XHTML documents.
 *
 * This is a skeleton: initialization locates the h-feed element and fills in
 * the format information, while the individual property getters are stubs
 * which return null or empty collections.
 */
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
    use \MensBeam\Lax\Parser\AbstractFeed;

    // NOTE(review): LIBXML_NOENT enables entity substitution; the documents parsed here are
    // presumably fetched from remote servers, so confirm that this is intended
    protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | \LIBXML_NOWARNING;
    /** The media type essences this parser accepts */
    public const MIME_TYPES = [
        "text/html",
        "application/xhtml+xml",
    ];

    /** @var string */
    protected $data;
    /** @var string */
    protected $contentType;
    /** @var string */
    protected $url;
    /** @var \DOMElement */
    protected $subject;
    /** @var \DOMXpath */
    protected $xpath;

    /** Performs initialization of the instance
     *
     * The document is parsed as XML when its type is application/xhtml+xml,
     * falling back to the HTML parser if that fails; any other accepted (or
     * missing) type goes straight to the HTML parser. The first element
     * bearing the "h-feed" class then becomes the subject.
     *
     * @param FeedStruct $feed The feed structure to fill in with the URL, format and version
     * @return FeedStruct The same structure which was passed in
     * @throws Exception "notHTMLType" if a content type is available and is not one of MIME_TYPES
     * @throws Exception "notXHTML" if the document is well-formed XML but its root element is not in the XHTML namespace
     * @throws Exception "notHTML" if the HTML parser rejects the document
     * @throws Exception "notHTMLFeed" if no element with the "h-feed" class exists
     */
    protected function init(FeedStruct $feed): FeedStruct {
        // an unparsable or missing content type yields an empty string, which skips the type checks below
        $type = MimeType::parse($this->contentType) ?? "";
        if ($type && !in_array($type->essence, self::MIME_TYPES, true)) {
            throw new Exception("notHTMLType");
        }
        // NOTE(review): on PHP 8 loadXML() and loadHTML() are documented to throw a ValueError
        // for an empty source string; confirm whether empty input can reach this point
        if ($type && $type->essence === "application/xhtml+xml") {
            $this->document = new \DOMDocument;
            if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
                // ignore XML parsing errors; we will reparse as HTML in this case
                $this->document = null;
            } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) {
                throw new Exception("notXHTML");
            }
        }
        if (!$this->document) {
            $this->document = new \DOMDocument;
            if (!$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) {
                throw new Exception("notHTML"); // @codeCoverageIgnore
            }
        }
        $this->document->documentURI = $this->url;
        $this->xpath = new \DOMXPath($this->document);
        // no subject exists yet, so this search is evaluated without a context node
        $this->subject = $this->fetchElement("h-feed", "//*");
        if (!$this->subject) {
            throw new Exception("notHTMLFeed");
        }
        $feed->meta->url = $this->url;
        $feed->format = "h-feed";
        $feed->version = "1";
        return $feed;
    }

    /** {@inheritDoc}
     *
     * h-feeds do not have IDs, so this is always null.
     */
    public function getId(): ?string {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getUrl(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getTitle(): ?Text {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getLink(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getSummary(): ?Text {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getDateModified(): ?Date {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getIcon(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getImage(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns a newly created collection */
    public function getCategories(): CategoryCollection {
        return new CategoryCollection;
    }

    /** Not yet implemented; always returns a newly created collection */
    public function getPeople(): PersonCollection {
        return new PersonCollection;
    }

    /** Not yet implemented; always returns an empty array */
    public function getEntries(FeedStruct $feed): array {
        return [];
    }

    /** Not yet implemented; always returns a newly created schedule */
    public function getSchedule(): Schedule {
        return new Schedule;
    }
}

View file

@ -293,7 +293,7 @@ abstract class Construct {
}
break;
case "application/xhtml+xml":
if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("xhtml:div", $node))) {
if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("html:div", $node))) {
$out->xhtml = $xhtml->ownerDocument->saveXML($xhtml);
$out->xhtmlBase = strlen($xhtml->baseURI) ? $xhtml->baseURI : null;
}

View file

@ -19,12 +19,13 @@ class XPath extends \DOMXpath {
'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss
'rss1file' => "http://purl.oclc.org/net/rss_2.0/enc#", // RSS 1.0 enclosures https://foz.home.xs4all.nl/mod_enclosure.html
'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/
'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/
'html' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/
'apple' => "http://www.itunes.com/dtds/podcast-1.0.dtd", // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390
'gplay' => "http://www.google.com/schemas/play-podcasts/1.0", // Google Play podcasts https://support.google.com/googleplay/podcasts/answer/6260341
];
public $rss2 = false;
public $html = false;
/** Returns an XPath processor with various necessary namespace prefixes defined */
public function __construct(\DOMDocument $doc) {
@ -37,6 +38,7 @@ class XPath extends \DOMXpath {
/** {@inheritDoc}
 *
 * Before the expression is evaluated, every "rss2:" prefix is removed from
 * it when the $rss2 flag is set, and every "html:" prefix is removed when
 * the $html flag is set.
 */
public function query($expression, $contextnode = null, $registerNS = true) {
    if ($this->rss2) {
        $expression = str_replace("rss2:", "", $expression);
    }
    if ($this->html) {
        $expression = str_replace("html:", "", $expression);
    }
    return parent::query($expression, $contextnode, $registerNS);
}
}

View file

@ -0,0 +1,17 @@
Content-Type mismatch:
type: text/xml
input: >
<html class="h-feed"></html>
exception: notHTMLType
Not a feed:
type: text/html
input: >
<html></html>
exception: notHTMLFeed
Not an XHTML document:
type: application/xhtml+xml
input: >
<html></html>
exception: notXHTML

View file

@ -0,0 +1,53 @@
Basic example 1:
input: >
<html class="h-feed"></html>
output:
format: 'h-feed'
version: '1'
Basic example 2:
input: >
<html class="home h-feed"></html>
output:
format: 'h-feed'
version: '1'
Basic example 3:
input: >
<html class="h-feed main"></html>
output:
format: 'h-feed'
version: '1'
Basic example 4:
input: >
<html class="home h-feed main"></html>
output:
format: 'h-feed'
version: '1'
Basic example 5:
input: >
<html class="main">
<body class="h-feed"></body>
</html>
output:
format: 'h-feed'
version: '1'
Basic XHTML example:
type: application/xhtml+xml
input: >
<html xmlns="http://www.w3.org/1999/xhtml" class="h-feed"></html>
output:
format: 'h-feed'
version: '1'
Basic XHTML fallback example:
type: application/xhtml+xml
input: >
<html class="h-feed">
output:
format: 'h-feed'
version: '1'

View file

@ -0,0 +1,28 @@
<?php
/** @license MIT
* Copyright 2018 J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Lax\TestCase\Parser;
/**
* @covers MensBeam\Lax\Parser\HTML\Feed<extended>
*/
class HTMLTest extends AbstractParserTestCase {
    /** Supplies the data sets built by provideParserTests() from the YAML files in the HTML directory beside this file */
    public function provideHTML(): iterable {
        return $this->provideParserTests(__DIR__."/HTML/*.yaml");
    }

    /**
     * Parses one input document and checks the outcome: either the expected
     * exception is thrown, or the parsed result equals the expected value.
     *
     * @dataProvider provideHTML
     */
    public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void {
        $parser = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url);
        if (!($exp instanceof \Exception)) {
            // success case: compare the parsed output against the expectation
            $this->assertEquals($exp, $parser->parse());
            return;
        }
        // failure case: the expectation must be registered before parsing
        $this->expectExceptionObject($exp);
        $parser->parse();
    }
}