Skeleton of h-feed parser
This is probably a dead end; the reference microformats parser should ultimately be used instead
This commit is contained in:
parent
f4f46c57a5
commit
7011bc0c4d
8 changed files with 274 additions and 2 deletions
|
@ -12,10 +12,14 @@ abstract class Exception extends \Exception {
|
|||
'notSupportedType' => [0x1101, "Document type is not supported"],
|
||||
'notJSONType' => [0x1111, "Document Content-Type is not either that of JSON Feed or generic JSON"],
|
||||
'notXMLType' => [0x1111, "Document Content-Type is not that of an XML newsfeed"],
|
||||
'notHTMLType' => [0x1111, "Document Content-Type is not that of an HTML newsfeed"],
|
||||
'notJSON' => [0x1112, "Document is not valid JSON"],
|
||||
'notXML' => [0x1112, "Document is not well-formed XML"],
|
||||
'notHTML' => [0x1112, "Document is not well-formed HTML"], // this condition should be impossible
|
||||
'notXHTML' => [0x1112, "Document is well-formed XML, but is not XHTML"],
|
||||
'notJSONFeed' => [0x1113, "Document is not a JSON Feed document"],
|
||||
'notXMLFeed' => [0x1113, "Document is not a newsfeed"],
|
||||
'notHTMLFeed' => [0x1113, "Document does not contain a newsfeed"],
|
||||
// Fetching: 0x1200
|
||||
'httpStatus400' => [0x1201, "Client request was not acceptable to the server (code 400)"],
|
||||
'httpStatus401' => [0x1202, "Supplied credentials are insufficient to access the resource (code 401)"],
|
||||
|
|
44
lib/Parser/HTML/Construct.php
Normal file
44
lib/Parser/HTML/Construct.php
Normal file
|
@ -0,0 +1,44 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2018 J. King et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace MensBeam\Lax\Parser\HTML;
|
||||
|
||||
/** Shared base for the HTML parser's constructs
 *
 * Holds the parsed document, an XPath processor, and the element (the
 * "subject") the construct is concerned with, along with helpers which
 * operate on them.
 */
abstract class Construct {
    use \MensBeam\Lax\Parser\Construct;

    /** @var \DOMDocument */
    protected $document;
    /** @var \DOMXPath */
    protected $xpath;
    /** @var \DOMElement */
    protected $subject;

    /** Retrieves an element node based on class name and optionally XPath query
     *
     * The class name is matched as a whole whitespace-delimited token of the
     * "class" attribute; only the first matching node is returned.
     *
     * @param string $class The class name to look for
     * @param string $query The XPath expression selecting candidate elements
     * @param \DOMNode|null $context The context node for the query; the subject is used when omitted
     * @return \DOMElement|null The first match, or null if nothing matched
     */
    protected function fetchElement(string $class, string $query = "/*", ?\DOMNode $context = null): ?\DOMElement {
        $el = $this->xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject);
        assert($el !== false, "Invalid XPath query: \"$query\"");
        return $el->length ? $el->item(0) : null;
    }

    /** Returns the nearest language tag applying to the subject, or null if none is declared
     *
     * The subject and then each of its ancestor elements are examined in turn.
     * When the document's root element is namespaced, xml:lang is preferred
     * over lang on each element.
     */
    public function getLang(): ?string {
        $el = $this->subject;
        if (!$el instanceof \DOMElement) {
            // nothing to inspect if no subject has been selected
            return null;
        }
        $xhtml = (bool) $el->ownerDocument->documentElement->namespaceURI;
        do {
            $out = null;
            if ($xhtml) {
                $out = trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang"));
                $out = strlen($out) ? $out : null;
            }
            if (is_null($out)) {
                $out = trim($el->getAttribute("lang"));
                $out = strlen($out) ? $out : null;
            }
            $el = $el->parentNode;
            // the parent of the root element is the document node, which has
            // no attributes: stop there rather than calling getAttribute() on it
        } while (is_null($out) && $el instanceof \DOMElement);
        return $out;
    }
}
|
124
lib/Parser/HTML/Feed.php
Normal file
124
lib/Parser/HTML/Feed.php
Normal file
|
@ -0,0 +1,124 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2018 J. King et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace MensBeam\Lax\Parser\HTML;
|
||||
|
||||
use MensBeam\Lax\Date;
|
||||
use MensBeam\Lax\Text;
|
||||
use MensBeam\Lax\Url;
|
||||
use MensBeam\Lax\MimeType;
|
||||
use MensBeam\Lax\Schedule;
|
||||
use MensBeam\Lax\Feed as FeedStruct;
|
||||
use MensBeam\Lax\Parser\Exception;
|
||||
use MensBeam\Lax\Parser\XML\XPath;
|
||||
use MensBeam\Lax\Category\Collection as CategoryCollection;
|
||||
use MensBeam\Lax\Person\Collection as PersonCollection;
|
||||
|
||||
/** Skeleton parser for h-feed microformat documents served as HTML or XHTML
 *
 * Only initialization is implemented so far: the document is parsed and the
 * element bearing the "h-feed" class is located. All property getters return
 * empty values.
 */
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
    use \MensBeam\Lax\Parser\AbstractFeed;

    // NOTE(review): LIBXML_NOENT asks libxml to substitute entities, and
    // LIBXML_NONET only blocks network access; confirm that expanding
    // entities is wanted when parsing untrusted documents
    protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | \LIBXML_NOWARNING;
    public const MIME_TYPES = [
        "text/html",
        "application/xhtml+xml",
    ];

    /** @var string */
    protected $data;
    /** @var string */
    protected $contentType;
    /** @var string */
    protected $url;
    /** @var \DOMElement */
    protected $subject;
    /** @var \DOMXPath */
    protected $xpath;

    /** Performs initialization of the instance
     *
     * Validates the content type (when one is available), parses the data
     * as XML for XHTML documents and as HTML otherwise (or when XML parsing
     * fails), and selects the first element bearing the "h-feed" class as
     * the subject.
     *
     * @param FeedStruct $feed The feed structure to fill in
     * @return FeedStruct The same structure with its URL, format, and version set
     * @throws Exception "notHTMLType" if the content type is not an HTML or XHTML type
     * @throws Exception "notXHTML" if the data is well-formed XML but its root element is not in the XHTML namespace
     * @throws Exception "notHTML" if the data is empty or cannot be parsed as HTML
     * @throws Exception "notHTMLFeed" if no element bears the "h-feed" class
     */
    protected function init(FeedStruct $feed): FeedStruct {
        $type = MimeType::parse($this->contentType) ?? "";
        if ($type && !in_array($type->essence, self::MIME_TYPES, true)) {
            throw new Exception("notHTMLType");
        }
        if ($this->data === "") {
            // DOMDocument's loaders reject an empty source with a ValueError
            // on PHP 8, so report the failure with the parser's own exception
            throw new Exception("notHTML");
        }
        if ($type && $type->essence === "application/xhtml+xml") {
            $this->document = new \DOMDocument;
            if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
                // ignore XML parsing errors; we will reparse as HTML in this case
                $this->document = null;
            } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) {
                throw new Exception("notXHTML");
            }
        }
        if (!$this->document) {
            $this->document = new \DOMDocument;
            if (!$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) {
                throw new Exception("notHTML"); // @codeCoverageIgnore
            }
        }
        $this->document->documentURI = $this->url;
        $this->xpath = new \DOMXPath($this->document);
        // search the whole document for the first element marked as an h-feed
        $this->subject = $this->fetchElement("h-feed", "//*");
        if (!$this->subject) {
            throw new Exception("notHTMLFeed");
        }
        $feed->meta->url = $this->url;
        $feed->format = "h-feed";
        $feed->version = "1";
        return $feed;
    }

    /** {@inheritDoc}
     *
     * h-feeds do not have IDs, so this is always null.
     */
    public function getId(): ?string {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getUrl(): ?Url {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getTitle(): ?Text {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getLink(): ?Url {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getSummary(): ?Text {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getDateModified(): ?Date {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getIcon(): ?Url {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns null */
    public function getImage(): ?Url {
        return null;
    }

    /** Not yet implemented in this skeleton; always returns an empty collection */
    public function getCategories(): CategoryCollection {
        return new CategoryCollection;
    }

    /** Not yet implemented in this skeleton; always returns an empty collection */
    public function getPeople(): PersonCollection {
        return new PersonCollection;
    }

    /** Not yet implemented in this skeleton; always returns an empty array */
    public function getEntries(FeedStruct $feed): array {
        return [];
    }

    /** Not yet implemented in this skeleton; always returns a newly constructed schedule */
    public function getSchedule(): Schedule {
        return new Schedule;
    }
}
|
|
@ -293,7 +293,7 @@ abstract class Construct {
|
|||
}
|
||||
break;
|
||||
case "application/xhtml+xml":
|
||||
if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("xhtml:div", $node))) {
|
||||
if (is_null($out->xhtml) && ($xhtml = $this->fetchElement("html:div", $node))) {
|
||||
$out->xhtml = $xhtml->ownerDocument->saveXML($xhtml);
|
||||
$out->xhtmlBase = strlen($xhtml->baseURI) ? $xhtml->baseURI : null;
|
||||
}
|
||||
|
|
|
@ -19,12 +19,13 @@ class XPath extends \DOMXpath {
|
|||
'media' => "http://search.yahoo.com/mrss/", // Embedded media extension http://www.rssboard.org/media-rss
|
||||
'rss1file' => "http://purl.oclc.org/net/rss_2.0/enc#", // RSS 1.0 enclosures https://foz.home.xs4all.nl/mod_enclosure.html
|
||||
'rdf' => "http://www.w3.org/1999/02/22-rdf-syntax-ns#", // Resource Description Framework https://www.w3.org/TR/2014/REC-rdf11-concepts-20140225/
|
||||
'xhtml' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/
|
||||
'html' => "http://www.w3.org/1999/xhtml", // XHTML https://html.spec.whatwg.org/
|
||||
'apple' => "http://www.itunes.com/dtds/podcast-1.0.dtd", // iTunes podcasts https://help.apple.com/itc/podcasts_connect/#/itcb54353390
|
||||
'gplay' => "http://www.google.com/schemas/play-podcasts/1.0", // Google Play podcasts https://support.google.com/googleplay/podcasts/answer/6260341
|
||||
];
|
||||
|
||||
public $rss2 = false;
|
||||
public $html = false;
|
||||
|
||||
/** Returns an XPath processor with various necessary namespace prefixes defined */
|
||||
public function __construct(\DOMDocument $doc) {
|
||||
|
@ -37,6 +38,7 @@ class XPath extends \DOMXpath {
|
|||
/** {@inheritDoc}
 *
 * Before evaluation the "rss2:" prefix is removed from the expression when
 * the $rss2 flag is set, and the "html:" prefix when the $html flag is set.
 */
public function query($expression, $contextnode = null, $registerNS = true) {
    // collect the prefixes to strip, in the order they are to be removed
    $strip = [];
    if ($this->rss2) {
        $strip[] = "rss2:";
    }
    if ($this->html) {
        $strip[] = "html:";
    }
    if ($strip) {
        $expression = str_replace($strip, "", $expression);
    }
    return parent::query($expression, $contextnode, $registerNS);
}
|
||||
}
|
||||
|
|
17
tests/cases/Parser/HTML/failures.yaml
Normal file
17
tests/cases/Parser/HTML/failures.yaml
Normal file
|
@ -0,0 +1,17 @@
|
|||
Content-Type mismatch:
|
||||
type: text/xml
|
||||
input: >
|
||||
<html class="h-feed"></html>
|
||||
exception: notHTMLType
|
||||
|
||||
Not a feed:
|
||||
type: text/html
|
||||
input: >
|
||||
<html></html>
|
||||
exception: notHTMLFeed
|
||||
|
||||
Not an XHTML document:
|
||||
type: application/xhtml+xml
|
||||
input: >
|
||||
<html></html>
|
||||
exception: notXHTML
|
53
tests/cases/Parser/HTML/feed.yaml
Normal file
53
tests/cases/Parser/HTML/feed.yaml
Normal file
|
@ -0,0 +1,53 @@
|
|||
Basic example 1:
|
||||
input: >
|
||||
<html class="h-feed"></html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
Basic example 2:
|
||||
input: >
|
||||
<html class="home h-feed"></html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
Basic example 3:
|
||||
input: >
|
||||
<html class="h-feed main"></html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
Basic example 4:
|
||||
input: >
|
||||
<html class="home h-feed main"></html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
|
||||
Basic example 5:
|
||||
input: >
|
||||
<html class="main">
|
||||
<body class="h-feed"></body>
|
||||
</html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
Basic XHTML example:
|
||||
type: application/xhtml+xml
|
||||
input: >
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" class="h-feed"></html>
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
||||
|
||||
Basic XHTML fallback example:
|
||||
type: application/xhtml+xml
|
||||
input: >
|
||||
<html class="h-feed">
|
||||
output:
|
||||
format: 'h-feed'
|
||||
version: '1'
|
28
tests/cases/Parser/HTMLTest.php
Normal file
28
tests/cases/Parser/HTMLTest.php
Normal file
|
@ -0,0 +1,28 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2018 J. King
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace MensBeam\Lax\TestCase\Parser;
|
||||
|
||||
/**
|
||||
* @covers MensBeam\Lax\Parser\HTML\Feed<extended>
|
||||
*/
|
||||
class HTMLTest extends AbstractParserTestCase {
    /**
     * Parses one test case and checks either the exception thrown or the
     * parsed result, depending on what the case expects
     *
     * @dataProvider provideHTML
     */
    public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void {
        $parser = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url);
        if (!$exp instanceof \Exception) {
            // success case: the parsed output must equal the expectation
            $this->assertEquals($exp, $parser->parse());
            return;
        }
        // failure case: parsing must throw the expected exception
        $this->expectExceptionObject($exp);
        $parser->parse();
    }

    /** Supplies the test cases defined in the YAML files of the HTML directory */
    public function provideHTML(): iterable {
        $pattern = __DIR__."/HTML/*.yaml";
        return $this->provideParserTests($pattern);
    }
}
|
Loading…
Reference in a new issue