Browse Source
This is probably a dead end; the reference microformats parser should probably ultimately be used instead
master
J. King
4 years ago
8 changed files with 274 additions and 2 deletions
@ -0,0 +1,44 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Lax\Parser\HTML; |
||||
|
|
||||
|
/** Base class for parser constructs which read their data out of an (X)HTML document */
abstract class Construct {
    use \MensBeam\Lax\Parser\Construct;

    /** @var \DOMDocument The document the construct belongs to */
    protected $document;
    /** @var \DOMXPath The XPath processor used to query the document */
    protected $xpath;
    /** @var \DOMElement The element which serves as the default context for queries */
    protected $subject;

    /** Retrieves an element node based on class name and optionally XPath query
     *
     * The class name is matched as a whole, whitespace-delimited token of the
     * "class" attribute, so "h-feed" matches class="home h-feed" but not
     * class="h-feeder".
     *
     * @param string $class The class name to look for; it is interpolated into the XPath expression verbatim
     * @param string $query The XPath expression selecting candidate nodes
     * @param \DOMNode|null $context The node to evaluate the query against; the subject element is used when omitted
     * @return \DOMElement|null The first matching element, or null if nothing matched
     */
    protected function fetchElement(string $class, string $query = "/*", ?\DOMNode $context = null): ?\DOMElement {
        $el = $this->xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject);
        assert($el !== false, "Invalid XPath query: \"$query\"");
        return $el->length ? $el->item(0) : null;
    }

    /** Finds the language tag which applies to the subject element
     *
     * @return string|null The nearest non-empty language tag, or null if neither the subject nor any of its ancestors declares one
     */
    public function getLang(): ?string {
        // walk up the tree looking for the nearest language tag, preferring XML over HTML when appropriate
        $el = $this->subject;
        $out = null;
        // a document element with a namespace is taken to mean the document is XHTML, where xml:lang takes precedence
        $xhtml = (bool) $el->ownerDocument->documentElement->namespaceURI;
        do {
            if ($xhtml) {
                $out = trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            if (is_null($out)) {
                $out = trim($el->getAttribute("lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            $el = $el->parentNode;
            // the parent of the root element is the document node, which has no attributes
            // (and no getAttribute() method), so the walk must stop once we leave element nodes
        } while (is_null($out) && $el instanceof \DOMElement);
        return $out;
    }
}
@ -0,0 +1,124 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Lax\Parser\HTML; |
||||
|
|
||||
|
use MensBeam\Lax\Date; |
||||
|
use MensBeam\Lax\Text; |
||||
|
use MensBeam\Lax\Url; |
||||
|
use MensBeam\Lax\MimeType; |
||||
|
use MensBeam\Lax\Schedule; |
||||
|
use MensBeam\Lax\Feed as FeedStruct; |
||||
|
use MensBeam\Lax\Parser\Exception; |
||||
|
use MensBeam\Lax\Parser\XML\XPath; |
||||
|
use MensBeam\Lax\Category\Collection as CategoryCollection; |
||||
|
use MensBeam\Lax\Person\Collection as PersonCollection; |
||||
|
|
||||
|
/** Parser for h-feed microformats embedded in HTML or XHTML documents
 *
 * Only initialization is implemented at present: every accessor other than
 * init() returns an empty value.
 */
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
    use \MensBeam\Lax\Parser\AbstractFeed;

    // NOTE(review): LIBXML_NOENT turns on entity substitution, which the PHP manual warns may
    // facilitate XXE attacks; LIBXML_NONET only blocks network access. Since feed data is
    // typically untrusted, confirm whether this flag is actually required.
    protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | \LIBXML_NOWARNING;
    /** The media type essences this parser accepts */
    public const MIME_TYPES = [
        "text/html",
        "application/xhtml+xml",
    ];

    /** @var string The raw document source */
    protected $data;
    /** @var string The Content-Type the document was supplied with */
    protected $contentType;
    /** @var string The URL of the document */
    protected $url;
    /** @var \DOMElement The element carrying the "h-feed" class */
    protected $subject;
    /** @var \DOMXPath The XPath processor for the parsed document */
    protected $xpath;

    /** Performs initialization of the instance
     *
     * Validates the content type, parses the data as XHTML (falling back to
     * HTML if XML parsing fails) or as HTML, and locates the first element
     * with the "h-feed" class.
     *
     * @param FeedStruct $feed The feed structure to fill in
     * @return FeedStruct The same structure, with its URL, format, and version set
     * @throws Exception "notHTMLType" if a content type was supplied and is not an HTML type
     * @throws Exception "notXHTML" if the data parses as XML but the root element is not in the XHTML namespace
     * @throws Exception "notHTML" if the data is empty or cannot be parsed as HTML
     * @throws Exception "notHTMLFeed" if the document contains no h-feed element
     */
    protected function init(FeedStruct $feed): FeedStruct {
        $type = MimeType::parse($this->contentType) ?? "";
        if ($type && !in_array($type->essence, self::MIME_TYPES, true)) {
            throw new Exception("notHTMLType");
        }
        if ($this->data === "") {
            // DOMDocument::loadXML() and ::loadHTML() throw a ValueError for empty input on PHP 8;
            // report the condition with the parser's own exception instead
            throw new Exception("notHTML");
        }
        if ($type && $type->essence === "application/xhtml+xml") {
            $this->document = new \DOMDocument;
            if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
                // ignore XML parsing errors; we will reparse as HTML in this case
                $this->document = null;
            } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) {
                throw new Exception("notXHTML");
            }
        }
        if (!$this->document) {
            $this->document = new \DOMDocument;
            if (!$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) {
                throw new Exception("notHTML"); // @codeCoverageIgnore
            }
        }
        $this->document->documentURI = $this->url;
        $this->xpath = new \DOMXPath($this->document);
        // search the whole document rather than just the root element
        $this->subject = $this->fetchElement("h-feed", "//*");
        if (!$this->subject) {
            throw new Exception("notHTMLFeed");
        }
        $feed->meta->url = $this->url;
        $feed->format = "h-feed";
        $feed->version = "1";
        return $feed;
    }

    /** {@inheritDoc}
     *
     * h-feeds do not have IDs, so this is always null.
     */
    public function getId(): ?string {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getUrl(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getTitle(): ?Text {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getLink(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getSummary(): ?Text {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getDateModified(): ?Date {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getIcon(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns null */
    public function getImage(): ?Url {
        return null;
    }

    /** Not yet implemented; always returns an empty collection */
    public function getCategories(): CategoryCollection {
        return new CategoryCollection;
    }

    /** Not yet implemented; always returns an empty collection */
    public function getPeople(): PersonCollection {
        return new PersonCollection;
    }

    /** Not yet implemented; always returns an empty array */
    public function getEntries(FeedStruct $feed): array {
        return [];
    }

    /** Not yet implemented; always returns a newly constructed schedule */
    public function getSchedule(): Schedule {
        return new Schedule;
    }
}
@ -0,0 +1,17 @@ |
|||||
|
Content-Type mismatch: |
||||
|
type: text/xml |
||||
|
input: > |
||||
|
<html class="h-feed"></html> |
||||
|
exception: notHTMLType |
||||
|
|
||||
|
Not a feed: |
||||
|
type: text/html |
||||
|
input: > |
||||
|
<html></html> |
||||
|
exception: notHTMLFeed |
||||
|
|
||||
|
Not an XHTML document: |
||||
|
type: application/xhtml+xml |
||||
|
input: > |
||||
|
<html></html> |
||||
|
exception: notXHTML |
@ -0,0 +1,53 @@ |
|||||
|
Basic example 1: |
||||
|
input: > |
||||
|
<html class="h-feed"></html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
Basic example 2: |
||||
|
input: > |
||||
|
<html class="home h-feed"></html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
Basic example 3: |
||||
|
input: > |
||||
|
<html class="h-feed main"></html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
Basic example 4: |
||||
|
input: > |
||||
|
<html class="home h-feed main"></html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
|
||||
|
Basic example 5: |
||||
|
input: > |
||||
|
<html class="main"> |
||||
|
<body class="h-feed"></body> |
||||
|
</html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
Basic XHTML example: |
||||
|
type: application/xhtml+xml |
||||
|
input: > |
||||
|
<html xmlns="http://www.w3.org/1999/xhtml" class="h-feed"></html> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
||||
|
|
||||
|
Basic XHTML fallback example: |
||||
|
type: application/xhtml+xml |
||||
|
input: > |
||||
|
<html class="h-feed"> |
||||
|
output: |
||||
|
format: 'h-feed' |
||||
|
version: '1' |
@ -0,0 +1,28 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Lax\TestCase\Parser; |
||||
|
|
||||
|
/** |
||||
|
* @covers MensBeam\Lax\Parser\HTML\Feed<extended> |
||||
|
*/ |
||||
|
class HTMLTest extends AbstractParserTestCase {
    /**
     * Parses a single fixture and checks the outcome against the expectation,
     * which is either the expected parse result or the exception parsing should throw
     *
     * @dataProvider provideHTML
     */
    public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void {
        $parser = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url);
        if (!$exp instanceof \Exception) {
            // success case: the parsed result must equal the expected value
            $result = $parser->parse();
            $this->assertEquals($exp, $result);
            return;
        }
        // failure case: the expectation must be registered before parsing is attempted
        $this->expectExceptionObject($exp);
        $parser->parse();
    }

    /** Supplies the test cases defined in the YAML files of the "HTML" directory next to this file */
    public function provideHTML(): iterable {
        $pattern = __DIR__."/HTML/*.yaml";
        return $this->provideParserTests($pattern);
    }
}
Loading…
Reference in new issue