Browse Source
This is probably a dead end; the reference microformats parser should probably ultimately be used instead
master
J. King
4 years ago
8 changed files with 274 additions and 2 deletions
@ -0,0 +1,44 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Lax\Parser\HTML; |
|||
|
|||
/** Base class for HTML (microformats) parser constructs.
 *
 * Holds the DOM state shared by HTML constructs and provides helpers for
 * locating elements by class name and for resolving the effective language
 * of the subject element.
 */
abstract class Construct {
    use \MensBeam\Lax\Parser\Construct;

    /** @var \DOMDocument The document being examined */
    protected $document;
    /** @var \DOMXPath XPath evaluator used by fetchElement() */
    protected $xpath;
    /** @var \DOMElement The element this construct operates on */
    protected $subject;

    /** Retrieves an element node based on class name and optionally XPath query
     *
     * The class name is matched as a whole, whitespace-delimited token of the
     * element's "class" attribute.
     *
     * @param string $class The class name to look for; it is interpolated into the XPath expression verbatim
     * @param string $query The XPath query selecting candidate elements
     * @param \DOMNode|null $context The node to evaluate the query against; defaults to the subject element
     * @return \DOMElement|null The first matching element, or null if there is none
     */
    protected function fetchElement(string $class, string $query = "/*", ?\DOMNode $context = null): ?\DOMElement {
        $el = $this->xpath->query($query."[contains(concat(' ', normalize-space(@class), ' '), ' $class ')][1]", $context ?? $this->subject);
        assert($el !== false, "Invalid XPath query: \"$query\"");
        return $el->length ? $el->item(0) : null;
    }

    /** Returns the language tag in effect for the subject element
     *
     * Walks up the tree from the subject looking for the nearest non-empty
     * language tag. When the document element has a namespace, xml:lang is
     * preferred over the plain "lang" attribute on each element.
     *
     * @return string|null The trimmed language tag, or null if no ancestor declares one
     */
    public function getLang(): ?string {
        $el = $this->subject;
        $out = null;
        $xhtml = (bool) $el->ownerDocument->documentElement->namespaceURI;
        // Only element nodes carry attributes: the walk must stop before the
        // document node, which is the parent of the root element and has no
        // getAttribute()/getAttributeNS() methods
        while (is_null($out) && $el instanceof \DOMElement) {
            if ($xhtml) {
                $out = trim($el->getAttributeNS("http://www.w3.org/XML/1998/namespace", "lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            if (is_null($out)) {
                $out = trim($el->getAttribute("lang") ?? "");
                $out = strlen($out) ? $out : null;
            }
            $el = $el->parentNode;
        }
        return $out;
    }
}
@ -0,0 +1,124 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Lax\Parser\HTML; |
|||
|
|||
use MensBeam\Lax\Date; |
|||
use MensBeam\Lax\Text; |
|||
use MensBeam\Lax\Url; |
|||
use MensBeam\Lax\MimeType; |
|||
use MensBeam\Lax\Schedule; |
|||
use MensBeam\Lax\Feed as FeedStruct; |
|||
use MensBeam\Lax\Parser\Exception; |
|||
use MensBeam\Lax\Parser\XML\XPath; |
|||
use MensBeam\Lax\Category\Collection as CategoryCollection; |
|||
use MensBeam\Lax\Person\Collection as PersonCollection; |
|||
|
|||
/** Parser for h-feed microformats embedded in HTML or XHTML documents.
 *
 * Only initialization is implemented: the document is loaded and the first
 * element carrying the "h-feed" class is located. All property getters are
 * placeholders which return null or empty collections.
 */
class Feed extends Construct implements \MensBeam\Lax\Parser\Feed {
    use \MensBeam\Lax\Parser\AbstractFeed;

    // NOTE(review): LIBXML_NOENT enables entity substitution; since feed data
    // is normally untrusted, confirm this is intended alongside LIBXML_NONET
    protected const LIBXML_OPTIONS = \LIBXML_BIGLINES | \LIBXML_COMPACT | \LIBXML_HTML_NODEFDTD | \LIBXML_NOCDATA | \LIBXML_NOENT | \LIBXML_NONET | \LIBXML_NOERROR | \LIBXML_NOWARNING;
    /** The media types this parser accepts */
    public const MIME_TYPES = [
        "text/html",
        "application/xhtml+xml",
    ];

    /** @var string The raw document source */
    protected $data;
    /** @var string The media type the document was declared with */
    protected $contentType;
    /** @var string The URL of the document */
    protected $url;
    /** @var \DOMElement The element carrying the "h-feed" class */
    protected $subject;
    /** @var \DOMXpath */
    protected $xpath;

    /** Performs initialization of the instance
     *
     * Validates the declared media type, loads the document (as XML for
     * application/xhtml+xml, falling back to HTML if XML parsing fails),
     * and locates the h-feed element.
     *
     * @param FeedStruct $feed The feed structure to populate with the URL, format, and version
     * @return FeedStruct The same structure passed in
     * @throws Exception "notHTMLType" if a media type was supplied which is not in MIME_TYPES
     * @throws Exception "notXHTML" if an XHTML document's root element is not in the HTML namespace
     * @throws Exception "notHTML" if the data is empty or cannot be loaded as HTML
     * @throws Exception "notHTMLFeed" if no element has the "h-feed" class
     */
    protected function init(FeedStruct $feed): FeedStruct {
        $type = MimeType::parse($this->contentType) ?? "";
        if ($type && !in_array($type->essence, self::MIME_TYPES, true)) {
            throw new Exception("notHTMLType");
        }
        if (!strlen((string) $this->data)) {
            // DOMDocument::loadXML() and ::loadHTML() reject empty input
            // outright (with a ValueError as of PHP 8) rather than returning
            // false, so report it with the parser's own exception instead
            throw new Exception("notHTML");
        }
        if ($type && $type->essence === "application/xhtml+xml") {
            $this->document = new \DOMDocument;
            if (!$this->document->loadXML($this->data, self::LIBXML_OPTIONS)) {
                // ignore XML parsing errors; we will reparse as HTML in this case
                $this->document = null;
            } elseif ($this->document->documentElement->namespaceURI !== XPath::NS['html']) {
                throw new Exception("notXHTML");
            }
        }
        if (!$this->document) {
            $this->document = new \DOMDocument;
            if (!$this->document->loadHTML($this->data, self::LIBXML_OPTIONS)) {
                throw new Exception("notHTML"); // @codeCoverageIgnore
            }
        }
        $this->document->documentURI = $this->url;
        $this->xpath = new \DOMXPath($this->document);
        // the subject is not set yet, so this searches the whole document
        $this->subject = $this->fetchElement("h-feed", "//*");
        if (!$this->subject) {
            throw new Exception("notHTMLFeed");
        }
        $feed->meta->url = $this->url;
        $feed->format = "h-feed";
        $feed->version = "1";
        return $feed;
    }

    /** {@inheritDoc}
     *
     * h-feeds do not have IDs, so this is always null.
     */
    public function getId(): ?string {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getUrl(): ?Url {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getTitle(): ?Text {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getLink(): ?Url {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getSummary(): ?Text {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getDateModified(): ?Date {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getIcon(): ?Url {
        return null;
    }

    /** Placeholder: currently always returns null */
    public function getImage(): ?Url {
        return null;
    }

    /** Placeholder: currently always returns an empty collection */
    public function getCategories(): CategoryCollection {
        return new CategoryCollection;
    }

    /** Placeholder: currently always returns an empty collection */
    public function getPeople(): PersonCollection {
        return new PersonCollection;
    }

    /** Placeholder: currently always returns an empty array */
    public function getEntries(FeedStruct $feed): array {
        return [];
    }

    /** Placeholder: currently always returns a newly constructed schedule */
    public function getSchedule(): Schedule {
        return new Schedule;
    }
}
@ -0,0 +1,17 @@ |
|||
Content-Type mismatch: |
|||
type: text/xml |
|||
input: > |
|||
<html class="h-feed"></html> |
|||
exception: notHTMLType |
|||
|
|||
Not a feed: |
|||
type: text/html |
|||
input: > |
|||
<html></html> |
|||
exception: notHTMLFeed |
|||
|
|||
Not an XHTML document: |
|||
type: application/xhtml+xml |
|||
input: > |
|||
<html></html> |
|||
exception: notXHTML |
@ -0,0 +1,53 @@ |
|||
Basic example 1: |
|||
input: > |
|||
<html class="h-feed"></html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
Basic example 2: |
|||
input: > |
|||
<html class="home h-feed"></html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
Basic example 3: |
|||
input: > |
|||
<html class="h-feed main"></html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
Basic example 4: |
|||
input: > |
|||
<html class="home h-feed main"></html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
|
|||
Basic example 5: |
|||
input: > |
|||
<html class="main"> |
|||
<body class="h-feed"></body> |
|||
</html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
Basic XHTML example: |
|||
type: application/xhtml+xml |
|||
input: > |
|||
<html xmlns="http://www.w3.org/1999/xhtml" class="h-feed"></html> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
|||
|
|||
Basic XHTML fallback example: |
|||
type: application/xhtml+xml |
|||
input: > |
|||
<html class="h-feed"> |
|||
output: |
|||
format: 'h-feed' |
|||
version: '1' |
@ -0,0 +1,28 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Lax\TestCase\Parser; |
|||
|
|||
/**
 * Data-driven tests for the HTML (h-feed) feed parser.
 *
 * @covers MensBeam\Lax\Parser\HTML\Feed<extended>
 */
class HTMLTest extends AbstractParserTestCase {
    /**
     * Parses one fixture and checks the outcome.
     *
     * When the expectation is an exception object, parsing must throw a
     * matching exception; otherwise the parse result must equal the
     * expectation.
     *
     * @dataProvider provideHTML
     */
    public function testParseAnHtmlFeed(string $input, string $type, ?string $url, $exp): void {
        $parser = new \MensBeam\Lax\Parser\HTML\Feed($input, $type, $url);
        if ($exp instanceof \Exception) {
            // failure case: the expectation must be registered before parsing
            $this->expectExceptionObject($exp);
            $parser->parse();
            return;
        }
        $this->assertEquals($exp, $parser->parse());
    }

    /** Supplies test cases from the YAML files in the "HTML" directory beside this file */
    public function provideHTML(): iterable {
        $pattern = __DIR__."/HTML/*.yaml";
        return $this->provideParserTests($pattern);
    }
}
Loading…
Reference in new issue