Browse Source

Add function to parse from a URL

master
J. King 11 months ago
parent
commit
c2c14c6155
  1. 37
      lib/Microformats.php
  2. 21
      tests/cases/MicroformatsTest.php
  3. 10
      tests/docroot/404.php
  4. 8
      tests/docroot/redir.php
  5. 10
      tests/docroot/utf8.php
  6. 81
      tests/server.php

37
lib/Microformats.php

@ -9,6 +9,7 @@ namespace MensBeam;
use MensBeam\HTML\Parser as HTMLParser;
use MensBeam\Microformats\Parser as MfParser;
use MensBeam\Microformats\Url;
/** A generic parser for microformats
*
@ -28,14 +29,46 @@ use MensBeam\Microformats\Parser as MfParser;
* data may be supported in future.
*/
class Microformats {
/** Parses a resource at a URL for microformats
 *
 * If retrieving the resource fails `null` is returned.
 *
 * When the resource is retrieved via the HTTP stream wrapper, the response
 * headers are examined: the `Content-Type` of the last response is passed
 * on to the parser, and any `Location` headers seen along the way are
 * resolved in sequence so that the last redirect target is passed on
 * instead of the URL originally requested.
 *
 * @param string $url The URL of the resource to retrieve and parse
 * @param array $options Options for the parser; please see the class documentation for details
 */
public static function fromUrl(string $url, array $options = []): ?array {
    $stream = fopen($url, "r");
    if (!$stream) {
        return null;
    }
    // read everything we need from the stream, then release it right away
    // so the handle is not leaked regardless of the outcome
    $data = stream_get_contents($stream);
    $meta = stream_get_meta_data($stream);
    fclose($stream);
    if ($data === false) {
        return null;
    }
    $location = null;
    $type = null;
    if (($meta['wrapper_type'] ?? null) === "http") {
        // the wrapper data lists the header lines of every response in a
        // redirect chain, in order; each one starts with a status line
        foreach ($meta['wrapper_data'] ?? [] as $h) {
            if (preg_match('/^HTTP\//i', $h)) {
                // a new response begins: forget the previous response's type
                $type = null;
            } elseif (preg_match('/^Location\s*:\s*(.*)/is', $h, $match)) {
                // redirect targets may be relative to the previous location
                $location = (string) Url::fromString(trim($match[1]), $location ?? $url);
            } elseif (preg_match('/^Content-Type\s*:\s*(.*)/is', $h, $match)) {
                $type = trim($match[1]);
            }
        }
    }
    return static::fromString($data, $type ?? "", $location ?? $url, $options);
}
/** Parses a file for microformats
*
* If reading the file fails `null` is returned.
*
* While fopen wrappers can be used to open remote resources over HTTP, no
* effort is made to support this specially by reading the `Content-Type`
* header or deducing the URL. Using a proper HTTP client such as Guzzle
* is highly recommended instead.
* header or deducing the final URL. The `Microformats::fromUrl` method
* should be used for this purpose instead.
*
* @param string $file The file to read and parse
* @param string $contentType The HTTP Content-Type of the file if known, optionally with parameters

21
tests/cases/MicroformatsTest.php

@ -21,4 +21,25 @@ class MicroformatsTest extends \PHPUnit\Framework\TestCase {
/** Reading a file which does not exist must yield null rather than a result */
public function testParseMissingFile(): void {
    // the warning emitted by the failed read is deliberately silenced
    $result = @Microformats::fromFile("THIS FILE DOES NOT EXIST", "", "");
    $this->assertNull($result);
}
/** A redirected request must be parsed as the document the redirect leads to */
public function testParseRedirectedUrl(): void {
    // the single h-test item expected from the document behind the redirect
    $item = [
        'type' => ["h-test"],
        'properties' => [
            'name' => ["Ça et là"],
            'url' => ["http://localhost:8000/root.html"],
        ],
    ];
    $expected = [
        'items' => [$item],
        'rels' => [],
        'rel-urls' => [],
    ];
    $actual = Microformats::fromUrl("http://localhost:8000/redir");
    $this->assertSame($expected, $actual);
}
/** A URL which cannot be retrieved must yield null rather than a result */
public function testParseInvalidUrl(): void {
    // the warning emitted by the failed request is deliberately silenced
    $result = @Microformats::fromUrl("http://localhost:8000/404");
    $this->assertNull($result);
}
}

10
tests/docroot/404.php

@ -0,0 +1,10 @@
<?php return [
// Response parameters for the test server: an error response with status 404
'code' => 404,
'mime' => "text/html; charset=utf-8",
// The body contains h-test markup even though the status signals failure
'content' => <<< HTML_DOC
<div class="h-test">
<span class="p-name">Ça et là</span>
<a class="u-url" href="root.html">Voir ici</a>
</div>
HTML_DOC,
];

8
tests/docroot/redir.php

@ -0,0 +1,8 @@
<?php return [
// Response parameters for the test server: a 302 redirect with no body
'code' => 302,
'mime' => "text/plain",
'content' => "",
// Extra header lines sent verbatim; this one points the client at /utf8
'fields' => [
"Location: /utf8",
],
];

10
tests/docroot/utf8.php

@ -0,0 +1,10 @@
<?php return [
// Response parameters for the test server: a successful HTML response
'code' => 200,
// The character encoding is declared explicitly in the media type
'mime' => "text/html; charset=utf-8",
// A single h-test item with non-ASCII text and a relative URL
'content' => <<< HTML_DOC
<div class="h-test">
<span class="p-name">Ça et là</span>
<a class="u-url" href="root.html">Voir ici</a>
</div>
HTML_DOC,
];

81
tests/server.php

@ -0,0 +1,81 @@
<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Microformats;
require_once __DIR__."/bootstrap.php";
/*
This is a so-called router for the internal PHP Web server:
<http://php.net/manual/en/features.commandline.webserver.php>
It is used to test the retrieval and parsing of remote resources
in a controlled environment, answering specific requests used in
tests with the data required to pass the test.
The parameters of the responses are kept in separate files,
which include the following data:
- Response content
- Response code
- Content type
- Whether to send cache headers
- Last modified
- Any other headers
*/
// do not keep running for a client which has gone away, and hold back all
// output until the response has been fully assembled
ignore_user_abort(false);
ob_start();

// values used for any response parameter a test file does not supply
$defaults = [
    'code'    => 200,
    'content' => "",
    'mime'    => "application/octet-stream",
    'lastMod' => time(),
    'cache'   => true,
    'fields'  => [],
];

// drop the query string; a request for the root is served by the "index" test
$url = explode("?", $_SERVER['REQUEST_URI'])[0];
$url = ($url === "/") ? "/index" : $url;

// translate the request path into the path of a test file under the docroot
$base = BASE."tests".\DIRECTORY_SEPARATOR."docroot";
$test = $base.str_replace("/", \DIRECTORY_SEPARATOR, $url).".php";

if (file_exists($test)) {
    // the test file returns an array of response parameters
    $response = array_merge($defaults, (include $test));
} else {
    // signal a missing test file with a distinctive status code
    $response = [
        'code'    => 499,
        'content' => "Test '$test' missing.",
        'mime'    => "application/octet-stream",
        'lastMod' => time(),
        'cache'   => true,
        'fields'  => [],
    ];
}

// status code first
http_response_code((int) $response['code']);

$hasBody = ((string) $response['content'] !== "");
$cacheable = $response['cache'];

// the content type is only announced when there is a body to describe
if ($hasBody) {
    header("Content-Type: {$response['mime']}");
}
// a body which may be cached also gets an entity tag derived from its content
if ($hasBody && $cacheable) {
    header(sprintf('ETag: "%s"', md5($response['content'])));
}
// the modification date is sent for cacheable responses with or without a body
if ($cacheable) {
    header("Last-Modified: ".gmdate("D, d M Y H:i:s \G\M\T", $response['lastMod']));
}
// any additional header lines are passed through untouched
foreach ($response['fields'] as $field) {
    header($field);
}

// finally emit the body and release the buffered output
echo $response['content'];
ob_end_flush();
Loading…
Cancel
Save