Browse Source

Tweak options

- change simpleTrim to thoroughTrim, its inverse. This makes all-false
be standard rather than default behaviour
- add a dateNormalization option, default to true. This makes testing
easier, though some patching of tests is still required
master
J. King 10 months ago
parent
commit
7486cc598d
  1. 7
      README.md
  2. 11
      lib/Microformats.php
  3. 32
      lib/Microformats/Parser.php
  4. 81
      tests/cases/StandardTest.php
  5. 4
      tests/cases/mensbeam/default-settings/datetime.json

7
README.md

@ -72,7 +72,8 @@ Since Microformats data is represented as a structure of nested arrays, some of
The parsing methods all optionally take an `$options` array as an argument. These options are all flags, either for experimental features, or for backwards-compatible features no longer used by default. The options are as follows:
| Key | Type | Default | Description
|--------------|---------|---------|------------
|---------------------|---------|---------|------------
| `dateNormalization` | Boolean | `true` | This option enables date and time normalization throughout microformat parsing, rather than only where required by the specification
| `impliedTz` | Boolean | `false` | Time values in microformats may have an implied date associated with them taken from a prior date value in the same microformat structure. This option allows for a time zone to be implied as well, if a time does not include its time zone.
| `lang` | Boolean | `false` | This option determines whether language information is retrieved from the parsed document and included in the output, in `lang` keys. Both microformat structures and embedded markup (`e-` property) structures are affected by this option.
| `simpleTrim` | Boolean | `false` | This option uses the "classic", simpler whitespace-trimming algorithm rather than the more aggressive one proposed for future standardization, and used by default for this algorithm. This affects both `p-` and `e-` properties.
| `lang` | Boolean | `true` | This option determines whether language information is retrieved from the parsed document and included in the output, in `lang` keys. Both microformat structures and embedded markup (`e-` property) structures are affected by this option.
| `thoroughTrim` | Boolean | `true` | This option uses the more thorough whitespace-trimming algorithm proposed for future standardization rather than the "classic", simpler whitespace-trimming algorithm mandated by the parsing specification. This affects both `p-` and `e-` properties.

11
lib/Microformats.php

@ -18,12 +18,17 @@ use MensBeam\Microformats\Url;
* is optional. Where an $options array is a possible parameter, the following
* keys are understood:
*
* - `dateNormalization` (bool) Whether to perform date and time normalization
* throughout parsing rather than only in value-class parsing where it is
* required by the specification. True by default
* - `impliedTz` (bool) Whether to allow an implied datetime value to supply an
* implied timezone to datetimes without a timezone
* - `lang` (bool) Whether to include language information in microformat and
* rich-text structures
* - `simpleTrim` (bool) Whether to use the traditional "simple" whitespace
* trimming algorithm rather than the default, more aggressive trimming algorithm
* rich-text structures. True by default
* - `thoroughTrim` (bool) Whether to use the more thorough whitespace-trimming
* algorithm proposed for future standardization rather than the "classic",
* simpler whitespace-trimming algorithm mandated by the parsing specification.
* True by default.
*
* Currently all input is assumed to be HTML, but processing of generic XML
* data may be supported in future.

32
lib/Microformats/Parser.php

@ -16,13 +16,17 @@ use MensBeam\HTML\Parser\Serializer;
* is optional. Where an $options array is a possible parameter, the following
* keys are understood:
*
* - `dateNormalization` (bool) Whether to perform date and time normalization
* throughout parsing rather than only in value-class parsing where it is
* required by the specification. True by default
* - `impliedTz` (bool) Whether to allow an implied datetime value to supply an
* implied timezone to datetimes without a timezone
* - `lang` (bool) Whether to include language information in microformat and
* rich-text structures
* - `simpleTrim` (bool) Whether to use the traditional "simple" whitespace
* trimming algorithm rather than the default, more aggressive trimming
* algorithm
* rich-text structures. True by default
* - `thoroughTrim` (bool) Whether to use the more thorough whitespace-trimming
* algorithm proposed for future standardization rather than the "classic",
* simpler whitespace-trimming algorithm mandated by the parsing specification.
* True by default.
*/
class Parser {
/** @var array A ranking of prefixes (with 1 being least preferred) to break ties when multiple properties of the same name exist on one element */
@ -910,7 +914,6 @@ class Parser {
# element, if any).
return $this->normalizeUrl($url);
case "dt":
// NOTE: Because we perform implied date resolution we don't blindly return data from nodes; returning is done below after checks
# To parse an element for a dt-x property value (whether explicit dt-* or backcompat equivalent):
if (!$isChild && ($date = $this->getValueClassPattern($node, $prefix, $backcompatTypes, $impliedDate)) !== null) {
# parse the element for the Value Class Pattern, including the date and time parsing rules. If a value is found, then return it.
@ -928,7 +931,11 @@ class Parser {
# else return the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements.
$date = $this->getCleanText($node, $prefix);
}
if ($this->options['dateNormalization']) {
return $this->stitchDate($this->parseDatePart($date), $impliedDate) ?? $date;
} else {
return $date;
}
case "e":
# To parse an element for a e-x property value (whether explicit "e-*" or backcompat equivalent):
# return a dictionary with two keys:
@ -1263,7 +1270,7 @@ class Parser {
* @param string $prefix The prefix of the microformat property the text is to be used for. This is only relevant for the "simple" algorithm
*/
protected function getCleanText(\DOMElement $node, string $prefix): string {
if ($this->options['simpleTrim']) {
if (!$this->options['thoroughTrim']) {
return $this->getCleanTextSimple($node, $prefix);
} else {
// https://microformats.org/wiki/textcontent-parsing
@ -1469,6 +1476,10 @@ class Parser {
return $next;
}
/** Trims whitespace from both ends of a string
 *
 * Only the characters space, carriage return, line feed, tab, and form feed
 * are stripped; this differs from PHP's default trim() character list
 *
 * @param string $str The string to trim
 */
protected function trim(string $str): string {
return trim($str, " \r\n\t\f");
}
/** Normalizes an array of options
*
* Default values are filled in and unknown options removed
@ -1477,13 +1488,10 @@ class Parser {
*/
protected function normalizeOptions(array $options): array {
return [
'dateNormalization' => (bool) ($options['dateNormalization'] ?? true),
'impliedTz' => (bool) ($options['impliedTz'] ?? false),
'lang' => (bool) ($options['lang'] ?? false),
'simpleTrim' => (bool) ($options['simpleTrim'] ?? false),
'lang' => (bool) ($options['lang'] ?? true),
'thoroughTrim' => (bool) ($options['thoroughTrim'] ?? true),
];
}
protected function trim(string $str): string {
return trim($str, " \r\n\t\f");
}
}

81
tests/cases/StandardTest.php

@ -21,6 +21,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
'microformats-v1/includes/object' => "include pattern not implemented",
'microformats-v1/includes/table' => "include pattern not implemented",
'microformats-v2/rel/duplicate-rels' => "this test has a spurious newline at the beginning of a value",
'microformats-v2-unit/names/names-microformats' => "This is probably a bug in the HTML parser",
'microformats-v2-unit/nested/nested-microformat-mistyped' => "The spec may change here soon",
];
/** @dataProvider provideStandardTests */
@ -29,7 +31,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$this->markTestIncomplete(self::SUPPRESSED[$name]);
}
// parse input
$act = Microformats::fromFile($path.".html", "text/html; charset=UTF-8", "http://example.com/", $options);
$base = strpos($name, "microformats-v2-unit/") === 0 ? "http://example.test/" : "http://example.com/";
$act = Microformats::fromFile($path.".html", "text/html; charset=UTF-8", $base, $options);
// read expectation data
$exp = json_decode(file_get_contents($path.".json"), true);
if ($exp) {
@ -50,30 +53,39 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$exp = $this->fixTests($exp, $name);
} else {
// if there are no expectations we're probably developing a new test; print the output as JSON
echo Microformats::toJson($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES | \JSON_UNESCAPED_UNICODE);
echo Microformats::toJson($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
exit;
}
// sort both arrays
$this->ksort($exp);
$this->ksort($act);
// run comparison
if (!$exp) {
foreach ($exp['items'] as $k => $mf) {
$x = json_encode($mf, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
$a = json_encode($act['items'][$k] ?? new \stdClass, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
$types = implode(", ", $mf['type']);
$this->assertSame($x, $a, "Microformat $types does not match");
}
$this->assertSame($exp, $act);
$this->assertEquals($exp, $act);
}
public function provideStandardTests(): \Generator {
// the standard tests
yield from $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", ['simpleTrim' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", ['thoroughTrim' => false, 'dateNormalization' => false]);
// tests from php-mf2
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/third-party/", []);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/third-party/", ['dateNormalization' => false, 'lang' => false]);
// tests from our own corpus
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/default-settings/", []);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/lang-true/", ['lang' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/simpletrim-true/", ['simpleTrim' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/thoroughtrim-false/", ['thoroughTrim' => false]);
// new unit tests, still being written
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/microformats-v2-unit/");
}
protected function provideTestList(string $set, ?array $options = null): \Generator {
if (!is_dir($set)) {
return;
}
$base = strtr(\MensBeam\Microformats\BASE."tests/cases/", "\\", "/");
if (strpos(strtr($set, "\\", "/"), $base,) !== 0) {
$base = strtr($set, "\\", "/");
@ -83,51 +95,52 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$path = preg_replace('/\.json$/', '', $path);
$name = strtr($path, "\\", "/");
$name = str_replace($base, "", $name);
// perform some special handling for the standard unit test suite
if (!$options && preg_match('/^microformats-v2-unit\/(?!text\/)/', $name)) {
// run the test with both text trimming algorithms so that we ensure the tests pass with both
$opt = [
'thoroughTrim' => true,
'dateNormalization' => true,
];
yield "$name options:default" => [$name, $path, $opt];
$opt = [
'thoroughTrim' => false,
'dateNormalization' => false,
];
yield "$name options:standard" => [$name, $path, $opt];
} else {
yield $name => [$name, $path, $options];
}
}
}
protected function ksort(&$arr) {
foreach ($arr as &$v) {
if (is_array($v))
if (is_array($v)) {
$this->ksort($v);
}
}
ksort($arr);
}
protected function fixTests(array $exp, string $test) {
switch ($test) {
case "microformats-v1/hentry/summarycontent":
case "microformats-v2/h-entry/summarycontent":
$this->fixDates($exp['items'][0]['properties']['updated']);
break;
case "microformats-v2/h-feed/implied-title":
case "microformats-v2/h-feed/simple":
$this->fixDates($exp['items'][0]['children'][0]['properties']['updated']);
break;
case "microformats-v2/h-event/dates":
case "microformats-v2/h-event/time":
case "microformats-v1/hcalendar/time":
$this->fixDates($exp['items'][0]['properties']['start']);
break;
case "microformats-v1/hnews/minimum":
case "microformats-v1/hnews/all":
$this->fixDates($exp['items'][0]['properties']['entry'][0]['properties']['updated']);
break;
case "microformats-v1/hfeed/simple":
$this->fixDates($exp['items'][0]['children'][0]['properties']['updated']);
break;
case "microformats-v1/hcard/single":
$this->fixDates($exp['items'][0]['properties']['bday']);
$this->fixDates($exp['items'][0]['properties']['rev']);
break;
case "third-party/phpmf2/classic/fberriman":
case "third-party/phpmf2/classic/mixedroots2":
case "third-party/phpmf2/classic/hentry-tag":
$this->fixDates($exp['items'][0]['properties']['published']);
case "microformats-v2/h-event/concatenate":
case "third-party/phpmf2/classic/vevent-summary":
$this->fixDates($exp['items'][0]['properties']['start']);
$this->fixDates($exp['items'][0]['properties']['end']);
break;
case "third-party/phpmf2/vcp":
$this->fixDates($exp['items'][5]['properties']['published']);
$exp['items'][7]['properties']['published'][0] = "2013-02-01 06:01";
break;
case "third-party/phpmf2/classic/fberriman":
$exp['items'][0]['properties']['published'][0] = "2013-05-14T11:54:06+00:00";
break;
}
return $exp;
}
@ -135,8 +148,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
protected function fixDates(&$dateArray): void {
foreach ($dateArray as &$d) {
$d = strtr($d, "Tt", " ");
$d = preg_replace('/([+-]\d\d):(\d\d)$/', "$1$2", $d);
$d = preg_replace('/:\d\d[+-]\d\d$/', "$0000", $d);
$d = preg_replace('/([+-]\d\d)(\d\d)$/', "$1:$2", $d);
$d = preg_replace('/:\d\d[+-]\d\d$/', "$0:00", $d);
}
}
}

4
tests/cases/mensbeam/default-settings/datetime.json

@ -6,7 +6,7 @@
],
"properties": {
"test": [
"2023-06-25 09:32:12-0400"
"2023-06-25 09:32:12-04:00"
],
"name": [
"2023-06-25 09:32:12-04"
@ -19,7 +19,7 @@
],
"properties": {
"test": [
"2023-06-25 09:32:12-0400"
"2023-06-25 09:32:12-04:00"
],
"name": [
"2023-06-25 09:32:12 -04"

Loading…
Cancel
Save