Browse Source

Tweak options

- change simpleTrim to thoroughTrim, its inverse. This makes all-false
be standard rather than default behaviour
- add a dateNormalization option, default to true. This makes testing
easier, though some patching of tests is still required
master
J. King 10 months ago
parent
commit
7486cc598d
  1. 11
      README.md
  2. 15
      lib/Microformats.php
  3. 36
      lib/Microformats/Parser.php
  4. 97
      tests/cases/StandardTest.php
  5. 4
      tests/cases/mensbeam/default-settings/datetime.json

11
README.md

@ -71,8 +71,9 @@ Since Microformats data is represented as a structure of nested arrays, some of
The parsing methods all optionally take an `$options` array as an argument. These options are all flags, either for experimental features, or for backwards-compatible features no longer used by default. The options are as follows:
| Key | Type | Default | Description
|--------------|---------|---------|------------
| `impliedTz` | Boolean | `false` | Time values in microformats may have an implied date associated with them taken from a prior date value in the same microformat structure. This option allows for a time zone to be implied as well, if a time does not include its time zone.
| `lang` | Boolean | `false` | This option determines whether language information is retrieved from the parsed document and included in the output, in `lang` keys. Both Microformat structures and embedded markup (`e-` property) structures are affected by this options.
| `simpleTrim` | Boolean | `false` | This option uses the "classic", simpler whitespace-trimming algorithm rather than the more aggressive one proposed for future standardization, and used by default for this algorithm. This affects both `p-` and `e-` properties.
| Key | Type | Default | Description
|---------------------|---------|---------|------------
| `dateNormalization` | Boolean | `true` | This option enables date and time normalization throughout microformat parsing rather than only where required by the specification.
| `impliedTz` | Boolean | `false` | Time values in microformats may have an implied date associated with them taken from a prior date value in the same microformat structure. This option allows for a time zone to be implied as well, if a time does not include its time zone.
| `lang` | Boolean | `true` | This option determines whether language information is retrieved from the parsed document and included in the output, in `lang` keys. Both Microformat structures and embedded markup (`e-` property) structures are affected by this option.
| `thoroughTrim` | Boolean | `true` | This option uses the more thorough whitespace-trimming algorithm proposed for future standardization rather than the "classic", simpler whitespace-trimming algorithm mandated by the parsing specification. This affects both `p-` and `e-` properties.

15
lib/Microformats.php

@ -17,14 +17,19 @@ use MensBeam\Microformats\Url;
* so-called "classic" or "backcompat" Microformats. Some of its functionality
* is optional. Where an $options array is a possible parameter, the following
* keys are understood:
*
*
* - `dateNormalization` (bool) Whether to perform date and time normalization
* throughout parsing rather than only in value-class parsing where it is
* required by the specification. True by default
* - `impliedTz` (bool) Whether to allow an implied datetime value to supply an
* implied timezone to datetimes without a timezone
* - `lang` (bool) Whether to include language information in microformat and
* rich-text structures
* - `simpleTrim` (bool) Whether to use the traditional "simple" whitespace
* trimming algorithm rather than the default, more aggressive trimming algorithm
*
* rich-text structures. True by default
* - `thoroughTrim` (bool) Whether to use the more thorough whitespace-trimming
* algorithm proposed for future standardization rather than the "classic",
* simpler whitespace-trimming algorithm mandated by the parsing specification.
* True by default.
*
* Currently all input is assumed to be HTML, but processing of generic XML
* data may be supported in future.
*/

36
lib/Microformats/Parser.php

@ -16,13 +16,17 @@ use MensBeam\HTML\Parser\Serializer;
* is optional. Where an $options array is a possible parameter, the following
* keys are understood:
*
* - `dateNormalization` (bool) Whether to perform date and time normalization
* throughout parsing rather than only in value-class parsing where it is
* required by the specification. True by default
* - `impliedTz` (bool) Whether to allow an implied datetime value to supply an
* implied timezone to datetimes without a timezone
* - `lang` (bool) Whether to include language information in microformat and
* rich-text structures
* - `simpleTrim` (bool) Whether to use the traditional "simple" whitespace
* trimming algorithm rather than the default, more aggressive trimming
* algorithm
* rich-text structures. True by default
* - `thoroughTrim` (bool) Whether to use the more thorough whitespace-trimming
* algorithm proposed for future standardization rather than the "classic",
* simpler whitespace-trimming algorithm mandated by the parsing specification.
* True by default.
*/
class Parser {
/** @var array A ranking of prefixes (with 1 being least preferred) to break ties when multiple properties of the same name exist on one element */
@ -910,7 +914,6 @@ class Parser {
# element, if any).
return $this->normalizeUrl($url);
case "dt":
// NOTE: Because we perform implied date resolution we don't blindly return data from nodes; returning is done below after checks
# To parse an element for a dt-x property value (whether explicit dt-* or backcompat equivalent):
if (!$isChild && ($date = $this->getValueClassPattern($node, $prefix, $backcompatTypes, $impliedDate)) !== null) {
# parse the element for the Value Class Pattern, including the date and time parsing rules. If a value is found, then return it.
@ -928,7 +931,11 @@ class Parser {
# else return the textContent of the element after removing all leading/trailing spaces and nested <script> & <style> elements.
$date = $this->getCleanText($node, $prefix);
}
return $this->stitchDate($this->parseDatePart($date), $impliedDate) ?? $date;
if ($this->options['dateNormalization']) {
return $this->stitchDate($this->parseDatePart($date), $impliedDate) ?? $date;
} else {
return $date;
}
case "e":
# To parse an element for a e-x property value (whether explicit "e-*" or backcompat equivalent):
# return a dictionary with two keys:
@ -1263,7 +1270,7 @@ class Parser {
* @param string $prefix The prefix of the microformat property the text is to be used for. This is only relevant for the "simple" algorithm
*/
protected function getCleanText(\DOMElement $node, string $prefix): string {
if ($this->options['simpleTrim']) {
if (!$this->options['thoroughTrim']) {
return $this->getCleanTextSimple($node, $prefix);
} else {
// https://microformats.org/wiki/textcontent-parsing
@ -1469,6 +1476,10 @@ class Parser {
return $next;
}
protected function trim(string $str): string {
return trim($str, " \r\n\t\f");
}
/** Normalizes an array of options
*
* Default values are filled in and unknown options removed
@ -1477,13 +1488,10 @@ class Parser {
*/
protected function normalizeOptions(array $options): array {
return [
'impliedTz' => (bool) ($options['impliedTz'] ?? false),
'lang' => (bool) ($options['lang'] ?? false),
'simpleTrim' => (bool) ($options['simpleTrim'] ?? false),
'dateNormalization' => (bool) ($options['dateNormalization'] ?? true),
'impliedTz' => (bool) ($options['impliedTz'] ?? false),
'lang' => (bool) ($options['lang'] ?? true),
'thoroughTrim' => (bool) ($options['thoroughTrim'] ?? true),
];
}
protected function trim(string $str): string {
return trim($str, " \r\n\t\f");
}
}

97
tests/cases/StandardTest.php

@ -14,13 +14,15 @@ use MensBeam\Microformats;
*/
class StandardTest extends \PHPUnit\Framework\TestCase {
protected const SUPPRESSED = [
'microformats-v1/hcard/multiple' => "whether vcard keys are p- or u- is unclear",
'microformats-v1/includes/hcarditemref' => "include pattern not implemented",
'microformats-v1/includes/heventitemref' => "include pattern not implemented",
'microformats-v1/includes/hyperlink' => "include pattern not implemented",
'microformats-v1/includes/object' => "include pattern not implemented",
'microformats-v1/includes/table' => "include pattern not implemented",
'microformats-v2/rel/duplicate-rels' => "this test has a spurious newline at the beginning of a value",
'microformats-v1/hcard/multiple' => "whether vcard keys are p- or u- is unclear",
'microformats-v1/includes/hcarditemref' => "include pattern not implemented",
'microformats-v1/includes/heventitemref' => "include pattern not implemented",
'microformats-v1/includes/hyperlink' => "include pattern not implemented",
'microformats-v1/includes/object' => "include pattern not implemented",
'microformats-v1/includes/table' => "include pattern not implemented",
'microformats-v2/rel/duplicate-rels' => "this test has a spurious newline at the beginning of a value",
'microformats-v2-unit/names/names-microformats' => "This is probably a bug in the HTML parser",
'microformats-v2-unit/nested/nested-microformat-mistyped' => "The spec may change here soon",
];
/** @dataProvider provideStandardTests */
@ -29,7 +31,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$this->markTestIncomplete(self::SUPPRESSED[$name]);
}
// parse input
$act = Microformats::fromFile($path.".html", "text/html; charset=UTF-8", "http://example.com/", $options);
$base = strpos($name, "microformats-v2-unit/") === 0 ? "http://example.test/" : "http://example.com/";
$act = Microformats::fromFile($path.".html", "text/html; charset=UTF-8", $base, $options);
// read expectation data
$exp = json_decode(file_get_contents($path.".json"), true);
if ($exp) {
@ -50,30 +53,39 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$exp = $this->fixTests($exp, $name);
} else {
// if there are no expectations we're probably developing a new test; print the output as JSON
echo Microformats::toJson($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES | \JSON_UNESCAPED_UNICODE);
echo Microformats::toJson($act, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
exit;
}
// sort both arrays
$this->ksort($exp);
$this->ksort($act);
// run comparison
if (!$exp) {
foreach ($exp['items'] as $k => $mf) {
$x = json_encode($mf, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
$a = json_encode($act['items'][$k] ?? new \stdClass, \JSON_PRETTY_PRINT | \JSON_UNESCAPED_SLASHES);
$types = implode(", ", $mf['type']);
$this->assertSame($x, $a, "Microformat $types does not match");
}
$this->assertSame($exp, $act);
$this->assertEquals($exp, $act);
}
public function provideStandardTests(): \Generator {
// the standard tests
yield from $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", ['simpleTrim' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."vendor-bin/phpunit/vendor/mf2/tests/tests/", ['thoroughTrim' => false, 'dateNormalization' => false]);
// tests from php-mf2
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/third-party/", []);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/third-party/", ['dateNormalization' => false, 'lang' => false]);
// tests from our own corpus
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/default-settings/", []);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/lang-true/", ['lang' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/simpletrim-true/", ['simpleTrim' => true]);
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/mensbeam/thoroughtrim-false/", ['thoroughTrim' => false]);
// new unit tests, still being written
yield from $this->provideTestList(\MensBeam\Microformats\BASE."tests/cases/microformats-v2-unit/");
}
protected function provideTestList(string $set, ?array $options = null): \Generator {
if (!is_dir($set)) {
return;
}
$base = strtr(\MensBeam\Microformats\BASE."tests/cases/", "\\", "/");
if (strpos(strtr($set, "\\", "/"), $base,) !== 0) {
$base = strtr($set, "\\", "/");
@ -83,51 +95,52 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
$path = preg_replace('/\.json$/', '', $path);
$name = strtr($path, "\\", "/");
$name = str_replace($base, "", $name);
yield $name => [$name, $path, $options];
// perform some special handling for the standard unit test suite
if (!$options && preg_match('/^microformats-v2-unit\/(?!text\/)/', $name)) {
// run the test with both text trimming algorithms so that we ensure the tests pass with both
$opt = [
'thoroughTrim' => true,
'dateNormalization' => true,
];
yield "$name options:default" => [$name, $path, $opt];
$opt = [
'thoroughTrim' => false,
'dateNormalization' => false,
];
yield "$name options:standard" => [$name, $path, $opt];
} else {
yield $name => [$name, $path, $options];
}
}
}
protected function ksort(&$arr) {
foreach ($arr as &$v) {
if (is_array($v))
if (is_array($v)) {
$this->ksort($v);
}
}
ksort($arr);
}
protected function fixTests(array $exp, string $test) {
switch ($test) {
case "microformats-v1/hentry/summarycontent":
case "microformats-v2/h-entry/summarycontent":
$this->fixDates($exp['items'][0]['properties']['updated']);
break;
case "microformats-v2/h-feed/implied-title":
case "microformats-v2/h-feed/simple":
$this->fixDates($exp['items'][0]['children'][0]['properties']['updated']);
break;
case "microformats-v2/h-event/dates":
case "microformats-v2/h-event/time":
case "microformats-v1/hcalendar/time":
$this->fixDates($exp['items'][0]['properties']['start']);
break;
case "microformats-v1/hnews/minimum":
case "microformats-v1/hnews/all":
$this->fixDates($exp['items'][0]['properties']['entry'][0]['properties']['updated']);
break;
case "microformats-v1/hfeed/simple":
$this->fixDates($exp['items'][0]['children'][0]['properties']['updated']);
break;
case "microformats-v1/hcard/single":
$this->fixDates($exp['items'][0]['properties']['bday']);
$this->fixDates($exp['items'][0]['properties']['rev']);
break;
case "third-party/phpmf2/classic/fberriman":
case "third-party/phpmf2/classic/mixedroots2":
case "third-party/phpmf2/classic/hentry-tag":
$this->fixDates($exp['items'][0]['properties']['published']);
case "microformats-v2/h-event/concatenate":
case "third-party/phpmf2/classic/vevent-summary":
$this->fixDates($exp['items'][0]['properties']['start']);
$this->fixDates($exp['items'][0]['properties']['end']);
break;
case "third-party/phpmf2/vcp":
$this->fixDates($exp['items'][5]['properties']['published']);
$exp['items'][7]['properties']['published'][0] = "2013-02-01 06:01";
break;
case "third-party/phpmf2/classic/fberriman":
$exp['items'][0]['properties']['published'][0] = "2013-05-14T11:54:06+00:00";
break;
}
return $exp;
}
@ -135,8 +148,8 @@ class StandardTest extends \PHPUnit\Framework\TestCase {
protected function fixDates(&$dateArray): void {
foreach ($dateArray as &$d) {
$d = strtr($d, "Tt", " ");
$d = preg_replace('/([+-]\d\d):(\d\d)$/', "$1$2", $d);
$d = preg_replace('/:\d\d[+-]\d\d$/', "$0000", $d);
$d = preg_replace('/([+-]\d\d)(\d\d)$/', "$1:$2", $d);
$d = preg_replace('/:\d\d[+-]\d\d$/', "$0:00", $d);
}
}
}

4
tests/cases/mensbeam/default-settings/datetime.json

@ -6,7 +6,7 @@
],
"properties": {
"test": [
"2023-06-25 09:32:12-0400"
"2023-06-25 09:32:12-04:00"
],
"name": [
"2023-06-25 09:32:12-04"
@ -19,7 +19,7 @@
],
"properties": {
"test": [
"2023-06-25 09:32:12-0400"
"2023-06-25 09:32:12-04:00"
],
"name": [
"2023-06-25 09:32:12 -04"

Loading…
Cancel
Save