Browse Source

First stab at high-level parsing

While this is simple, it will not work due to embedded XPath in
parameters in some Instapaper directives which are not documented by
FiveFilters. A nominally XPath-aware parser will be required.
master
J. King 2 years ago
parent
commit
2f6366648b
  1. 48
      lib/SitePattern.php
  2. 10
      lib/SitePatternException.php

48
lib/SitePattern.php

@ -0,0 +1,48 @@
<?php
/** @license MIT
* Copyright 2021, J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\FullText;
/**
 * High-level parser for the text of a site pattern
 *
 * At present the class only validates the overall shape of each line and
 * sorts directives into groups; no directive has any effect yet
 */
class SitePattern {
    /**
     * Parses a set of rules, one directive per line
     *
     * Empty lines, whitespace-only lines, and lines starting with "#" are
     * skipped. Every other line must have the form "directive: value" or
     * "directive(param): value", where the directive name consists of
     * lowercase ASCII letters and underscores, and the parameter consists
     * of ASCII letters, digits, hyphens, and underscores
     *
     * @param string $rules The text to parse; lines may be terminated by CRLF, CR, or LF
     * @throws SitePatternException if a line containing data does not have the expected form; the message includes the one-based line number
     */
    public function __construct(string $rules) {
        $lines = preg_split('/\r\n?|\n/', $rules);
        foreach ($lines as $l => $d) {
            if (preg_match('/^#|^\s*$/', $d)) {
                // the line contains no data
                continue;
            } elseif (!preg_match('/^([a-z_]+)(?:\(([a-zA-Z0-9\-_]+)\))?:\s*(.+)$/', $d, $m)) {
                // $l is a zero-based index, while reported line numbers are one-based
                throw new SitePatternException("Malformed data on line ".($l + 1), 1);
            }
            // $param is an empty string when the directive has no parenthesized parameter
            [, $directive, $param, $value] = $m;
            // NOTE: every branch below is still an empty placeholder; the
            //   comment under each group names the kind of value expected
            switch ($directive) {
                case "title":
                case "body":
                case "date":
                case "author":
                case "strip":
                case "single_page_link":
                case "single_page_link_in_feed":
                case "next_page_link":
                    # XPath
                case "strip_id_or_class":
                case "strip_image_src":
                case "find_string":
                    # string
                case "replace_string":
                case "http_header":
                    # string with param
                case "tidy":
                case "prune":
                case "autodetect_on_failure":
                    # yes/no boolean
                case "test_url":
                    # URL
                default:
                    # Unsupported directive
            }
        }
    }
}

10
lib/SitePatternException.php

@ -0,0 +1,10 @@
<?php
/** @license MIT
* Copyright 2021, J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\FullText;
/**
 * Exception type thrown by SitePattern when a line of rules is malformed
 *
 * It adds nothing to the base exception class; it exists so that
 * callers can catch this failure by type
 */
class SitePatternException extends \Exception {}
Loading…
Cancel
Save