|
|
@ -7,16 +7,84 @@ declare(strict_types=1); |
|
|
|
namespace MensBeam\FullText; |
|
|
|
|
|
|
|
class SitePattern { |
|
|
|
// XPath expressions taken from "title" directives; the constructor appends one entry per directive line, in file order. |
public $title = []; |
|
|
|
// XPath expressions taken from "body" directives, appended by the constructor in file order. |
public $body = []; |
|
|
|
// Starts as an empty list. NOTE(review): presumably filled from "date" directives like the XPath lists above; the handling code is not visible in this excerpt, so confirm. |
public $date = []; |
|
|
|
// Starts as an empty list. NOTE(review): presumably filled from "author" directives; the handling code is not visible in this excerpt, so confirm. |
public $author = []; |
|
|
|
// Starts as an empty list. NOTE(review): presumably filled from "strip" directives; the handling code is not visible in this excerpt, so confirm. |
public $strip = []; |
|
|
|
// XPath expressions taken from "single_page_link" directives, appended by the constructor in file order. |
public $single_page_link = []; |
|
|
|
// XPath expressions taken from "single_page_link_in_feed" directives, appended by the constructor in file order. |
public $single_page_link_in_feed = []; |
|
|
|
// XPath expressions taken from "next_page_link" directives, appended by the constructor in file order. |
public $next_page_link = []; |
|
|
|
|
|
|
|
public function __construct(string $rules) { |
|
|
|
$lines = preg_split('/\r\n?|\n/', $rules); |
|
|
|
foreach ($lines as $l => $d) { |
|
|
|
if (preg_match('/^#|^\s*$/', $d)) { |
|
|
|
// the line contains no data |
|
|
|
continue; |
|
|
|
} elseif (!preg_match('/^([a-z_]+)(?:\(([a-zA-Z0-9\-_]+)\))?:\s*(.+)$/', $d, $m)) { |
|
|
|
throw new SitePatternException("Malformed data on line ".($l + 1), 1); |
|
|
|
} |
|
|
|
[, $directive, $param, $value] = $m; |
|
|
|
$pos = 0; |
|
|
|
$end = strlen($d); |
|
|
|
// consume the directive |
|
|
|
$len = strspn($d, "abcdefghijklmnopqrstuvwxyz_"); |
|
|
|
if (!$len) { |
|
|
|
throw new SitePatternException("Malformed data on line ".($l + 1)." at position 1", 1); |
|
|
|
} |
|
|
|
$directive = substr($d, $pos, $len); |
|
|
|
$pos += $len; |
|
|
|
$c = @$d[$pos++]; |
|
|
|
if ($c === ":") { |
|
|
|
// the rest of the line is the value |
|
|
|
$param = ""; |
|
|
|
$value = substr($d, $pos); |
|
|
|
} elseif ($c === "(") { |
|
|
|
// directive has a parameter |
|
|
|
// certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value |
|
|
|
$xpath = in_array($directive, ["move_into"]); |
|
|
|
if (!$xpath) { |
|
|
|
$len = strcspn($d, ")", $pos); |
|
|
|
$param = substr($d, $pos, $len); |
|
|
|
$pos += $len; |
|
|
|
} else { |
|
|
|
$param = ""; |
|
|
|
$depth = 0; |
|
|
|
while (true) { |
|
|
|
$len = strcspn($d, "()'\"", $pos); |
|
|
|
$param .= substr($d, $pos, $len); |
|
|
|
$pos += $len; |
|
|
|
$c = @$d[$pos++]; |
|
|
|
if ($c === ")") { |
|
|
|
if (!$depth) { |
|
|
|
break; |
|
|
|
} else { |
|
|
|
$param .= $c; |
|
|
|
$depth--; |
|
|
|
} |
|
|
|
} elseif ($c === '"') { |
|
|
|
$len = strcspn($d, '"', $pos); |
|
|
|
$param .+ '"'.substr($d, $pos, $len + 1); |
|
|
|
$pos += $len + 1; |
|
|
|
} elseif ($c === "'") { |
|
|
|
$len = strcspn($d, "'", $pos); |
|
|
|
$param .= "'".substr($d, $pos, $len + 1); |
|
|
|
$pos += $len + 1; |
|
|
|
} elseif ($c === "(") { |
|
|
|
$param .= $c; |
|
|
|
$depth++; |
|
|
|
} else { |
|
|
|
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
// after the parameter should come a colon and the value |
|
|
|
$c = @$d[$pos++]; |
|
|
|
if ($c !== ":") { |
|
|
|
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); |
|
|
|
} |
|
|
|
$value = trim(substr($d, $pos)); |
|
|
|
} else { |
|
|
|
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); |
|
|
|
} |
|
|
|
switch ($directive) { |
|
|
|
case "title": |
|
|
|
case "body": |
|
|
@ -26,18 +94,31 @@ class SitePattern { |
|
|
|
case "single_page_link": |
|
|
|
case "single_page_link_in_feed": |
|
|
|
case "next_page_link": |
|
|
|
# XPath |
|
|
|
$this->$directive[] = $value; |
|
|
|
// TODO: evaluate the XPath expression to ensure syntactic validity |
|
|
|
break; |
|
|
|
case "strip_id_or_class": |
|
|
|
case "strip_image_src": |
|
|
|
case "find_string": |
|
|
|
# string |
|
|
|
if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { |
|
|
|
$this->$directive[] = substr($value, 1, strlen($value) - 2); |
|
|
|
} else { |
|
|
|
$this->$directive[] = $value |
|
|
|
} |
|
|
|
case "replace_sring": |
|
|
|
case "http_header": |
|
|
|
# string with param |
|
|
|
case "tidy": |
|
|
|
case "prune": |
|
|
|
case "atodetect_on_failure": |
|
|
|
# yes/no boolean |
|
|
|
if ($value === "yes") { |
|
|
|
$this->$directive = true; |
|
|
|
} elseif ($value === "no") { |
|
|
|
$this->$directive = false; |
|
|
|
} else { |
|
|
|
throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1); |
|
|
|
} |
|
|
|
break; |
|
|
|
case "test_url": |
|
|
|
# URL |
|
|
|
default: |
|
|
|