Browse Source

Parse XPath parameters properly; assign some values

master
J. King 2 years ago
parent
commit
6f2af8846c
  1. 93
      lib/SitePattern.php

93
lib/SitePattern.php

@ -7,16 +7,84 @@ declare(strict_types=1);
namespace MensBeam\FullText;
class SitePattern {
public $title = [];
public $body = [];
public $date = [];
public $author = [];
public $strip = [];
public $single_page_link = [];
public $single_page_link_in_feed = [];
public $next_page_link = [];
public function __construct(string $rules) {
$lines = preg_split('/\r\n?|\n/', $rules);
foreach ($lines as $l => $d) {
if (preg_match('/^#|^\s*$/', $d)) {
// the line contains no data
continue;
} elseif (!preg_match('/^([a-z_]+)(?:\(([a-zA-Z0-9\-_]+)\))?:\s*(.+)$/', $d, $m)) {
throw new SitePatternException("Malformed data on line ".($l + 1), 1);
}
[, $directive, $param, $value] = $m;
$pos = 0;
$end = strlen($d);
// consume the directive
$len = strspn($d, "abcdefghijklmnopqrstuvwxyz_");
if (!$len) {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position 1", 1);
}
$directive = substr($d, $pos, $len);
$pos += $len;
$c = @$d[$pos++];
if ($c === ":") {
// the rest of the line is the value
$param = "";
$value = substr($d, $pos);
} elseif ($c === "(") {
// directive has a parameter
// certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value
$xpath = in_array($directive, ["move_into"]);
if (!$xpath) {
$len = strcspn($d, ")", $pos);
$param = substr($d, $pos, $len);
$pos += $len;
} else {
$param = "";
$depth = 0;
while (true) {
$len = strcspn($d, "()'\"", $pos);
$param .= substr($d, $pos, $len);
$pos += $len;
$c = @$d[$pos++];
if ($c === ")") {
if (!$depth) {
break;
} else {
$param .= $c;
$depth--;
}
} elseif ($c === '"') {
$len = strcspn($d, '"', $pos);
$param .+ '"'.substr($d, $pos, $len + 1);
$pos += $len + 1;
} elseif ($c === "'") {
$len = strcspn($d, "'", $pos);
$param .= "'".substr($d, $pos, $len + 1);
$pos += $len + 1;
} elseif ($c === "(") {
$param .= $c;
$depth++;
} else {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
}
}
// after the parameter should come a colon and the value
$c = @$d[$pos++];
if ($c !== ":") {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
$value = trim(substr($d, $pos));
} else {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
switch ($directive) {
case "title":
case "body":
@ -26,18 +94,31 @@ class SitePattern {
case "single_page_link":
case "single_page_link_in_feed":
case "next_page_link":
# XPath
$this->$directive[] = $value;
// TODO: evaluate the XPath expression to ensure syntactic validity
break;
case "strip_id_or_class":
case "strip_image_src":
case "find_string":
# string
if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) {
$this->$directive[] = substr($value, 1, strlen($value) - 2);
} else {
$this->$directive[] = $value
}
case "replace_sring":
case "http_header":
# string with param
case "tidy":
case "prune":
case "atodetect_on_failure":
# yes/no boolean
if ($value === "yes") {
$this->$directive = true;
} elseif ($value === "no") {
$this->$directive = false;
} else {
throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1);
}
break;
case "test_url":
# URL
default:

Loading…
Cancel
Save