Browse Source

Parse XPath parameters properly; assign some values

master
J. King 2 years ago
parent
commit
6f2af8846c
  1. 93
      lib/SitePattern.php

93
lib/SitePattern.php

@ -7,16 +7,84 @@ declare(strict_types=1);
namespace MensBeam\FullText; namespace MensBeam\FullText;
class SitePattern { class SitePattern {
public $title = [];
public $body = [];
public $date = [];
public $author = [];
public $strip = [];
public $single_page_link = [];
public $single_page_link_in_feed = [];
public $next_page_link = [];
public function __construct(string $rules) { public function __construct(string $rules) {
$lines = preg_split('/\r\n?|\n/', $rules); $lines = preg_split('/\r\n?|\n/', $rules);
foreach ($lines as $l => $d) { foreach ($lines as $l => $d) {
if (preg_match('/^#|^\s*$/', $d)) { if (preg_match('/^#|^\s*$/', $d)) {
// the line contains no data // the line contains no data
continue; continue;
} elseif (!preg_match('/^([a-z_]+)(?:\(([a-zA-Z0-9\-_]+)\))?:\s*(.+)$/', $d, $m)) {
throw new SitePatternException("Malformed data on line ".($l + 1), 1);
} }
[, $directive, $param, $value] = $m; $pos = 0;
$end = strlen($d);
// consume the directive
$len = strspn($d, "abcdefghijklmnopqrstuvwxyz_");
if (!$len) {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position 1", 1);
}
$directive = substr($d, $pos, $len);
$pos += $len;
$c = @$d[$pos++];
if ($c === ":") {
// the rest of the line is the value
$param = "";
$value = substr($d, $pos);
} elseif ($c === "(") {
// directive has a parameter
// certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value
$xpath = in_array($directive, ["move_into"]);
if (!$xpath) {
$len = strcspn($d, ")", $pos);
$param = substr($d, $pos, $len);
$pos += $len;
} else {
$param = "";
$depth = 0;
while (true) {
$len = strcspn($d, "()'\"", $pos);
$param .= substr($d, $pos, $len);
$pos += $len;
$c = @$d[$pos++];
if ($c === ")") {
if (!$depth) {
break;
} else {
$param .= $c;
$depth--;
}
} elseif ($c === '"') {
$len = strcspn($d, '"', $pos);
$param .+ '"'.substr($d, $pos, $len + 1);
$pos += $len + 1;
} elseif ($c === "'") {
$len = strcspn($d, "'", $pos);
$param .= "'".substr($d, $pos, $len + 1);
$pos += $len + 1;
} elseif ($c === "(") {
$param .= $c;
$depth++;
} else {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
}
}
// after the parameter should come a colon and the value
$c = @$d[$pos++];
if ($c !== ":") {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
$value = trim(substr($d, $pos));
} else {
throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1);
}
switch ($directive) { switch ($directive) {
case "title": case "title":
case "body": case "body":
@ -26,18 +94,31 @@ class SitePattern {
case "single_page_link": case "single_page_link":
case "single_page_link_in_feed": case "single_page_link_in_feed":
case "next_page_link": case "next_page_link":
# XPath $this->$directive[] = $value;
// TODO: evaluate the XPath expression to ensure syntactic validity
break;
case "strip_id_or_class": case "strip_id_or_class":
case "strip_image_src": case "strip_image_src":
case "find_string": case "find_string":
# string if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) {
$this->$directive[] = substr($value, 1, strlen($value) - 2);
} else {
$this->$directive[] = $value
}
case "replace_sring": case "replace_sring":
case "http_header": case "http_header":
# string with param # string with param
case "tidy": case "tidy":
case "prune": case "prune":
case "atodetect_on_failure": case "atodetect_on_failure":
# yes/no boolean if ($value === "yes") {
$this->$directive = true;
} elseif ($value === "no") {
$this->$directive = false;
} else {
throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1);
}
break;
case "test_url": case "test_url":
# URL # URL
default: default:

Loading…
Cancel
Save