From 6f2af8846c7a25a931333ebb3efbf02a42136ee7 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Mon, 13 Dec 2021 22:53:58 -0500 Subject: [PATCH] Parse XPath parameters properly; assign some values --- lib/SitePattern.php | 93 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/lib/SitePattern.php b/lib/SitePattern.php index b5ae16b..013a43b 100644 --- a/lib/SitePattern.php +++ b/lib/SitePattern.php @@ -7,16 +7,84 @@ declare(strict_types=1); namespace MensBeam\FullText; class SitePattern { + public $title = []; + public $body = []; + public $date = []; + public $author = []; + public $strip = []; + public $single_page_link = []; + public $single_page_link_in_feed = []; + public $next_page_link = []; + public function __construct(string $rules) { $lines = preg_split('/\r\n?|\n/', $rules); foreach ($lines as $l => $d) { if (preg_match('/^#|^\s*$/', $d)) { // the line contains no data continue; - } elseif (!preg_match('/^([a-z_]+)(?:\(([a-zA-Z0-9\-_]+)\))?:\s*(.+)$/', $d, $m)) { - throw new SitePatternException("Malformed data on line ".($l + 1), 1); } - [, $directive, $param, $value] = $m; + $pos = 0; + $end = strlen($d); + // consume the directive + $len = strspn($d, "abcdefghijklmnopqrstuvwxyz_"); + if (!$len) { + throw new SitePatternException("Malformed data on line ".($l + 1)." 
at position 1", 1); + } + $directive = substr($d, $pos, $len); + $pos += $len; + $c = @$d[$pos++]; + if ($c === ":") { + // the rest of the line is the value + $param = ""; + $value = substr($d, $pos); + } elseif ($c === "(") { + // directive has a parameter + // certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value + $xpath = in_array($directive, ["move_into"]); + if (!$xpath) { + $len = strcspn($d, ")", $pos); + $param = substr($d, $pos, $len); + $pos += $len; + } else { + $param = ""; + $depth = 0; + while (true) { + $len = strcspn($d, "()'\"", $pos); + $param .= substr($d, $pos, $len); + $pos += $len; + $c = @$d[$pos++]; + if ($c === ")") { + if (!$depth) { + break; + } else { + $param .= $c; + $depth--; + } + } elseif ($c === '"') { + $len = strcspn($d, '"', $pos); + $param .+ '"'.substr($d, $pos, $len + 1); + $pos += $len + 1; + } elseif ($c === "'") { + $len = strcspn($d, "'", $pos); + $param .= "'".substr($d, $pos, $len + 1); + $pos += $len + 1; + } elseif ($c === "(") { + $param .= $c; + $depth++; + } else { + throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); + } + } + } + // after the parameter should come a colon and the value + $c = @$d[$pos++]; + if ($c !== ":") { + throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); + } + $value = trim(substr($d, $pos)); + } else { + throw new SitePatternException("Malformed data on line ".($l + 1)." 
at position $pos", 1); + } switch ($directive) { case "title": case "body": @@ -26,18 +94,31 @@ class SitePattern { case "single_page_link": case "single_page_link_in_feed": case "next_page_link": - # XPath + $this->$directive[] = $value; + // TODO: evaluate the XPathexpression to ensure syntactic validity + break; case "strip_id_or_class": case "strip_image_src": case "find_string": - # string + if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { + $this->$directive[] = substr($value, 1, strlen($value) - 2); + } else { + $this->$directive[] = $value + } case "replace_sring": case "http_header": # string with param case "tidy": case "prune": case "atodetect_on_failure": - # yes/no boolean + if ($value === "yes") { + $this->$directive = true; + } elseif ($value === "no") { + $this->$directive = false; + } else { + throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1); + } + break; case "test_url": # URL default: