From 06601b02e65fe15678c1c4ad19f7d5c6cab96c0f Mon Sep 17 00:00:00 2001 From: "J. King" Date: Thu, 16 Dec 2021 22:54:52 -0500 Subject: [PATCH] Separate directives into processing stages --- lib/SitePattern.php | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/lib/SitePattern.php b/lib/SitePattern.php index 013a43b..7f77199 100644 --- a/lib/SitePattern.php +++ b/lib/SitePattern.php @@ -7,17 +7,14 @@ declare(strict_types=1); namespace MensBeam\FullText; class SitePattern { - public $title = []; - public $body = []; - public $date = []; - public $author = []; - public $strip = []; - public $single_page_link = []; - public $single_page_link_in_feed = []; - public $next_page_link = []; + public $preParse = []; + public $cleanUp = []; + public $readOut = []; public function __construct(string $rules) { $lines = preg_split('/\r\n?|\n/', $rules); + $findString = null; + $findStringLine = 0; foreach ($lines as $l => $d) { if (preg_match('/^#|^\s*$/', $d)) { // the line contains no data @@ -94,29 +91,45 @@ class SitePattern { case "single_page_link": case "single_page_link_in_feed": case "next_page_link": - $this->$directive[] = $value; - // TODO: evaluate the XPathexpression to ensure syntactic validity + $stage = ($directive === "strip") ? "cleanUp" : "readOut"; + // TODO: evaluate the XPath expression to ensure syntactic validity + $this->$stage[] = [$directive, [$value]]; break; case "strip_id_or_class": case "strip_image_src": case "find_string": + // string values may be enclosed by single or double quotation marks as leading and trailing whitespace is otherwise stripped if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { - $this->$directive[] = substr($value, 1, strlen($value) - 2); + $value = substr($value, 1, strlen($value) - 2); } else { - $this->$directive[] = $value + $value = $value; } + if ($directive === "find_string") { + // the "find_string" directive must be matched to a later "replace_string" directive, so we keep one buffered + // a find_string which is not cleared by a replace_string is an error + if ($findString === null) { + $findString = $value; + $findStringLine = $l + 1; + } else { + throw new SitePatternException("\"find_string\" directive on line $findStringLine is not followed by a matching \"replace_string\" directive", 3); + } + } else { + $this->cleanUp[] = [$directive, [$value]]; + } + break; case "replace_sring": + # requires special handling to match up with find_string case "http_header": # string with param case "tidy": case "prune": case "atodetect_on_failure": if ($value === "yes") { - $this->$directive = true; + $value = true; } elseif ($value === "no") { - $this->$directive = false; + $value = false; } else { - throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1); + throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 2); } break; case "test_url":