|
@ -7,17 +7,14 @@ declare(strict_types=1); |
|
|
namespace MensBeam\FullText; |
|
|
namespace MensBeam\FullText; |
|
|
|
|
|
|
|
|
class SitePattern { |
|
|
class SitePattern { |
|
|
public $title = []; |
|
|
public $preParse = []; |
|
|
public $body = []; |
|
|
public $cleanUp = []; |
|
|
public $date = []; |
|
|
public $readOut = []; |
|
|
public $author = []; |
|
|
|
|
|
public $strip = []; |
|
|
|
|
|
public $single_page_link = []; |
|
|
|
|
|
public $single_page_link_in_feed = []; |
|
|
|
|
|
public $next_page_link = []; |
|
|
|
|
|
|
|
|
|
|
|
public function __construct(string $rules) { |
|
|
public function __construct(string $rules) { |
|
|
$lines = preg_split('/\r\n?|\n/', $rules); |
|
|
$lines = preg_split('/\r\n?|\n/', $rules); |
|
|
|
|
|
$findString = null; |
|
|
|
|
|
$findStringLine = 0; |
|
|
foreach ($lines as $l => $d) { |
|
|
foreach ($lines as $l => $d) { |
|
|
if (preg_match('/^#|^\s*$/', $d)) { |
|
|
if (preg_match('/^#|^\s*$/', $d)) { |
|
|
// the line contains no data |
|
|
// the line contains no data |
|
@ -94,29 +91,45 @@ class SitePattern { |
|
|
case "single_page_link": |
|
|
case "single_page_link": |
|
|
case "single_page_link_in_feed": |
|
|
case "single_page_link_in_feed": |
|
|
case "next_page_link": |
|
|
case "next_page_link": |
|
|
$this->$directive[] = $value; |
|
|
$stage = ($directive === "strip") ? "cleanUp" : "readOut"; |
|
|
// TODO: evaluate the XPathexpression to ensure syntactic validity |
|
|
// TODO: evaluate the XPath expression to ensure syntactic validity |
|
|
|
|
|
$this->$stage[] = [$directive, [$value]]; |
|
|
break; |
|
|
break; |
|
|
case "strip_id_or_class": |
|
|
case "strip_id_or_class": |
|
|
case "strip_image_src": |
|
|
case "strip_image_src": |
|
|
case "find_string": |
|
|
case "find_string": |
|
|
|
|
|
// string values may be enclosed by single or double quotation marks as leading and trailing whitespace is otherwise stripped |
|
|
if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { |
|
|
if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { |
|
|
$this->$directive[] = substr($value, 1, strlen($value) - 2); |
|
|
$value = substr($value, 1, strlen($value) - 2); |
|
|
} else { |
|
|
} else { |
|
|
$this->$directive[] = $value |
|
|
$value = $value; |
|
|
} |
|
|
} |
|
|
|
|
|
if ($directive === "find_string") { |
|
|
|
|
|
// the "find_string" directive must be matched to a later "replace_string" directive, so we keep one buffered |
|
|
|
|
|
// a find_string which is not cleared by a replace_string is an error |
|
|
|
|
|
if ($findString === null) { |
|
|
|
|
|
$findString = $value; |
|
|
|
|
|
$findStringLine = $l + 1; |
|
|
|
|
|
} else { |
|
|
|
|
|
throw new SitePatternException("\"find_string\" directive on line $findStringLine is not followed by a matching \"replace_string\" directive", 3); |
|
|
|
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
$this->cleanUp[] = [$directive, [$value]]; |
|
|
|
|
|
} |
|
|
|
|
|
break; |
|
|
case "replace_sring": |
|
|
case "replace_sring": |
|
|
|
|
|
# requires special handling to match up with find_string |
|
|
case "http_header": |
|
|
case "http_header": |
|
|
# string with param |
|
|
# string with param |
|
|
case "tidy": |
|
|
case "tidy": |
|
|
case "prune": |
|
|
case "prune": |
|
|
case "atodetect_on_failure": |
|
|
case "atodetect_on_failure": |
|
|
if ($value === "yes") { |
|
|
if ($value === "yes") { |
|
|
$this->$directive = true; |
|
|
$value = true; |
|
|
} elseif ($value === "no") { |
|
|
} elseif ($value === "no") { |
|
|
$this->$directive = false; |
|
|
$value = false; |
|
|
} else { |
|
|
} else { |
|
|
throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 1); |
|
|
throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 2); |
|
|
} |
|
|
} |
|
|
break; |
|
|
break; |
|
|
case "test_url": |
|
|
case "test_url": |
|
|