You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

143 lines
6.1 KiB

<?php
/** @license MIT
* Copyright 2021, J. King
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\FullText;
/**
 * Parsed representation of a set of site-pattern rules.
 *
 * Rules are supplied as text with one directive per line, in one of two forms:
 *
 *     directive: value
 *     directive(parameter): value
 *
 * Blank lines and lines starting with "#" are ignored. Recognised directives
 * are sorted into one of the public lists below; each list entry is a
 * two-member array of the form [directive name, [arguments]].
 */
class SitePattern {
    /** @var array Operations recorded from "find_string"/"replace_string" directives */
    public $preParse = [];
    /** @var array Operations recorded from "strip", "strip_id_or_class" and "strip_image_src" directives */
    public $cleanUp = [];
    /** @var array Operations recorded from the XPath extraction directives ("title", "body", "date", etc.) */
    public $readOut = [];

    /**
     * Parses the supplied rules and populates the operation lists.
     *
     * @param string $rules The text of the rules; lines may be terminated by CR, LF, or CRLF
     * @throws SitePatternException with code 1 when a line is syntactically malformed,
     *         code 2 when a boolean directive has a value other than "yes" or "no", and
     *         code 3 when "find_string" and "replace_string" directives are not properly paired
     */
    public function __construct(string $rules) {
        $lines = preg_split('/\r\n?|\n/', $rules);
        $findString = null;
        $findStringLine = 0;
        foreach ($lines as $l => $d) {
            if (preg_match('/^#|^\s*$/', $d)) {
                // the line contains no data
                continue;
            }
            $lineNumber = $l + 1;
            $pos = 0;
            $end = strlen($d);
            // consume the directive
            $len = strspn($d, "abcdefghijklmnopqrstuvwxyz_");
            if (!$len) {
                throw new SitePatternException("Malformed data on line $lineNumber at position 1", 1);
            }
            $directive = substr($d, $pos, $len);
            $pos += $len;
            // reading past the end of the line yields an empty string, which matches no delimiter
            $c = $d[$pos] ?? "";
            $pos++;
            if ($c === ":") {
                // the rest of the line is the value; surrounding whitespace is not significant
                $param = "";
                $value = trim(substr($d, $pos));
            } elseif ($c === "(") {
                // directive has a parameter
                // certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value
                $xpath = in_array($directive, ["move_into"], true);
                if (!$xpath) {
                    // a plain parameter runs up to the first closing parenthesis
                    $len = strcspn($d, ")", $pos);
                    $param = substr($d, $pos, $len);
                    $pos += $len;
                    // the closing parenthesis itself must be consumed before looking for the colon
                    $c = $d[$pos] ?? "";
                    $pos++;
                    if ($c !== ")") {
                        throw new SitePatternException("Malformed data on line $lineNumber at position $pos", 1);
                    }
                } else {
                    // an XPath parameter may contain nested parentheses and quoted strings,
                    // so scan for the closing parenthesis at nesting depth zero
                    $param = "";
                    $depth = 0;
                    while ($pos < $end) {
                        $len = strcspn($d, "()'\"", $pos);
                        $param .= substr($d, $pos, $len);
                        $pos += $len;
                        $c = $d[$pos] ?? "";
                        $pos++;
                        if ($c === ")") {
                            if (!$depth) {
                                // this is the parenthesis which terminates the parameter
                                break;
                            }
                            $param .= $c;
                            $depth--;
                        } elseif ($c === '"' || $c === "'") {
                            // copy a quoted string verbatim, including both quotation marks
                            $len = strcspn($d, $c, $pos);
                            $param .= $c.substr($d, $pos, $len + 1);
                            $pos += $len + 1;
                        } elseif ($c === "(") {
                            $param .= $c;
                            $depth++;
                        } else {
                            // end of line reached without a closing parenthesis
                            throw new SitePatternException("Malformed data on line $lineNumber at position $pos", 1);
                        }
                    }
                }
                // after the parameter should come a colon and the value
                $c = $d[$pos] ?? "";
                $pos++;
                if ($c !== ":") {
                    throw new SitePatternException("Malformed data on line $lineNumber at position $pos", 1);
                }
                $value = trim(substr($d, $pos));
            } else {
                throw new SitePatternException("Malformed data on line $lineNumber at position $pos", 1);
            }
            switch ($directive) {
                case "title":
                case "body":
                case "date":
                case "author":
                case "strip":
                case "single_page_link":
                case "single_page_link_in_feed":
                case "next_page_link":
                    $stage = ($directive === "strip") ? "cleanUp" : "readOut";
                    // TODO: evaluate the XPath expression to ensure syntactic validity
                    $this->{$stage}[] = [$directive, [$value]];
                    break;
                case "strip_id_or_class":
                case "strip_image_src":
                case "find_string":
                case "replace_string":
                    // string values may be enclosed by single or double quotation marks as leading and trailing whitespace is otherwise stripped
                    $value = self::unquote($value);
                    if ($directive === "find_string") {
                        // the "find_string" directive must be matched to a later "replace_string" directive, so we keep one buffered
                        // a find_string which is not cleared by a replace_string is an error
                        if ($findString !== null) {
                            throw new SitePatternException("\"find_string\" directive on line $findStringLine is not followed by a matching \"replace_string\" directive", 3);
                        }
                        $findString = $value;
                        $findStringLine = $lineNumber;
                    } elseif ($directive === "replace_string") {
                        // NOTE(review): recording the pair in $preParse as [search, replacement] is an
                        // assumption about how consumers expect the data; confirm against the caller
                        if ($param !== "") {
                            // parameterised form: the parameter is itself the string to search for
                            $this->preParse[] = [$directive, [$param, $value]];
                        } elseif ($findString !== null) {
                            // pair up with the buffered "find_string" and clear the buffer
                            $this->preParse[] = [$directive, [$findString, $value]];
                            $findString = null;
                        } else {
                            throw new SitePatternException("\"replace_string\" directive on line $lineNumber is not preceded by a matching \"find_string\" directive", 3);
                        }
                    } else {
                        $this->cleanUp[] = [$directive, [$value]];
                    }
                    break;
                case "http_header":
                    // TODO: string with param; accepted but not yet recorded
                    break;
                case "tidy":
                case "prune":
                case "autodetect_on_failure":
                    if ($value === "yes") {
                        $value = true;
                    } elseif ($value === "no") {
                        $value = false;
                    } else {
                        throw new SitePatternException("Invalid value for boolean on line $lineNumber", 2);
                    }
                    // TODO: the validated boolean is not yet recorded anywhere
                    break;
                case "test_url":
                    // TODO: URL; accepted but not yet recorded
                    break;
                default:
                    // unsupported directives are silently ignored
                    break;
            }
        }
        if ($findString !== null) {
            // the input ended while a "find_string" was still waiting for its "replace_string"
            throw new SitePatternException("\"find_string\" directive on line $findStringLine is not followed by a matching \"replace_string\" directive", 3);
        }
    }

    /**
     * Removes one pair of matching single or double quotation marks enclosing a value, if present.
     *
     * @param string $value The value to examine
     * @return string The value without its enclosing quotation marks, or the value unchanged if it is not quoted
     */
    private static function unquote(string $value): string {
        $len = strlen($value);
        if ($len >= 2 && ($value[0] === '"' || $value[0] === "'") && $value[$len - 1] === $value[0]) {
            return substr($value, 1, $len - 2);
        }
        return $value;
    }
}