You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
129 lines
5.1 KiB
129 lines
5.1 KiB
<?php
|
|
/** @license MIT
|
|
* Copyright 2021, J. King
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
namespace MensBeam\FullText;
|
|
|
|
/**
 * Parsed representation of a site-pattern (site configuration) file.
 *
 * The constructor reads the file line by line. Each data line has the form
 * "directive: value" or "directive(parameter): value". Recognised directives
 * are stored in the public property of the same name; unrecognised
 * directives are silently ignored.
 */
class SitePattern {
    /** @var string[] XPath expressions from "title" directives */
    public $title = [];
    /** @var string[] XPath expressions from "body" directives */
    public $body = [];
    /** @var string[] XPath expressions from "date" directives */
    public $date = [];
    /** @var string[] XPath expressions from "author" directives */
    public $author = [];
    /** @var string[] XPath expressions from "strip" directives */
    public $strip = [];
    /** @var string[] XPath expressions from "single_page_link" directives */
    public $single_page_link = [];
    /** @var string[] XPath expressions from "single_page_link_in_feed" directives */
    public $single_page_link_in_feed = [];
    /** @var string[] XPath expressions from "next_page_link" directives */
    public $next_page_link = [];
    /** @var string[] Values of "strip_id_or_class" directives, with surrounding quotes removed */
    public $strip_id_or_class = [];
    /** @var string[] Values of "strip_image_src" directives, with surrounding quotes removed */
    public $strip_image_src = [];
    /** @var string[] Search strings; kept parallel to $replace_string by "replace_string(find): replace" */
    public $find_string = [];
    /** @var string[] Replacement strings */
    public $replace_string = [];
    /** @var array<string,string> Values of "http_header(name): value" directives, keyed by lower-cased name */
    public $http_header = [];
    /** @var bool|null Value of the "tidy" directive, or null if it was not specified */
    public $tidy;
    /** @var bool|null Value of the "prune" directive, or null if it was not specified */
    public $prune;
    /** @var bool|null Value of the "autodetect_on_failure" directive, or null if it was not specified */
    public $autodetect_on_failure;

    /**
     * Parses a set of site-pattern rules.
     *
     * @param string $rules The complete text of the rules; lines may end in LF, CR, or CRLF
     * @throws SitePatternException if a line is malformed or a boolean directive has a value other than "yes" or "no"
     */
    public function __construct(string $rules) {
        $lines = preg_split('/\r\n?|\n/', $rules);
        foreach ($lines as $l => $d) {
            if (preg_match('/^#|^\s*$/', $d)) {
                // the line is a comment or contains no data
                continue;
            }
            $line = $l + 1;
            $pos = 0;
            $end = strlen($d);
            // consume the directive
            $len = strspn($d, "abcdefghijklmnopqrstuvwxyz_");
            if (!$len) {
                throw self::malformed($line, 1);
            }
            $directive = substr($d, $pos, $len);
            $pos += $len;
            // "?? ''" yields an empty string at end of line without emitting a warning
            $c = $d[$pos++] ?? "";
            if ($c === ":") {
                // the rest of the line is the value
                $param = "";
            } elseif ($c === "(") {
                // directive has a parameter
                // certain directives' parameters are XPath expressions, which may themselves
                //   contain parentheses and quoted strings, so these need a real scan
                if (!in_array($directive, ["move_into"], true)) {
                    $len = strcspn($d, ")", $pos);
                    $param = substr($d, $pos, $len);
                    $pos += $len;
                    // the closing parenthesis must be consumed before the colon is looked for
                    $c = $d[$pos++] ?? "";
                    if ($c !== ")") {
                        throw self::malformed($line, $pos);
                    }
                } else {
                    $param = "";
                    $depth = 0;
                    while (true) {
                        $len = strcspn($d, "()'\"", $pos);
                        $param .= substr($d, $pos, $len);
                        $pos += $len;
                        $c = $d[$pos++] ?? "";
                        if ($c === ")") {
                            if (!$depth) {
                                // this parenthesis closes the parameter itself
                                break;
                            }
                            $param .= $c;
                            $depth--;
                        } elseif ($c === "(") {
                            $param .= $c;
                            $depth++;
                        } elseif ($c === '"' || $c === "'") {
                            // copy a quoted string verbatim, including both quotation marks
                            $len = strcspn($d, $c, $pos);
                            if ($pos + $len >= $end) {
                                // the string is never terminated
                                throw self::malformed($line, $end + 1);
                            }
                            $param .= $c.substr($d, $pos, $len + 1);
                            $pos += $len + 1;
                        } else {
                            // end of line reached inside the parameter
                            throw self::malformed($line, $pos);
                        }
                    }
                }
                // after the parameter should come a colon and the value
                $c = $d[$pos++] ?? "";
                if ($c !== ":") {
                    throw self::malformed($line, $pos);
                }
            } else {
                throw self::malformed($line, $pos);
            }
            // surrounding whitespace is never significant; without trimming "tidy: no" would not match "no"
            $value = trim(substr($d, $pos));
            switch ($directive) {
                case "title":
                case "body":
                case "date":
                case "author":
                case "strip":
                case "single_page_link":
                case "single_page_link_in_feed":
                case "next_page_link":
                    $this->{$directive}[] = $value;
                    // TODO: evaluate the XPath expression to ensure syntactic validity
                    break;
                case "strip_id_or_class":
                case "strip_image_src":
                case "find_string":
                    $this->{$directive}[] = self::unquote($value);
                    break;
                case "replace_string":
                    if ($param !== "") {
                        // "replace_string(find): replace" supplies both halves of a pair
                        $this->find_string[] = $param;
                        $this->replace_string[] = $value;
                    } else {
                        // the plain form complements an earlier "find_string" directive
                        $this->replace_string[] = self::unquote($value);
                    }
                    break;
                case "http_header":
                    if ($param === "") {
                        // a header without a name is meaningless
                        throw self::malformed($line, strlen($directive) + 1);
                    }
                    // header names are case-insensitive, so they are normalized to lower case
                    $this->http_header[strtolower(trim($param))] = $value;
                    break;
                case "tidy":
                case "prune":
                case "autodetect_on_failure":
                    if ($value === "yes") {
                        $this->{$directive} = true;
                    } elseif ($value === "no") {
                        $this->{$directive} = false;
                    } else {
                        throw new SitePatternException("Invalid value for boolean on line ".$line, 1);
                    }
                    break;
                case "test_url":
                    // TODO: URL; not currently stored
                    break;
                default:
                    // unsupported directive; ignored
                    break;
            }
        }
    }

    /** Builds the exception reporting a syntax error at a one-based position of a one-based line */
    private static function malformed(int $line, int $pos): SitePatternException {
        return new SitePatternException("Malformed data on line ".$line." at position $pos", 1);
    }

    /** Removes one pair of matching single or double quotation marks surrounding a value, if present */
    private static function unquote(string $value): string {
        if (strlen($value) >= 2) {
            $first = $value[0];
            if (($first === '"' || $first === "'") && substr($value, -1) === $first) {
                return substr($value, 1, -1);
            }
        }
        return $value;
    }
}
|