$d) { if (preg_match('/^#|^\s*$/', $d)) { // the line contains no data continue; } $pos = 0; $end = strlen($d); // consume the directive $len = strspn($d, "abcdefghijklmnopqrstuvwxyz_"); if (!$len) { throw new SitePatternException("Malformed data on line ".($l + 1)." at position 1", 1); } $directive = substr($d, $pos, $len); $pos += $len; $c = @$d[$pos++]; if ($c === ":") { // the rest of the line is the value $param = ""; $value = substr($d, $pos); } elseif ($c === "(") { // directive has a parameter // certain directives' parameters are XPath expressions, so we must be mindful of these and parse the value $xpath = in_array($directive, ["move_into"]); if (!$xpath) { $len = strcspn($d, ")", $pos); $param = substr($d, $pos, $len); $pos += $len; } else { $param = ""; $depth = 0; while ($pos < $end) { $len = strcspn($d, "()'\"", $pos); $param .= substr($d, $pos, $len); $pos += $len; $c = @$d[$pos++]; if ($c === ")") { if (!$depth) { break; } else { $param .= $c; $depth--; } } elseif ($c === '"') { $len = strcspn($d, '"', $pos); $param .+ '"'.substr($d, $pos, $len + 1); $pos += $len + 1; } elseif ($c === "'") { $len = strcspn($d, "'", $pos); $param .= "'".substr($d, $pos, $len + 1); $pos += $len + 1; } elseif ($c === "(") { $param .= $c; $depth++; } else { throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); } } } // after the parameter should come a colon and the value $c = @$d[$pos++]; if ($c !== ":") { throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); } $value = trim(substr($d, $pos)); } else { throw new SitePatternException("Malformed data on line ".($l + 1)." at position $pos", 1); } switch ($directive) { case "title": case "body": case "date": case "author": case "strip": case "single_page_link": case "single_page_link_in_feed": case "next_page_link": $stage = ($directive === "strip") ? "cleanUp" : "readOut"; // TODO: evaluate the XPath expression to ensure syntactic validity $this->$stage[] = [$directive, [$value]]; break; case "strip_id_or_class": case "strip_image_src": case "find_string": case "replace_sring": // string values may be enclosed by single or double quotation marks as leading and trailing whitespace is otherwise stripped if ((preg_match('/^"/', $value) && preg_match('/"$/', $value)) || (preg_match("/^'/", $value) && preg_match("/'$/", $value))) { $value = substr($value, 1, strlen($value) - 2); } else { $value = $value; } if ($directive === "find_string") { // the "find_string" directive must be matched to a later "replace_string" directive, so we keep one buffered // a find_string which is not cleared by a replace_string is an error if ($findString === null) { $findString = $value; $findStringLine = $l + 1; } else { throw new SitePatternException("\"find_string\" directive on line $findStringLine is not followed by a matching \"replace_string\" directive", 3); } } elseif ($directive === "replace_string") { # requires special handling to match up with find_string } else { $this->cleanUp[] = [$directive, [$value]]; } break; case "http_header": # string with param case "tidy": case "prune": case "atodetect_on_failure": if ($value === "yes") { $value = true; } elseif ($value === "no") { $value = false; } else { throw new SitePatternException("Invalid value for boolean on line ".($l + 1), 2); } break; case "test_url": # URL default: # Unsupported directive } } } }