A modern, accurate HTML parser and serializer for PHP
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

263 lines
10 KiB

<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
use Robo\Result;
// Absolute path of the directory containing this file, with a trailing directory separator
const BASE = __DIR__.\DIRECTORY_SEPARATOR;
// Path of the "tests" directory beneath BASE, with a trailing directory separator
const BASE_TEST = BASE."tests".\DIRECTORY_SEPARATOR;
// True when the PHP_WINDOWS_VERSION_MAJOR constant is defined (PHP only defines it on Windows)
define("IS_WIN", defined("PHP_WINDOWS_VERSION_MAJOR"));
// True when the operating system name reported by php_uname() is "Darwin" (macOS)
define("IS_MAC", php_uname("s") === "Darwin");
// Turn off all PHP error reporting for the remainder of the script
// NOTE(review): this also hides genuine warnings raised by the tasks below; presumably intended to keep task output clean -- confirm
error_reporting(0);
/** Normalizes a filesystem path
 *
 * When the path can be resolved by realpath() the resolved path is
 * returned. Otherwise the path is returned with every forward slash and
 * backslash replaced by the platform's directory separator.
 */
function norm(string $path): string {
    $resolved = realpath($path);
    // realpath() yields false for paths which cannot be resolved
    return $resolved ?: str_replace(["/", "\\"], \DIRECTORY_SEPARATOR, $path);
}
/** Task definitions for the Robo task runner
 *
 * The public methods are the tasks themselves (for example "./robo test");
 * the protected methods are helpers shared between them.
 */
class RoboFile extends \Robo\Tasks {
    /** Runs the typical test suite
     *
     * Arguments passed to the task are passed on to PHPUnit. Thus one may, for
     * example, run the following command and get the expected results:
     *
     * ./robo test --testsuite Tokenizer --exclude-group slow --testdox
     *
     * Please see the PHPUnit documentation for available options.
     */
    public function test(array $args): Result {
        return $this->runTests(escapeshellarg(\PHP_BINARY), "typical", $args);
    }

    /** Runs the full test suite
     *
     * This includes pedantic tests which may help to identify problems.
     * See help for the "test" task for more details.
     */
    public function testFull(array $args): Result {
        return $this->runTests(escapeshellarg(\PHP_BINARY), "full", $args);
    }

    /**
     * Runs a quick subset of the test suite
     *
     * See help for the "test" task for more details.
     */
    public function testQuick(array $args): Result {
        return $this->runTests(escapeshellarg(\PHP_BINARY), "quick", $args);
    }

    /** Manually updates the imported html5lib test suite */
    public function testUpdate(): Result {
        $repos = [
            'html5lib-tests' => "https://github.com/html5lib/html5lib-tests",
            'platform-tests' => "https://github.com/web-platform-tests/wpt",
        ];
        $c = $this->collectionBuilder();
        foreach ($repos as $dir => $url) {
            $dir = BASE_TEST.$dir;
            // pull when a checkout directory already exists, otherwise clone afresh
            if (is_dir($dir)) {
                $c->addTask($this->taskGitStack()->dir($dir)->pull());
            } else {
                $c->addTask($this->taskGitStack()->cloneRepo($url, $dir));
            }
        }
        return $c->run();
    }

    /** Produces a code coverage report
     *
     * By default this task produces an HTML-format coverage report in
     * tests/coverage/. Additional reports may be produced by passing
     * arguments to this task as one would to PHPUnit.
     */
    public function coverage(array $args): Result {
        // run tests with code coverage reporting enabled
        $exec = $this->findCoverageEngine();
        return $this->runTests($exec, "coverage", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
    }

    /** Produces a code coverage report, with redundant tests
     *
     * Depending on the environment, some tests that normally provide
     * coverage may be skipped, while working alternatives are normally
     * suppressed for reasons of time. This coverage report will try to
     * run all tests which may cover code.
     *
     * See also help for the "coverage" task for more details.
     */
    public function coverageFull(array $args): Result {
        // run tests with code coverage reporting enabled
        $exec = $this->findCoverageEngine();
        return $this->runTests($exec, "typical", array_merge(["--coverage-html", BASE_TEST."coverage"], $args));
    }

    /** Builds the command prefix used to run PHPUnit with a coverage driver
     *
     * Candidates are tried in this order: a loaded pcov extension, a loaded
     * xdebug extension, a pcov library file in the extension directory, an
     * xdebug library file in the extension directory, and finally phpdbg.
     * When none is found the plain (escaped) PHP binary is returned.
     *
     * @return string A shell-ready command, including any "-d" settings
     */
    protected function findCoverageEngine(): string {
        // strip both slash styles so that a Windows-style trailing backslash is not doubled
        $dir = rtrim((string) ini_get("extension_dir"), "/".\DIRECTORY_SEPARATOR).\DIRECTORY_SEPARATOR;
        $ext = IS_WIN ? "dll" : "so";
        $php = escapeshellarg(\PHP_BINARY);
        $code = escapeshellarg(BASE."lib");
        if (extension_loaded("pcov")) {
            return "$php -d opcache.enable_cli=0 -d pcov.enabled=1 -d pcov.directory=$code";
        }
        if (extension_loaded("xdebug")) {
            return "$php -d opcache.enable_cli=0 -d xdebug.mode=coverage";
        }
        if (file_exists($dir."pcov.$ext")) {
            return "$php -d opcache.enable_cli=0 -d extension=pcov.$ext -d pcov.enabled=1 -d pcov.directory=$code";
        }
        if (file_exists($dir."xdebug.$ext")) {
            return "$php -d opcache.enable_cli=0 -d zend_extension=xdebug.$ext -d xdebug.mode=coverage";
        }
        // no coverage extension is available: look for phpdbg instead
        if (IS_WIN) {
            $dbg = dirname(\PHP_BINARY)."\\phpdbg.exe";
            $dbg = file_exists($dbg) ? $dbg : "";
        } else {
            // shell_exec() returns null when the command prints nothing, so
            // cast before trimming to avoid passing null to trim()
            $dbg = trim((string) shell_exec("which phpdbg ".$this->blackhole()));
        }
        return ($dbg !== "") ? escapeshellarg($dbg)." -qrr" : $php;
    }

    /** Returns a shell redirection which discards output
     *
     * @param bool $all Whether to discard standard output as well as standard error
     */
    protected function blackhole(bool $all = false): string {
        $hole = IS_WIN ? "nul" : "/dev/null";
        return $all ? ">$hole 2>&1" : "2>$hole";
    }

    /** Runs PHPUnit with the group exclusions belonging to a named test set
     *
     * @param string $executor The shell-ready command used to start PHP (see findCoverageEngine())
     * @param string $set One of "typical", "quick", "coverage", or "full"
     * @param array $args Further arguments to pass on to PHPUnit
     * @throws \Exception if the test set is not recognized
     */
    protected function runTests(string $executor, string $set, array $args) : Result {
        $groups = [
            'typical'  => ["--exclude-group", "optional"],
            'quick'    => ["--exclude-group", "optional,slow"],
            'coverage' => ["--exclude-group", "optional,coverageOptional"],
            'full'     => [],
        ];
        if (!isset($groups[$set])) {
            throw new \Exception("Unknown test set \"$set\"");
        }
        $execpath = norm(BASE."vendor-bin/phpunit/vendor/phpunit/phpunit/phpunit");
        $confpath = realpath(BASE_TEST."phpunit.dist.xml") ?: norm(BASE_TEST."phpunit.xml");
        // clone the html5lib test suite if it's not already present
        if (!is_dir(BASE_TEST."html5lib-tests")) {
            $update = $this->testUpdate();
            clearstatcache();
            // only give up if the suite is still missing; a failure affecting
            // just the other repository does not prevent the tests from running
            if (!$update->wasSuccessful() && !is_dir(BASE_TEST."html5lib-tests")) {
                return $update;
            }
        }
        return $this->taskExec($executor)->option("-d", "zend.assertions=1")->arg($execpath)->option("-c", $confpath)->args(array_merge($groups[$set], $args))->run();
    }

    /** Produces the CharacterReference class file
     *
     * The named character reference table is downloaded from the WHATWG
     * HTML specification and written, together with the C1 control
     * substitution table, to lib/Parser/CharacterReference.php.
     *
     * An exception is thrown if the table cannot be retrieved, is
     * malformed, or if the class file cannot be written.
     */
    public function charref() {
        $template = <<<'FILE'
<?php
declare(strict_types=1);
namespace MensBeam\HTML\Parser;
// This file is machine-generated
// DO NOT MODIFY
// To update, run ./robo charref
class CharacterReference {
const LONGEST_NAME = %LONGEST%;
const PREFIX_PATTERN = %NAMED_PATTERN%;
const NAMES = [
%NAMED_REFERENCES%
];
const C1_TABLE = [
%C1_SUBSTITUTIONS%
];
}
FILE;
        // retrieve and decode the table; a failed download and an undecodable
        // document are both reported with the same message
        $json = @file_get_contents("https://html.spec.whatwg.org/entities.json");
        $input = ($json === false) ? null : json_decode($json, true);
        if (!is_array($input)) {
            throw new \Exception("Could not retrieve character reference table.");
        }
        $list = [];
        $terms = [];
        foreach ($input as $entity => $data) {
            if (!is_array($data) || !isset($data['codepoints']) || !is_array($data['codepoints'])) {
                throw new \Exception("Character reference table entry \"$entity\" is malformed.");
            }
            // strip the ampersand from the entity name
            $entity = substr((string) $entity, 1);
            // add the entity name to an array of regular expression terms
            // if the entry exists in unterminated form, compress it into one, skipping the unterminated version
            if (substr($entity, -1) === ';') {
                $terms[] = isset($input['&'.substr($entity, 0, -1)]) ? "$entity?" : $entity;
            }
            // add a PHP-code representation of the entity name and its characters to another array
            $chars = implode('', array_map(function($codepoint) {
                return '\u{'.dechex($codepoint).'}';
            }, $data['codepoints']));
            $list[] = "'$entity'=>\"$chars\"";
        }
        // without any terminated entities there is nothing to build the pattern
        // from; refuse to overwrite the class file with an empty table
        if (!$terms) {
            throw new \Exception("Character reference table contains no terminated entities.");
        }
        // concatenate the list of entities and substitute them into the template
        $template = str_replace('%NAMED_REFERENCES%', implode(",", $list), $template);
        // prepare the list of terms as a regular expression
        // the length of a term is that of its name, without the semicolon or "?"
        $nameLength = function($term) {
            return strlen(preg_replace("/\W/", "", $term));
        };
        // sort longest terms first
        usort($terms, function($a, $b) use ($nameLength) {
            return $nameLength($b) <=> $nameLength($a);
        });
        // note the longest term
        $template = str_replace('%LONGEST%', (string) $nameLength($terms[0]), $template);
        // concatenate the terms into a case-sensitive non-capturing prefix search
        $regexp = '/^(?:'.implode('|', $terms).')/';
        $template = str_replace('%NAMED_PATTERN%', var_export($regexp, true), $template);
        // Compile the C1 control substitution table
        // See https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
        $c1table = [
            0x80 => 0x20AC, // EURO SIGN (€)
            0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK (‚)
            0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK (ƒ)
            0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK („)
            0x85 => 0x2026, // HORIZONTAL ELLIPSIS (…)
            0x86 => 0x2020, // DAGGER (†)
            0x87 => 0x2021, // DOUBLE DAGGER (‡)
            0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
            0x89 => 0x2030, // PER MILLE SIGN (‰)
            0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON (Š)
            0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
            0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE (Œ)
            0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON (Ž)
            0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK (‘)
            0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK (’)
            0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK (“)
            0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK (”)
            0x95 => 0x2022, // BULLET (•)
            0x96 => 0x2013, // EN DASH (–)
            0x97 => 0x2014, // EM DASH (—)
            0x98 => 0x02DC, // SMALL TILDE (˜)
            0x99 => 0x2122, // TRADE MARK SIGN (™)
            0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON (š)
            0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
            0x9C => 0x0153, // LATIN SMALL LIGATURE OE (œ)
            0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON (ž)
            0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
        ];
        $list = [];
        foreach ($c1table as $c1 => $code) {
            // both numbers are written out in decimal
            $list[] = "$c1=>$code";
        }
        $template = str_replace('%C1_SUBSTITUTIONS%', implode(",", $list), $template);
        // output the file itself
        $target = BASE."lib/Parser/CharacterReference.php";
        if (file_put_contents($target, $template) === false) {
            throw new \Exception("Could not write $target");
        }
    }
}