From 2aa6bb2deac048164eb75dcdb0c3baaff963ed5e Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sat, 21 Dec 2019 15:05:42 -0500 Subject: [PATCH] Remove unnecessary test abstraction --- tests/cases/TestTokenizer.php | 141 +++++++++++++++++++++++++++++++--- tests/lib/StandardTest.php | 138 --------------------------------- 2 files changed, 131 insertions(+), 148 deletions(-) delete mode 100644 tests/lib/StandardTest.php diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php index 9a055b7..1158a40 100644 --- a/tests/cases/TestTokenizer.php +++ b/tests/cases/TestTokenizer.php @@ -7,10 +7,14 @@ use dW\HTML5\EOFToken; use dW\HTML5\OpenElementsStack; use dW\HTML5\ParseError; use dW\HTML5\Tokenizer; +use dW\HTML5\CharacterToken; +use dW\HTML5\CommentToken; +use dW\HTML5\DOCTYPEToken; +use dW\HTML5\EndTagToken; +use dW\HTML5\StartTagToken; /** * @covers \dW\HTML5\Tokenizer - * @covers \dW\HTML5\Data * @covers \dW\HTML5\CharacterToken * @covers \dW\HTML5\CommentToken * @covers \dW\HTML5\DataToken @@ -18,15 +22,16 @@ use dW\HTML5\Tokenizer; * @covers \dW\HTML5\DOCTYPEToken * @covers \dW\HTML5\TokenAttr */ -class TestTokenizer extends \dW\HTML5\Test\StandardTest { - const DEBUG = false; +class TestTokenizer extends \PHPUnit\Framework\TestCase { + const STATE_MAP = [ + 'Data state' => Tokenizer::DATA_STATE, + 'PLAINTEXT state' => Tokenizer::PLAINTEXT_STATE, + 'RCDATA state' => Tokenizer::RCDATA_STATE, + 'RAWTEXT state' => Tokenizer::RAWTEXT_STATE, + 'Script data state' => Tokenizer::SCRIPT_DATA_STATE, + 'CDATA section state' => Tokenizer::CDATA_SECTION_STATE, + ]; - public function setUp(): void { - if (self::DEBUG) { - ob_end_clean(); - Tokenizer::$debug = true; - } - } /** @dataProvider provideStandardTokenizerTests */ public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) { // convert parse error constants into standard symbols in specification @@ -48,7 +53,7 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest { $stack[] = (new \DOMDocument)->createElement($open); } // initialize the data stream and tokenizer - $data = new Data($input, "STDIN", $errorHandler); + $data = new Data($input, "STDIN", $errorHandler, "UTF-8"); $tokenizer = new Tokenizer($data, $stack, $errorHandler); $tokenizer->state = $state; // perform the test @@ -79,4 +84,120 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest { } return $this->makeTokenTests(...$tests); } + + protected function reverseDoubleEscape(string $str): string { + if (preg_match_all("/\\\\u([0-9a-f]{4})/i", $str, $matches)) { + for ($a = 0; $a < sizeof($matches[0]); $a++) { + $esc = $matches[0][$a]; + $chr = \MensBeam\Intl\Encoding\UTF8::encode(hexdec($matches[1][$a])); + $str = str_replace($esc, $chr, $str); + } + } + return $str; + } + + protected function combineCharacterTokens(array $tokens) : array { + $out = []; + $pending = null; + foreach ($tokens as $t) { + if ($t instanceof CharacterToken) { + if (!$pending) { + $pending = $t; + } else { + $pending->data .= $t->data; + } + } else { + if ($pending) { + $out[] = $pending; + $pending = null; + } + $out[] = $t; + } + } + if ($pending) { + $out[] = $pending; + } + return $out; + } + + protected function makeTokenTests(string ...$file): iterable { + foreach ($file as $path) { + $f = basename($path); + $testSet = json_decode(file_get_contents($path), true); + foreach ($testSet['tests'] ?? $testSet['xmlViolationTests'] as $index => $test) { + $testId = "$f #$index"; + if ($test['doubleEscaped'] ?? false) { + $test['input'] = $this->reverseDoubleEscape($test['input']); + for ($a = 0; $a < sizeof($test['output']); $a++) { + for ($b = 0; $b < sizeof($test['output'][$a]); $b++) { + if (is_string($test['output'][$a][$b])) { + $test['output'][$a][$b] = $this->reverseDoubleEscape($test['output'][$a][$b]); + } + } + } + } + $test['initialStates'] = $test['initialStates'] ?? ["Data state"]; + // check if a test needs a patch due to trivial differences in implementation + $this->patchTest($testId, $test); + for ($a = 0; $a < sizeof($test['initialStates']); $a++) { + $tokens = []; + foreach ($test['output'] as $token) { + switch ($token[0]) { + case "DOCTYPE": + $t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]); + $t->forceQuirks = !$token[4]; + $tokens[] = $t; + break; + case "StartTag": + $t = new StartTagToken($token[1], $token[3] ?? false); + foreach ($token[2] ?? [] as $name => $value) { + $t->setAttribute((string) $name, $value); + } + $tokens[] = $t; + break; + case "EndTag": + $tokens[] = new EndTagToken($token[1]); + break; + case "Character": + $tokens[] = new CharacterToken($token[1]); + break; + case "Comment": + $tokens[] = new CommentToken($token[1]); + break; + default: + throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter"); + } + unset($t); + } + yield "$testId: {$test['description']} ({$test['initialStates'][$a]})" => [ + $test['input'], // input + $tokens, // output + self::STATE_MAP[$test['initialStates'][$a]], // initial state + $test['lastStartTag'] ?? null, // open element, if any + $test['errors'] ?? [], // errors, if any + ]; + } + } + } + } + + protected function patchTest(string $id, &$test): void { + switch ($id) { + // test emits input stream error first despite peeking + case "test3.test #30": + $test['errors'] = array_reverse($test['errors']); + break; + // eof-in-comment positions in some tests don't make sense + // https://github.com/html5lib/html5lib-tests/issues/125 + case "test3.test #143": + $test['errors'][0]['col'] = 10; + break; + case "test3.test #144": + case "test3.test #145": + case "test3.test #146": + $test['errors'][0]['line'] = 2; + $test['errors'][0]['col'] = 2; + break; + } + } } diff --git a/tests/lib/StandardTest.php b/tests/lib/StandardTest.php deleted file mode 100644 index 453d27f..0000000 --- a/tests/lib/StandardTest.php +++ /dev/null @@ -1,138 +0,0 @@ - Tokenizer::DATA_STATE, - 'PLAINTEXT state' => Tokenizer::PLAINTEXT_STATE, - 'RCDATA state' => Tokenizer::RCDATA_STATE, - 'RAWTEXT state' => Tokenizer::RAWTEXT_STATE, - 'Script data state' => Tokenizer::SCRIPT_DATA_STATE, - 'CDATA section state' => Tokenizer::CDATA_SECTION_STATE, - ]; - - protected function reverseDoubleEscape(string $str): string { - if (preg_match_all("/\\\\u([0-9a-f]{4})/i", $str, $matches)) { - for ($a = 0; $a < sizeof($matches[0]); $a++) { - $esc = $matches[0][$a]; - $chr = \MensBeam\Intl\Encoding\UTF8::encode(hexdec($matches[1][$a])); - $str = str_replace($esc, $chr, $str); - } - } - return $str; - } - - protected function combineCharacterTokens(array $tokens) : array { - $out = []; - $pending = null; - foreach ($tokens as $t) { - if ($t instanceof CharacterToken) { - if (!$pending) { - $pending = $t; - } else { - $pending->data .= $t->data; - } - } else { - if ($pending) { - $out[] = $pending; - $pending = null; - } - $out[] = $t; - } - } - if ($pending) { - $out[] = $pending; - } - return $out; - } - - protected function makeTokenTests(string ...$file): iterable { - foreach ($file as $path) { - $f = basename($path); - $testSet = json_decode(file_get_contents($path), true); - foreach ($testSet['tests'] ?? $testSet['xmlViolationTests'] as $index => $test) { - $testId = "$f #$index"; - if ($test['doubleEscaped'] ?? false) { - $test['input'] = $this->reverseDoubleEscape($test['input']); - for ($a = 0; $a < sizeof($test['output']); $a++) { - for ($b = 0; $b < sizeof($test['output'][$a]); $b++) { - if (is_string($test['output'][$a][$b])) { - $test['output'][$a][$b] = $this->reverseDoubleEscape($test['output'][$a][$b]); - } - } - } - } - $test['initialStates'] = $test['initialStates'] ?? ["Data state"]; - // check if a test needs a patch due to trivial differences in implementation - $this->patchTest($testId, $test); - for ($a = 0; $a < sizeof($test['initialStates']); $a++) { - $tokens = []; - foreach ($test['output'] as $token) { - switch ($token[0]) { - case "DOCTYPE": - $t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]); - $t->forceQuirks = !$token[4]; - $tokens[] = $t; - break; - case "StartTag": - $t = new StartTagToken($token[1], $token[3] ?? false); - foreach ($token[2] ?? [] as $name => $value) { - $t->setAttribute((string) $name, $value); - } - $tokens[] = $t; - break; - case "EndTag": - $tokens[] = new EndTagToken($token[1]); - break; - case "Character": - $tokens[] = new CharacterToken($token[1]); - break; - case "Comment": - $tokens[] = new CommentToken($token[1]); - break; - default: - throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter"); - } - unset($t); - } - yield "$testId: {$test['description']} ({$test['initialStates'][$a]})" => [ - $test['input'], // input - $tokens, // output - self::STATE_MAP[$test['initialStates'][$a]], // initial state - $test['lastStartTag'] ?? null, // open element, if any - $test['errors'] ?? [], // errors, if any - ]; - } - } - } - } - - protected function patchTest(string $id, &$test): void { - switch ($id) { - // test emits input stream error first despite peeking - case "test3.test #30": - $test['errors'] = array_reverse($test['errors']); - break; - // eof-in-comment positions in some tests don't make sense - // https://github.com/html5lib/html5lib-tests/issues/125 - case "test3.test #143": - $test['errors'][0]['col'] = 10; - break; - case "test3.test #144": - case "test3.test #145": - case "test3.test #146": - $test['errors'][0]['line'] = 2; - $test['errors'][0]['col'] = 2; - break; - } - - } -}