HTML-Parser/tests/cases/TestTokenizer.php

<?php
declare(strict_types=1);
namespace dW\HTML5\TestCase;

use dW\HTML5\Data;
use dW\HTML5\EOFToken;
use dW\HTML5\OpenElementsStack;
use dW\HTML5\ParseError;
use dW\HTML5\Tokenizer;
use dW\HTML5\CharacterToken;
use dW\HTML5\CommentToken;
use dW\HTML5\DOCTYPEToken;
use dW\HTML5\EndTagToken;
use dW\HTML5\StartTagToken;

/**
 * Runs the html5lib tokenizer test files against the Tokenizer.
 *
 * Every test found in a ".test" file is expanded into one data set per
 * initial tokenizer state; the expected output listed in the file is
 * converted into token objects so that it can be compared directly with
 * what the Tokenizer emits.
 *
 * @covers \dW\HTML5\Tokenizer
 * @covers \dW\HTML5\CharacterToken
 * @covers \dW\HTML5\CommentToken
 * @covers \dW\HTML5\DataToken
 * @covers \dW\HTML5\TagToken
 * @covers \dW\HTML5\DOCTYPEToken
 * @covers \dW\HTML5\TokenAttr
 */
class TestTokenizer extends \PHPUnit\Framework\TestCase {
    /** Maps the initial state names used in the test files to Tokenizer state constants */
    const STATE_MAP = [
        'Data state'          => Tokenizer::DATA_STATE,
        'PLAINTEXT state'     => Tokenizer::PLAINTEXT_STATE,
        'RCDATA state'        => Tokenizer::RCDATA_STATE,
        'RAWTEXT state'       => Tokenizer::RAWTEXT_STATE,
        'Script data state'   => Tokenizer::SCRIPT_DATA_STATE,
        'CDATA section state' => Tokenizer::CDATA_SECTION_STATE,
    ];

    /**
     * Tokenizes one input and compares the emitted tokens and parse errors with the expectations
     *
     * @dataProvider provideStandardTokenizerTests
     *
     * @param string $input The text to tokenize
     * @param array $expected The expected tokens, without the end-of-file token
     * @param int $state The Tokenizer state constant to start from
     * @param string|null $open The name of an element to push onto the stack of open elements, if any
     * @param array $expErrors The expected parse errors, each an array with 'code', 'line', and 'col' members
     */
    public function testStandardTokenizerTests(string $input, array $expected, int $state, ?string $open, array $expErrors) {
        // convert parse error constants into standard symbols in specification
        $intConstants = array_filter((new \ReflectionClass(ParseError::class))->getConstants(), function($v) {
            return is_int($v);
        });
        $errorMap = array_map(function($str) {
            return strtolower(str_replace("_", "-", $str));
        }, array_flip($intConstants));
        // create a stub error handler which collects parse errors
        $errors = [];
        $errorHandler = $this->createStub(ParseError::class);
        $errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) {
            $errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];
            return true;
        });
        // initialize a stack of open elements, possibly with an open element
        $stack = new OpenElementsStack();
        if ($open) {
            $stack[] = (new \DOMDocument)->createElement($open);
        }
        // initialize the data stream and tokenizer
        $data = new Data($input, "STDIN", $errorHandler, "UTF-8");
        $tokenizer = new Tokenizer($data, $stack, $errorHandler);
        $tokenizer->state = $state;
        // perform the test
        $actual = [];
        try {
            // collect every token up to, but not including, the end-of-file token
            do {
                $t = $tokenizer->createToken();
                if (!($t instanceof EOFToken)) {
                    $actual[] = $t;
                }
            } while (!($t instanceof EOFToken));
        } finally {
            // the assertions sit in the finally clause so that they are still
            //   evaluated, with the tokenizer's debug log as the failure
            //   message, if tokenization throws
            $actual = $this->combineCharacterTokens($actual);
            $this->assertEquals($expected, $actual, $tokenizer->debugLog);
            $this->assertEquals($expErrors, $errors, $tokenizer->debugLog);
        }
    }

    /**
     * Provides the data sets for testStandardTokenizerTests
     *
     * Every "*.test" file in the html5lib tokenizer test directory is used,
     * except for those named in the blacklist.
     */
    public function provideStandardTokenizerTests() {
        $tests = [];
        $blacklist = ["xmlViolation.test"];
        foreach (new \GlobIterator(\dW\HTML5\BASE."tests/html5lib-tests/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {
            if (!in_array(basename($file), $blacklist, true)) {
                $tests[] = $file;
            }
        }
        return $this->makeTokenTests(...$tests);
    }

    /**
     * Replaces each literal "\uXXXX" sequence (four hexadecimal digits) in a string with the encoded character
     *
     * The string is processed in a single pass, so text produced by one
     * replacement is never itself interpreted as an escape sequence.
     *
     * @param string $str The string containing escape sequences
     * @return string The string with the sequences replaced; the input is returned unchanged if the pattern fails to execute
     */
    protected function reverseDoubleEscape(string $str): string {
        $out = preg_replace_callback("/\\\\u([0-9a-f]{4})/i", function($match) {
            return \MensBeam\Intl\Encoding\UTF8::encode(hexdec($match[1]));
        }, $str);
        return $out ?? $str;
    }

    /**
     * Merges each run of consecutive character tokens into a single character token
     *
     * The first token of a run is modified in place: the data of the
     * tokens which follow it is appended to its own. Tokens of any other
     * type are passed through in their original order.
     *
     * @param array $tokens The tokens emitted by the tokenizer
     * @return array The tokens with adjacent character tokens merged
     */
    protected function combineCharacterTokens(array $tokens) : array {
        $out = [];
        $pending = null;
        foreach ($tokens as $t) {
            if ($t instanceof CharacterToken) {
                if ($pending) {
                    $pending->data .= $t->data;
                } else {
                    $pending = $t;
                }
                continue;
            }
            // any other token terminates the current run of characters
            if ($pending) {
                $out[] = $pending;
                $pending = null;
            }
            $out[] = $t;
        }
        if ($pending) {
            $out[] = $pending;
        }
        return $out;
    }

    /**
     * Generates data sets from one or more html5lib tokenizer test files
     *
     * One data set is produced per test per initial state; a test which
     * lists no initial states is run from the data state. Each data set holds
     * the input, expected tokens, initial state constant, name of the last
     * start tag (or null), and expected errors, in that order.
     *
     * @param string ...$file The paths of the test files to read
     * @throws \Exception if a file cannot be interpreted, or uses a token type or initial state which is not supported
     */
    protected function makeTokenTests(string ...$file): iterable {
        foreach ($file as $path) {
            $f = basename($path);
            foreach ($this->readTestFile($path) as $index => $test) {
                $testId = "$f #$index";
                if ($test['doubleEscaped'] ?? false) {
                    // only the input and the string members of each output token are unescaped
                    $test['input'] = $this->reverseDoubleEscape($test['input']);
                    foreach ($test['output'] as $a => $token) {
                        foreach ($token as $b => $member) {
                            if (is_string($member)) {
                                $test['output'][$a][$b] = $this->reverseDoubleEscape($member);
                            }
                        }
                    }
                }
                $test['initialStates'] = $test['initialStates'] ?? ["Data state"];
                // check if a test needs a patch due to trivial differences in implementation
                $this->patchTest($test);
                foreach ($test['initialStates'] as $stateName) {
                    if (!array_key_exists($stateName, self::STATE_MAP)) {
                        throw new \Exception("Initial state '$stateName' not implemented in standard test interpreter");
                    }
                    // fresh token objects are created for each data set
                    $tokens = [];
                    foreach ($test['output'] as $token) {
                        $tokens[] = $this->buildExpectedToken($token);
                    }
                    yield "$testId: {$test['description']} ($stateName)" => [
                        $test['input'],                 // input
                        $tokens,                        // output
                        self::STATE_MAP[$stateName],    // initial state
                        $test['lastStartTag'] ?? null,  // open element, if any
                        $test['errors'] ?? [],          // errors, if any
                    ];
                }
            }
        }
    }

    /**
     * Reads a test file and returns the list of tests it contains
     *
     * @param string $path The path of the test file
     * @return array The members of the file's "tests" list or, failing that, its "xmlViolationTests" list
     * @throws \Exception if the file cannot be read, is not valid JSON, or has neither list
     */
    private function readTestFile(string $path): array {
        $json = file_get_contents($path);
        if ($json === false) {
            throw new \Exception("Test file '$path' could not be read");
        }
        $testSet = json_decode($json, true);
        if (!is_array($testSet)) {
            throw new \Exception("Test file '$path' does not contain valid JSON");
        }
        $tests = $testSet['tests'] ?? $testSet['xmlViolationTests'] ?? null;
        if (!is_array($tests)) {
            throw new \Exception("Test file '$path' does not contain a list of tests");
        }
        return $tests;
    }

    /**
     * Converts one entry of a test's expected output into the equivalent token object
     *
     * @param mixed $token The output entry; its first member names the token type
     * @return object The DOCTYPE, start tag, end tag, character, or comment token
     * @throws \Exception if the token type is not one of the five supported types
     */
    private function buildExpectedToken($token) {
        switch ($token[0]) {
            case "DOCTYPE":
                $t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]);
                // the test file's flag is the inverse of the token's force-quirks flag
                $t->forceQuirks = !$token[4];
                return $t;
            case "StartTag":
                $t = new StartTagToken($token[1], $token[3] ?? false);
                foreach ($token[2] ?? [] as $name => $value) {
                    // array keys which look like integers must be cast back to strings
                    $t->setAttribute((string) $name, $value);
                }
                return $t;
            case "EndTag":
                return new EndTagToken($token[1]);
            case "Character":
                return new CharacterToken($token[1]);
            case "Comment":
                return new CommentToken($token[1]);
            default:
                throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter");
        }
    }

    /**
     * Adjusts the expectations of specific tests in place
     *
     * Tests are identified by the combination of their input and their
     * complete list of initial states, compared strictly; any test which is
     * not recognized is left untouched.
     *
     * @param array $test The test to examine, which must have 'input' and 'initialStates' members
     */
    protected function patchTest(&$test): void {
        $id = [$test['input'], $test['initialStates']];
        if ($id === ["<!\u{B}", ["Data state"]]) {
            // test emits input stream error first despite peeking
            $test['errors'] = array_reverse($test['errors']);
        } elseif ($id === ["", ["CDATA section state"]]) {
            // eof-in-<whatever> positions in some tests don't make sense
            // https://github.com/html5lib/html5lib-tests/issues/125
            // there is no position 2
            $test['errors'][0]['col']--;
        } elseif ($id === ["\u{A}", ["CDATA section state"]]) {
            // the line break is, for some reason, not counted in the test
            $test['errors'][0]['line']++;
            $test['errors'][0]['col'] = 1;
        } elseif (in_array($id, [
            ["<!----!\r\n>", ["Data state"]],
            ["<!----!\n>", ["Data state"]],
            ["<!----!\r>", ["Data state"]],
        ], true)) {
            // the line break is, for some reason, not counted in the test
            $test['errors'][0]['line']++;
            $test['errors'][0]['col'] = 2;
        } elseif ($id === ["<!----! >", ["Data state"]]) {
            $test['errors'][0]['col']++;
        } elseif (in_array($id, [
            [hex2bin("f4808080"), ["CDATA section state"]],
            [hex2bin("3bf4808080"), ["CDATA section state"]],
        ], true)) {
            // malpaired surrogates count as two characters
            $test['errors'][0]['col']++;
        }
    }
}
Basic skeleton of test suite 5 years ago			`<?php`
			`declare(strict_types=1);`
			`namespace dW\HTML5\TestCase;`

			`use dW\HTML5\Data;`
			`use dW\HTML5\EOFToken;`
			`use dW\HTML5\OpenElementsStack;`
Rewrite how parse errors are handled Everything which can emit a parse error should have the error handler and data stream as properties and use the ParseErrorEmitter trait to avoid complicating the task of actually producing an error. Normally the Parser would be expected to set the error handler before it begins (this commit does not do this) and unset it after it's done. Alternatively, the entire means of reporting errors can now be easily replaced. 5 years ago			`use dW\HTML5\ParseError;`
Basic skeleton of test suite 5 years ago			`use dW\HTML5\Tokenizer;`
Remove unnecessary test abstraction 4 years ago			`use dW\HTML5\CharacterToken;`
			`use dW\HTML5\CommentToken;`
			`use dW\HTML5\DOCTYPEToken;`
			`use dW\HTML5\EndTagToken;`
			`use dW\HTML5\StartTagToken;`
Basic skeleton of test suite 5 years ago
Tidying 4 years ago			`/**`
			`* @covers \dW\HTML5\Tokenizer`
			`* @covers \dW\HTML5\CharacterToken`
			`* @covers \dW\HTML5\CommentToken`
			`* @covers \dW\HTML5\DataToken`
			`* @covers \dW\HTML5\TagToken`
			`* @covers \dW\HTML5\DOCTYPEToken`
			`* @covers \dW\HTML5\TokenAttr`
			`*/`
Remove unnecessary test abstraction 4 years ago			`class TestTokenizer extends \PHPUnit\Framework\TestCase {`
			`const STATE_MAP = [`
			`'Data state' => Tokenizer::DATA_STATE,`
			`'PLAINTEXT state' => Tokenizer::PLAINTEXT_STATE,`
			`'RCDATA state' => Tokenizer::RCDATA_STATE,`
			`'RAWTEXT state' => Tokenizer::RAWTEXT_STATE,`
			`'Script data state' => Tokenizer::SCRIPT_DATA_STATE,`
			`'CDATA section state' => Tokenizer::CDATA_SECTION_STATE,`
			`];`
Basic endless loop helper 5 years ago
Basic skeleton of test suite 5 years ago			`/** @dataProvider provideStandardTokenizerTests */`
Address errors and omissions in error emission One test still fails, though it is arguably immaterial. This does not account for line and column number, which are known to be mostly off by one. 4 years ago			`public function testStandardTokenizerTests(string $input, array $expected, int $state, string $open = null, array $expErrors) {`
			`// convert parse error constants into standard symbols in specification`
			`$errorMap = array_map(function($str) {`
			`return strtolower(str_replace("_", "-", $str));`
			`}, array_flip(array_filter((new \ReflectionClass(ParseError::class))->getConstants(), function($v) {`
			`return is_int($v);`
			`})));`
			`// create a stub error handler which collects parse errors`
			`$errors = [];`
			`$errorHandler = $this->createStub(ParseError::class);`
			`$errorHandler->method("emit")->willReturnCallback(function($file, $line, $col, $code) use (&$errors, $errorMap) {`
			`$errors[] = ['code' => $errorMap[$code], 'line' => $line, 'col' => $col];`
			`return true;`
			`});`
Silence parse errors for now 5 years ago			`// initialize a stack of open elements, possibly with an open element`
Basic skeleton of test suite 5 years ago			`$stack = new OpenElementsStack();`
			`if ($open) {`
Add more tests 5 years ago			`$stack[] = (new \DOMDocument)->createElement($open);`
Basic skeleton of test suite 5 years ago			`}`
Silence parse errors for now 5 years ago			`// initialize the data stream and tokenizer`
Remove unnecessary test abstraction 4 years ago			`$data = new Data($input, "STDIN", $errorHandler, "UTF-8");`
Rewrite how parse errors are handled Everything which can emit a parse error should have the error handler and data stream as properties and use the ParseErrorEmitter trait to avoid complicating the task of actually producing an error. Normally the Parser would be expected to set the error handler before it begins (this commit does not do this) and unset it after it's done. Alternatively, the entire means of reporting errors can now be easily replaced. 5 years ago			`$tokenizer = new Tokenizer($data, $stack, $errorHandler);`
Basic skeleton of test suite 5 years ago			`$tokenizer->state = $state;`
Silence parse errors for now 5 years ago			`// perform the test`
Basic skeleton of test suite 5 years ago			`$actual = [];`
Prep for character references - Add missing state constants - Break all existing deviations for character refs - Add assertions before use of $attribute - Also fix DOCTYPE state 5 years ago			`try {`
			`do {`
			`$t = $tokenizer->createToken();`
			`if (!($t instanceof EOFToken)) {`
			`$actual[] = $t;`
			`}`
			`} while (!($t instanceof EOFToken));`
			`} finally {`
			`$actual = $this->combineCharacterTokens($actual);`
			`$this->assertEquals($expected, $actual, $tokenizer->debugLog);`
Address errors and omissions in error emission One test still fails, though it is arguably immaterial. This does not account for line and column number, which are known to be mostly off by one. 4 years ago			`$this->assertEquals($expErrors, $errors, $tokenizer->debugLog);`
Prep for character references - Add missing state constants - Break all existing deviations for character refs - Add assertions before use of $attribute - Also fix DOCTYPE state 5 years ago			`}`
Basic skeleton of test suite 5 years ago			`}`

			`public function provideStandardTokenizerTests() {`
Add more tests 5 years ago			`$tests = [];`
Include pending spec changes tests 5 years ago			`$blacklist = ["xmlViolation.test"];`
Update intl dependency 5 years ago			`foreach (new \GlobIterator(\dW\HTML5\BASE."tests/html5lib-tests/tokenizer/*.test", \FilesystemIterator::SKIP_DOTS \| \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) {`
Add more tests 5 years ago			`if (!in_array(basename($file), $blacklist)) {`
			`$tests[] = $file;`
			`}`
			`}`
			`return $this->makeTokenTests(...$tests);`
Basic skeleton of test suite 5 years ago			`}`
Remove unnecessary test abstraction 4 years ago
			`protected function reverseDoubleEscape(string $str): string {`
			`if (preg_match_all("/\\\\u([0-9a-f]{4})/i", $str, $matches)) {`
			`for ($a = 0; $a < sizeof($matches[0]); $a++) {`
			`$esc = $matches[0][$a];`
			`$chr = \MensBeam\Intl\Encoding\UTF8::encode(hexdec($matches[1][$a]));`
			`$str = str_replace($esc, $chr, $str);`
			`}`
			`}`
			`return $str;`
			`}`

			`protected function combineCharacterTokens(array $tokens) : array {`
			`$out = [];`
			`$pending = null;`
			`foreach ($tokens as $t) {`
			`if ($t instanceof CharacterToken) {`
			`if (!$pending) {`
			`$pending = $t;`
			`} else {`
			`$pending->data .= $t->data;`
			`}`
			`} else {`
			`if ($pending) {`
			`$out[] = $pending;`
			`$pending = null;`
			`}`
			`$out[] = $t;`
			`}`
			`}`
			`if ($pending) {`
			`$out[] = $pending;`
			`}`
			`return $out;`
			`}`

			`protected function makeTokenTests(string ...$file): iterable {`
			`foreach ($file as $path) {`
			`$f = basename($path);`
			`$testSet = json_decode(file_get_contents($path), true);`
			`foreach ($testSet['tests'] ?? $testSet['xmlViolationTests'] as $index => $test) {`
			`$testId = "$f #$index";`
			`if ($test['doubleEscaped'] ?? false) {`
			`$test['input'] = $this->reverseDoubleEscape($test['input']);`
			`for ($a = 0; $a < sizeof($test['output']); $a++) {`
			`for ($b = 0; $b < sizeof($test['output'][$a]); $b++) {`
			`if (is_string($test['output'][$a][$b])) {`
			`$test['output'][$a][$b] = $this->reverseDoubleEscape($test['output'][$a][$b]);`
			`}`
			`}`
			`}`
			`}`
			`$test['initialStates'] = $test['initialStates'] ?? ["Data state"];`
			`// check if a test needs a patch due to trivial differences in implementation`
Patch tests based on input not unstable identifier 4 years ago			`$this->patchTest($test);`
Remove unnecessary test abstraction 4 years ago			`for ($a = 0; $a < sizeof($test['initialStates']); $a++) {`
			`$tokens = [];`
			`foreach ($test['output'] as $token) {`
			`switch ($token[0]) {`
			`case "DOCTYPE":`
			`$t = new DOCTYPEToken((string) $token[1], (string) $token[2], (string) $token[3]);`
			`$t->forceQuirks = !$token[4];`
			`$tokens[] = $t;`
			`break;`
			`case "StartTag":`
			`$t = new StartTagToken($token[1], $token[3] ?? false);`
			`foreach ($token[2] ?? [] as $name => $value) {`
			`$t->setAttribute((string) $name, $value);`
			`}`
			`$tokens[] = $t;`
			`break;`
			`case "EndTag":`
			`$tokens[] = new EndTagToken($token[1]);`
			`break;`
			`case "Character":`
			`$tokens[] = new CharacterToken($token[1]);`
			`break;`
			`case "Comment":`
			`$tokens[] = new CommentToken($token[1]);`
			`break;`
			`default:`
			`throw new \Exception("Token type '{$token[0]}' not implemented in standard test interpreter");`
			`}`
			`unset($t);`
			`}`
			`yield "$testId: {$test['description']} ({$test['initialStates'][$a]})" => [`
			`$test['input'], // input`
			`$tokens, // output`
			`self::STATE_MAP[$test['initialStates'][$a]], // initial state`
			`$test['lastStartTag'] ?? null, // open element, if any`
			`$test['errors'] ?? [], // errors, if any`
			`];`
			`}`
			`}`
			`}`
			`}`

Patch tests based on input not unstable identifier 4 years ago			`protected function patchTest(&$test): void {`
			`$id = [$test['input'], $test['initialStates']];`
Remove unnecessary test abstraction 4 years ago			`switch ($id) {`
			`// test emits input stream error first despite peeking`
Patch tests based on input not unstable identifier 4 years ago			`case ["<!\u{B}", ["Data state"]]:`
Remove unnecessary test abstraction 4 years ago			`$test['errors'] = array_reverse($test['errors']);`
			`break;`
Patch tests based on input not unstable identifier 4 years ago			`// eof-in-<whatever> positions in some tests don't make sense`
Remove unnecessary test abstraction 4 years ago			`// https://github.com/html5lib/html5lib-tests/issues/125`
Patch tests based on input not unstable identifier 4 years ago			`case ["", ["CDATA section state"]]:`
			`// there is no position 2`
			`$test['errors'][0]['col']--;`
Remove unnecessary test abstraction 4 years ago			`break;`
Patch tests based on input not unstable identifier 4 years ago			`case ["\u{A}", ["CDATA section state"]]:`
			`// the line break is, for some reason, not counted in the test`
			`$test['errors'][0]['line']++;`
			`$test['errors'][0]['col'] = 1;`
			`break;`
			`case ["<!----!\r\n>", ["Data state"]]:`
			`case ["<!----!\n>", ["Data state"]]:`
			`case ["<!----!\r>", ["Data state"]]:`
			`// the line break is, for some reason, not counted in the test`
			`$test['errors'][0]['line']++;`
Remove unnecessary test abstraction 4 years ago			`$test['errors'][0]['col'] = 2;`
			`break;`
Patch tests based on input not unstable identifier 4 years ago			`case ["<!----! >", ["Data state"]]:`
			`$test['errors'][0]['col']++;`
			`break;`
			`case [hex2bin("f4808080"), ["CDATA section state"]]:`
			`case [hex2bin("3bf4808080"), ["CDATA section state"]]:`
			`// malpaired surrogates count as two characters`
			`$test['errors'][0]['col']++;`
			`break;`
Remove unnecessary test abstraction 4 years ago			`}`
			`}`
Basic skeleton of test suite 5 years ago			`}`