From b4d2f21199021b38e0d46d87a9c05ed02d3dcb26 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 17 Mar 2021 15:35:44 -0400 Subject: [PATCH] Add missing tests for charset pre-scan --- .gitattributes | 2 ++ lib/Charset.php | 6 ++--- tests/cases/TestCharset.php | 7 ++++-- tests/cases/encoding/mensbeam01.dat | 35 +++++++++++++++++++++++++++++ tests/phpunit.dist.xml | 4 +++- 5 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 .gitattributes create mode 100644 tests/cases/encoding/mensbeam01.dat diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..897831b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.dat -text diff +*.test -text diff diff --git a/lib/Charset.php b/lib/Charset.php index b00accc..fa02cca 100644 --- a/lib/Charset.php +++ b/lib/Charset.php @@ -265,7 +265,7 @@ abstract class Charset { spaces: # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), # or 0x20 (SP) then advance position to the next byte, then, repeat this step. - while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) { + while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])) { $pos++; } $char = @$s[$pos]; @@ -284,7 +284,7 @@ abstract class Charset { value: # If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), # or 0x20 (SP) then advance position to the next byte, then, repeat this step. - while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " ", "/"])) { + while (in_array(@$s[$pos], ["\x09", "\x0A", "\x0C", "\x0D", " "])) { $pos++; } $char = @$s[$pos]; @@ -424,5 +424,5 @@ abstract class Charset { return self::fromCharset(substr($s, $pos, $size)); } } - } + } // @codeCoverageIgnore } diff --git a/tests/cases/TestCharset.php b/tests/cases/TestCharset.php index fd0b907..257879a 100644 --- a/tests/cases/TestCharset.php +++ b/tests/cases/TestCharset.php @@ -86,7 +86,10 @@ class TestCharset extends \PHPUnit\Framework\TestCase { public function provideStandardEncodingTests() { $tests = []; $blacklist = []; - foreach (new \GlobIterator(\dW\HTML5\BASE."tests/html5lib-tests/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME) as $file) { + $files = new \AppendIterator(); + $files->append(new \GlobIterator(\dW\HTML5\BASE."tests/html5lib-tests/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); + $files->append(new \GlobIterator(\dW\HTML5\BASE."tests/cases/encoding/*.dat", \FilesystemIterator::SKIP_DOTS | \FilesystemIterator::CURRENT_AS_PATHNAME)); + foreach ($files as $file) { if (!in_array(basename($file), $blacklist)) { $tests[] = $file; } @@ -111,7 +114,7 @@ class TestCharset extends \PHPUnit\Framework\TestCase { if ($l >= $end) { return; } - yield $testId => [trim($data), trim($test[$l++])]; + yield $testId => [trim($data, "\r\n"), trim($test[$l++])]; } } } diff --git a/tests/cases/encoding/mensbeam01.dat b/tests/cases/encoding/mensbeam01.dat new file mode 100644 index 0000000..161e58c --- /dev/null +++ b/tests/cases/encoding/mensbeam01.dat @@ -0,0 +1,35 @@ +#data + + +#encoding +Windows-1252 + +#data + + +#encoding +UTF-8 + +#data + + + +#encoding +Windows-1252 + +#data + + +#encoding +Windows-1252 + +#data + + +#encoding +UTF-8 diff --git a/tests/phpunit.dist.xml b/tests/phpunit.dist.xml index cccd836..0e8d06e 100644 --- a/tests/phpunit.dist.xml +++ b/tests/phpunit.dist.xml @@ -16,8 +16,10 @@ - + cases/TestCharset.php + + cases/TestTokenizer.php