Browse Source

Add some encoding-change tests and fix some bugs

ns
J. King 3 years ago
parent
commit
d0de04d870
  1. 4
      lib/Parser.php
  2. 26
      lib/Parser/Data.php
  3. 12
      lib/Parser/Token.php
  4. 4
      lib/Parser/TreeBuilder.php
  5. 46
      tests/cases/TestEncodingChange.php
  6. 3
      tests/phpunit.dist.xml

4
lib/Parser.php

@ -56,10 +56,12 @@ class Parser {
} catch (EncodingChangeException $e) {
// We are supposed to reparse with a new encoding
// Clear out the document
if ($document->doctype) {
$document->removeChild($document->doctype);
}
while ($document->hasChildNodes()) {
$document->removeChild($document->firstChild);
}
$document->removeChild($document->doctype);
// save the target encoding
$encoding = $decoder->encoding;
// Destroy our existing objects

26
lib/Parser/Data.php

@ -289,9 +289,11 @@ class Data {
}
public function changeEncoding(string $encoding): void {
$newEncoding = $encoding;
$oldEncoding = $this->encoding;
assert(!$this->encodingCertain, new \Exception("Encoding is already certain"));
assert(!!$this->encoding && $this->encoding === Encoding::matchLabel($this->encoding), new \Exception("Current encoding '{$this->encoding}' is invalid"));
assert($encoding === Encoding::matchLabel($encoding), new \Exception("Invalid encoding name '$encoding'"));
assert($oldEncoding === Charset::fromCharset($oldEncoding), new \Exception("Current encoding '{$this->encoding}' is invalid"));
assert($newEncoding === Charset::fromCharset($newEncoding), new \Exception("Invalid encoding name '$encoding'"));
# When the parser requires the user agent to change the encoding,
# it must run the following steps. This might happen if the encoding
# sniffing algorithm described above failed to find a character encoding,
@ -302,17 +304,17 @@ class Data {
# stream is UTF-16BE/LE, then set the confidence to certain and
# return. The new encoding is ignored; if it was anything but the
# same encoding, then it would be clearly incorrect.
if (in_array($this->encoding, ["UTF16-LE", "UTF-16BE"])) {
if (in_array($oldEncoding, ["UTF16-LE", "UTF-16BE"])) {
$this->encodingCertain = true;
return;
}
# If the new encoding is UTF-16BE/LE, then change it to UTF-8.
if (in_array($encoding, ["UTF16-LE", "UTF-16BE"])) {
$encoding = "UTF-8";
if (in_array($newEncoding, ["UTF-16LE", "UTF-16BE"])) {
$newEncoding = "UTF-8";
}
# If the new encoding is x-user-defined, then change it to windows-1252.
if ($encoding === "x-user-defined") {
$encoding = "windows-1252";
if ($newEncoding === "x-user-defined") {
$newEncoding = "windows-1252";
}
# If the new encoding is identical or equivalent to the encoding
# that is already being used to interpret the input stream, then
@ -322,7 +324,7 @@ class Data {
# pass through the parser if the first pass found that the encoding
# sniffing algorithm described in the earlier section failed to find
# the right encoding.
if ($encoding === $this->encoding) {
if ($newEncoding === $oldEncoding) {
$this->encodingCertain = true;
return;
}
@ -338,7 +340,7 @@ class Data {
// interpretation if they are all ASCII. This does require special
// handling for those encodings which are not quite ASCII-compatible
// (only ISO 2022-JP), but is relatively simple to confirm
$this->encoding = $encoding;
$this->encoding = $newEncoding;
$this->encodingCertain = true;
$bytes = $this->data->posByte();
$chars = $this->data->posChar();
@ -349,15 +351,15 @@ class Data {
} else {
$range = '[\x{00}-\x{7F}]';
}
if (preg_match("/^$range{$bytes}/s", $this->string)) {
if (preg_match('/^'.$range.'{'.$bytes.'}/s', $this->string)) {
// The bytes are the same; change the encoding, seek to the same location, and continue parsing
$this->data = Encoding::createDecoder($encoding, $this->string, false, true);
$this->data = Encoding::createDecoder($newEncoding, $this->string, false, true);
$this->data->seek($chars);
} else {
// If the bytes are not the same we have to throw everything out and start over
// The simplest way, ugly though it is, is to throw an exception to unwind all
// the way back to the invocation of the parser
$this->data = Encoding::createDecoder($encoding, $this->string, false, true);
$this->data = Encoding::createDecoder($newEncoding, $this->string, false, true);
throw new EncodingChangeException;
}
}

12
lib/Parser/Token.php

@ -78,12 +78,20 @@ abstract class TagToken extends Token {
return ($this->_getAttributeKey($name) !== null);
}
public function getAttribute(string $name) {
public function getAttribute(string $name): ?TokenAttr {
$key = $this->_getAttributeKey($name);
return (isset($this->attributes[$key])) ? $this->attributes[$key] : null;
}
private function _getAttributeKey(string $name) {
/**
 * Convenience accessor for an attribute's value.
 *
 * Looks the attribute up via getAttribute() and returns its `value`
 * property; returns null when getAttribute() yields nothing.
 */
public function getAttributeValue(string $name): ?string {
    $found = $this->getAttribute($name);
    return $found ? $found->value : null;
}
private function _getAttributeKey(string $name): ?int {
foreach ($this->attributes as $key => $a) {
if ($a->name === $name) {
return $key;

4
lib/Parser/TreeBuilder.php

@ -1611,9 +1611,9 @@ class TreeBuilder {
# and the confidence is currently tentative, then change the encoding to the
# extracted encoding.
if (!$this->data->encodingCertain) {
if ($enc = Charset::fromCharset((string) $token->getAttribute("charset"))) {
if ($enc = Charset::fromCharset((string) $token->getAttributeValue("charset"))) {
$this->data->changeEncoding($enc);
} elseif (preg_match("/^Content-Type$/i", (string) $token->getAttribute("http-equiv")) && $enc = Charset::fromMeta((string) $token->getAttribute("content"))) {
} elseif (preg_match("/^Content-Type$/i", (string) $token->getAttributeValue("http-equiv")) && $enc = Charset::fromMeta((string) $token->getAttributeValue("content"))) {
$this->data->changeEncoding($enc);
}
}

46
tests/cases/TestEncodingChange.php

@ -0,0 +1,46 @@
<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\TestCase;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\Output;
use MensBeam\HTML\Parser\Config;
/**
* @covers \MensBeam\HTML\Parser
* @covers \MensBeam\HTML\Parser\TreeBuilder
* @covers \MensBeam\HTML\Parser\Data::changeEncoding
*/
class TestEncodingChange extends \PHPUnit\Framework\TestCase {
    /**
     * Parses a document whose <meta charset> appears after 1024 bytes of
     * padding and checks the encoding and title reported by the parser.
     *
     * @dataProvider provideEncodingChanges
     *
     * @param string $assumedEncoding The fallback encoding handed to the parser via Config
     * @param string $statedEncoding  The label written into the <meta charset> attribute
     * @param string $actualEncoding  The encoding the parser output is expected to report
     * @param string $titleBytes      The raw bytes placed inside <title>
     * @param string $titleUTF8       The expected title text, as UTF-8
     */
    public function testChangeEncodingWithCharset(string $assumedEncoding, string $statedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
        $in = "<!DOCTYPE html><html><head>".str_repeat(" ", 1024)."<title>$titleBytes</title><meta charset=$statedEncoding></head><body></body></html>";
        // if the input is some form of UTF-16, add the null bytes in the correct places
        if ($assumedEncoding === "UTF-16BE") {
            $in = preg_replace("/(.)/s", "\0$1", $in);
        } elseif ($assumedEncoding === "UTF-16LE") {
            // the label is "UTF-16LE"; the previous spelling "UTF16-LE" could never match
            $in = preg_replace("/(.)/s", "$1\0", $in);
        }
        // set up the test
        $conf = new Config;
        $conf->encodingFallback = $assumedEncoding;
        $out = Parser::parse($in, "", null, null, null, $conf);
        $this->assertInstanceOf(Output::class, $out);
        // check the output
        $this->assertSame($actualEncoding, $out->encoding);
        $this->assertSame($titleUTF8, $out->document->getElementsByTagName("title")[0]->textContent);
    }

    /**
     * Supplies rows of [assumed encoding, stated encoding, expected encoding,
     * title bytes, expected UTF-8 title] for testChangeEncodingWithCharset().
     */
    public function provideEncodingChanges(): iterable {
        return [
            "windows-1252 to UTF-8, ASCII title"              => ["windows-1252", "UTF-8",    "UTF-8", "ASCII title", "ASCII title"],
            "windows-1252 to UTF-16BE (becomes UTF-8)"        => ["windows-1252", "UTF-16BE", "UTF-8", "ASCII title", "ASCII title"],
            "windows-1252 to UTF-16LE (becomes UTF-8)"        => ["windows-1252", "UTF-16LE", "UTF-8", "ASCII title", "ASCII title"],
            "windows-1252 to UTF-8, non-ASCII title"          => ["windows-1252", "UTF-8",    "UTF-8", "H\xC3\xA9",   "H\u{E9}"],
        ];
    }
}

3
tests/phpunit.dist.xml

@ -28,5 +28,8 @@
<testsuite name="Parser">
<file>cases/TestParser.php</file>
</testsuite>
<testsuite name="Encoding change">
<file>cases/TestEncodingChange.php</file>
</testsuite>
</testsuites>
</phpunit>

Loading…
Cancel
Save