A modern, accurate HTML parser and serializer for PHP
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
4.8 KiB

<?php
/** @license MIT
* Copyright 2017 , Dustin Wilson, J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\HTML\TestCase;
use MensBeam\HTML\Parser;
use MensBeam\HTML\Parser\Output;
use MensBeam\HTML\Parser\Config;
/**
* @covers \MensBeam\HTML\Parser
* @covers \MensBeam\HTML\Parser\TreeConstructor
* @covers \MensBeam\HTML\Parser\TagToken
* @covers \MensBeam\HTML\Parser\Data::changeEncoding
*/
class TestEncodingChange extends \PHPUnit\Framework\TestCase {
    /**
     * Parses a document whose encoding is declared with `<meta charset=...>`
     * and checks the encoding reported by the parser and the decoded title.
     *
     * @dataProvider provideEncodingChanges
     */
    public function testChangeEncodingWithCharset(string $assumedEncoding, string $statedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
        $this->assertEncodingChange("<meta charset=$statedEncoding>", $assumedEncoding, $actualEncoding, $titleBytes, $titleUTF8);
    }

    /**
     * Same scenario as the charset test, but the encoding is declared with a
     * `<meta http-equiv=... content=...>` tag (attribute value in mixed case).
     *
     * @dataProvider provideEncodingChanges
     */
    public function testChangeEncodingWithHttpEquiv(string $assumedEncoding, string $statedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
        $this->assertEncodingChange("<meta http-equiv=CoNtenT-TYpe content='text/html;charset=$statedEncoding'>", $assumedEncoding, $actualEncoding, $titleBytes, $titleUTF8);
    }

    /**
     * Builds the test document around the given meta tag, parses it with the
     * assumed encoding as the configured fallback, and asserts the outcome.
     *
     * @param string $meta            The complete `<meta>` tag placed after the title
     * @param string $assumedEncoding Value assigned to Config::$encodingFallback
     * @param string $actualEncoding  Encoding expected in Output::$encoding
     * @param string $titleBytes      Raw bytes placed inside the `<title>` element
     * @param string $titleUTF8       Expected text content of the title, as UTF-8
     */
    private function assertEncodingChange(string $meta, string $assumedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
        // NOTE(review): the 1024 spaces presumably push the <meta> past the
        // parser's encoding pre-scan so the change happens mid-parse; confirm
        $in = "<!DOCTYPE html><html><head>".str_repeat(" ", 1024)."<title>$titleBytes</title>$meta</head><body></body></html>";
        // if the input is some form of UTF-16, add the null bytes in the correct places;
        // the pattern has no "u" modifier, so every single byte gets padded
        if ($assumedEncoding === "UTF-16BE") {
            $in = preg_replace("/(.)/s", "\x00$1", $in);
        } elseif ($assumedEncoding === "UTF-16LE") {
            $in = preg_replace("/(.)/s", "$1\x00", $in);
        }
        // set up the test
        $conf = new Config;
        $conf->encodingFallback = $assumedEncoding;
        $out = Parser::parse($in, "", $conf);
        $this->assertInstanceOf(Output::class, $out);
        // check the output
        $this->assertSame($actualEncoding, $out->encoding);
        $this->assertSame($titleUTF8, $out->document->getElementsByTagName("title")[0]->textContent);
    }

    /**
     * Data sets shared by both encoding-change tests.
     *
     * Each row is: assumed (fallback) encoding, encoding stated in the meta
     * tag, encoding the parser is expected to report, raw title bytes, and
     * the expected title text as UTF-8.
     *
     * Declared static because newer PHPUnit versions require static data
     * providers; older versions accept static providers as well.
     *
     * @return iterable<int, array{0: string, 1: string, 2: string, 3: string, 4: string}>
     */
    public static function provideEncodingChanges(): iterable {
        return [
            ["windows-1252", "",               "windows-1252", "ASCII title",                  "ASCII title"],
            ["windows-1252", "UTF-8",          "UTF-8",        "ASCII title",                  "ASCII title"],
            ["windows-1252", "UTF-16BE",       "UTF-8",        "ASCII title",                  "ASCII title"],
            ["windows-1252", "UTF-16LE",       "UTF-8",        "ASCII title",                  "ASCII title"],
            ["UTF-8",        "x-user-defined", "windows-1252", "ASCII title",                  "ASCII title"],
            ["windows-1252", "UTF-8",          "UTF-8",        "H\xC3\xA9",                    "H\u{E9}"],
            ["UTF-8",        "UTF-8",          "UTF-8",        "H\xC3\xA9",                    "H\u{E9}"],
            ["UTF-16LE",     "UTF-8",          "UTF-16LE",     "ASCII title",                  "ASCII title"],
            ["UTF-16BE",     "UTF-8",          "UTF-16BE",     "ASCII title",                  "ASCII title"],
            ["windows-1252", "bogus",          "windows-1252", "H\xE9",                        "H\u{E9}"],
            ["ISO-2022-JP",  "ISO-2022-JP",    "ISO-2022-JP",  "\x1B\x28\x49\x56\x1B\x28\x42", "\u{FF96}"],
            ["ISO-2022-JP",  "UTF-8",          "UTF-8",        "\x1B\x28\x49\x56\x1B\x28\x42", "\u{1B}(IV\u{1B}(B"],
            ["UTF-8",        "ISO-2022-JP",    "ISO-2022-JP",  "ASCII title",                  "ASCII title"],
            ["UTF-8",        "UTF-8",          "UTF-8",        "\x0E",                         "\u{E}"],
            ["UTF-8",        "UTF-8",          "UTF-8",        "\x0F",                         "\u{F}"],
            ["UTF-8",        "UTF-8",          "UTF-8",        "\x1B",                         "\u{1B}"],
            ["UTF-8",        "ISO-2022-JP",    "ISO-2022-JP",  "\x0E",                         "\u{FFFD}"],
            ["UTF-8",        "ISO-2022-JP",    "ISO-2022-JP",  "\x0F",                         "\u{FFFD}"],
            ["UTF-8",        "ISO-2022-JP",    "ISO-2022-JP",  "\x1B",                         "\u{FFFD}"],
        ];
    }
}