You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
4.8 KiB
81 lines
4.8 KiB
<?php
|
|
/** @license MIT
|
|
* Copyright 2017 , Dustin Wilson, J. King et al.
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
declare(strict_types=1);
|
|
namespace MensBeam\HTML\TestCase;
|
|
|
|
use MensBeam\HTML\Parser;
|
|
use MensBeam\HTML\Parser\Output;
|
|
use MensBeam\HTML\Parser\Config;
|
|
|
|
/**
|
|
* @covers \MensBeam\HTML\Parser
|
|
* @covers \MensBeam\HTML\Parser\TreeConstructor
|
|
* @covers \MensBeam\HTML\Parser\TagToken
|
|
* @covers \MensBeam\HTML\Parser\Data::changeEncoding
|
|
*/
|
|
class TestEncodingChange extends \PHPUnit\Framework\TestCase {
|
|
/** @dataProvider provideEncodingChanges */
|
|
public function testChangeEncodingWithCharset(string $assumedEncoding, string $statedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
|
|
$in = "<!DOCTYPE html><html><head>".str_repeat(" ", 1024)."<title>$titleBytes</title><meta charset=$statedEncoding></head><body></body></html>";
|
|
// if the input is some form of UTF-16, add the null bytes in the correct places
|
|
if ($assumedEncoding === "UTF-16BE") {
|
|
$in = preg_replace("/(.)/s", "\x00$1", $in);
|
|
} elseif ($assumedEncoding === "UTF-16LE") {
|
|
$in = preg_replace("/(.)/s", "$1\x00", $in);
|
|
}
|
|
// set up the test
|
|
$conf = new Config;
|
|
$conf->encodingFallback = $assumedEncoding;
|
|
$out = Parser::parse($in, "", $conf);
|
|
$this->assertInstanceOf(Output::class, $out);
|
|
// check the output
|
|
$this->assertSame($actualEncoding, $out->encoding);
|
|
$this->assertSame($titleUTF8, $out->document->getElementsByTagName("title")[0]->textContent);
|
|
}
|
|
|
|
/** @dataProvider provideEncodingChanges */
|
|
public function testChangeEncodingWithHttpEquiv(string $assumedEncoding, string $statedEncoding, string $actualEncoding, string $titleBytes, string $titleUTF8): void {
|
|
$in = "<!DOCTYPE html><html><head>".str_repeat(" ", 1024)."<title>$titleBytes</title><meta http-equiv=CoNtenT-TYpe content='text/html;charset=$statedEncoding'></head><body></body></html>";
|
|
// if the input is some form of UTF-16, add the null bytes in the correct places
|
|
if ($assumedEncoding === "UTF-16BE") {
|
|
$in = preg_replace("/(.)/s", "\x00$1", $in);
|
|
} elseif ($assumedEncoding === "UTF-16LE") {
|
|
$in = preg_replace("/(.)/s", "$1\x00", $in);
|
|
}
|
|
// set up the test
|
|
$conf = new Config;
|
|
$conf->encodingFallback = $assumedEncoding;
|
|
$out = Parser::parse($in, "", $conf);
|
|
$this->assertInstanceOf(Output::class, $out);
|
|
// check the output
|
|
$this->assertSame($actualEncoding, $out->encoding);
|
|
$this->assertSame($titleUTF8, $out->document->getElementsByTagName("title")[0]->textContent);
|
|
}
|
|
|
|
public function provideEncodingChanges(): iterable {
|
|
return [
|
|
["windows-1252", "", "windows-1252", "ASCII title", "ASCII title"],
|
|
["windows-1252", "UTF-8", "UTF-8", "ASCII title", "ASCII title"],
|
|
["windows-1252", "UTF-16BE", "UTF-8", "ASCII title", "ASCII title"],
|
|
["windows-1252", "UTF-16LE", "UTF-8", "ASCII title", "ASCII title"],
|
|
["UTF-8", "x-user-defined", "windows-1252", "ASCII title", "ASCII title"],
|
|
["windows-1252", "UTF-8", "UTF-8", "H\xC3\xA9", "H\u{E9}"],
|
|
["UTF-8", "UTF-8", "UTF-8", "H\xC3\xA9", "H\u{E9}"],
|
|
["UTF-16LE", "UTF-8", "UTF-16LE", "ASCII title", "ASCII title"],
|
|
["UTF-16BE", "UTF-8", "UTF-16BE", "ASCII title", "ASCII title"],
|
|
["windows-1252", "bogus", "windows-1252", "H\xE9", "H\u{E9}"],
|
|
["ISO-2022-JP", "ISO-2022-JP", "ISO-2022-JP", "\x1B\x28\x49\x56\x1B\x28\x42", "\u{FF96}"],
|
|
["ISO-2022-JP", "UTF-8", "UTF-8", "\x1B\x28\x49\x56\x1B\x28\x42", "\u{1B}(IV\u{1B}(B"],
|
|
["UTF-8", "ISO-2022-JP", "ISO-2022-JP", "ASCII title", "ASCII title"],
|
|
["UTF-8", "UTF-8", "UTF-8", "\x0E", "\u{E}"],
|
|
["UTF-8", "UTF-8", "UTF-8", "\x0F", "\u{F}"],
|
|
["UTF-8", "UTF-8", "UTF-8", "\x1B", "\u{1B}"],
|
|
["UTF-8", "ISO-2022-JP", "ISO-2022-JP", "\x0E", "\u{FFFD}"],
|
|
["UTF-8", "ISO-2022-JP", "ISO-2022-JP", "\x0F", "\u{FFFD}"],
|
|
["UTF-8", "ISO-2022-JP", "ISO-2022-JP", "\x1B", "\u{FFFD}"],
|
|
];
|
|
}
|
|
}
|
|
|