From 07d26e3f45c3a3167eb6389572419d3bda7ff5e1 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Sun, 24 Oct 2021 10:37:46 -0400 Subject: [PATCH] Add BOM handling Per specification this does not extend to GB18030 --- CHANGELOG | 8 +++++++- lib/Encoding.php | 20 +++++++++++++++++++- lib/Encoding/UTF16.php | 13 ++++++++++++- lib/Encoding/UTF8.php | 17 ++++++++++++++++- tests/cases/Encoding/TestUTF16BE.php | 21 +++++++++++---------- tests/cases/Encoding/TestUTF16LE.php | 21 +++++++++++---------- tests/cases/Encoding/TestUTF8.php | 21 +++++++++++---------- tests/cases/TestEncoding.php | 18 ++++++++++++++++++ tests/lib/DecoderTest.php | 21 +++++++++++++++------ 9 files changed, 120 insertions(+), 40 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 139d860..928be85 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,10 @@ -Version 0.9.0 (2021-03-25) +Version 0.9.1 (2021-10-24) +========================== + +Bug fixes +- Correctly skip byte order marks +- Detect byte order marks in \MensBeam\Intl\Encoding::createDecoder() + ========================== New features: diff --git a/lib/Encoding.php b/lib/Encoding.php index a2ff2c4..2ed8b8f 100644 --- a/lib/Encoding.php +++ b/lib/Encoding.php @@ -16,6 +16,8 @@ abstract class Encoding { protected const NAME_MAP = 
['Big5'=>\MensBeam\Intl\Encoding\Big5::class,'EUC-JP'=>\MensBeam\Intl\Encoding\EUCJP::class,'EUC-KR'=>\MensBeam\Intl\Encoding\EUCKR::class,'gb18030'=>\MensBeam\Intl\Encoding\GB18030::class,'GBK'=>\MensBeam\Intl\Encoding\GBK::class,'IBM866'=>\MensBeam\Intl\Encoding\IBM866::class,'ISO-2022-JP'=>\MensBeam\Intl\Encoding\ISO2022JP::class,'ISO-8859-10'=>\MensBeam\Intl\Encoding\ISO885910::class,'ISO-8859-13'=>\MensBeam\Intl\Encoding\ISO885913::class,'ISO-8859-14'=>\MensBeam\Intl\Encoding\ISO885914::class,'ISO-8859-15'=>\MensBeam\Intl\Encoding\ISO885915::class,'ISO-8859-16'=>\MensBeam\Intl\Encoding\ISO885916::class,'ISO-8859-2'=>\MensBeam\Intl\Encoding\ISO88592::class,'ISO-8859-3'=>\MensBeam\Intl\Encoding\ISO88593::class,'ISO-8859-4'=>\MensBeam\Intl\Encoding\ISO88594::class,'ISO-8859-5'=>\MensBeam\Intl\Encoding\ISO88595::class,'ISO-8859-6'=>\MensBeam\Intl\Encoding\ISO88596::class,'ISO-8859-7'=>\MensBeam\Intl\Encoding\ISO88597::class,'ISO-8859-8'=>\MensBeam\Intl\Encoding\ISO88598::class,'ISO-8859-8-I'=>\MensBeam\Intl\Encoding\ISO88598I::class,'KOI8-R'=>\MensBeam\Intl\Encoding\KOI8R::class,'KOI8-U'=>\MensBeam\Intl\Encoding\KOI8U::class,'macintosh'=>\MensBeam\Intl\Encoding\Macintosh::class,'replacement'=>\MensBeam\Intl\Encoding\Replacement::class,'Shift_JIS'=>\MensBeam\Intl\Encoding\ShiftJIS::class,'UTF-16BE'=>\MensBeam\Intl\Encoding\UTF16BE::class,'UTF-16LE'=>\MensBeam\Intl\Encoding\UTF16LE::class,'UTF-8'=>\MensBeam\Intl\Encoding\UTF8::class,'windows-1250'=>\MensBeam\Intl\Encoding\Windows1250::class,'windows-1251'=>\MensBeam\Intl\Encoding\Windows1251::class,'windows-1252'=>\MensBeam\Intl\Encoding\Windows1252::class,'windows-1253'=>\MensBeam\Intl\Encoding\Windows1253::class,'windows-1254'=>\MensBeam\Intl\Encoding\Windows1254::class,'windows-1255'=>\MensBeam\Intl\Encoding\Windows1255::class,'windows-1256'=>\MensBeam\Intl\Encoding\Windows1256::class,'windows-1257'=>\MensBeam\Intl\Encoding\Windows1257::class,'windows-1258'=>\MensBeam\Intl\Encoding\Windows1258::class,'windows-874'
=>\MensBeam\Intl\Encoding\Windows874::class,'x-mac-cyrillic'=>\MensBeam\Intl\Encoding\XMacCyrillic::class,'x-user-defined'=>\MensBeam\Intl\Encoding\XUserDefined::class]; /** Returns a new decoder for the specified $encodingLabel operating on $data, or null if the label is not valid + * + * If $data includes a UTF-8 or UTF-16 byte order mark, this will take precedence over the specified encoding * * @param string $encodingLabel One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS" * @param string $data The string to decode @@ -25,7 +27,7 @@ abstract class Encoding { * @see https://encoding.spec.whatwg.org#names-and-labels */ public static function createDecoder(string $encodingLabel, string $data, bool $fatal = false, bool $allowSurrogates = false): ?Decoder { - $encoding = self::matchLabel($encodingLabel); + $encoding = self::matchLabel(self::sniffBOM($data) ?? $encodingLabel); if ($encoding) { $class = $encoding['class']; return new $class($data, $fatal, $allowSurrogates); @@ -77,4 +79,20 @@ abstract class Encoding { return null; } } + + /** Finds a Unicode byte order mark in a byte stream and returns the detected encoding, if any + * + * @param string $data The string to examine + */ + public static function sniffBOM(string $data): ?string { + if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + return "UTF-8"; + } elseif (substr($data, 0, 2) === "\xFE\xFF") { + return "UTF-16BE"; + } elseif (substr($data, 0, 2) === "\xFF\xFE") { + return "UTF-16LE"; + } else { + return null; + } + } } diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php index 5e4fb32..ac584d8 100644 --- a/lib/Encoding/UTF16.php +++ b/lib/Encoding/UTF16.php @@ -9,10 +9,21 @@ namespace MensBeam\Intl\Encoding; abstract class UTF16 extends AbstractEncoding { protected $selfSynchronizing = true; protected $dirtyEOF = 0; + /** @var int The size of the string's byte order mark, if any */ + protected $BOM = 0; public function __construct(string $string, bool 
$fatal = false, bool $allowSurrogates = false) { $this->stateProps[] = "dirtyEOF"; parent::__construct($string, $fatal, $allowSurrogates); + if (substr($string, 0, 2) === (static::BE ? "\xFE\xFF" : "\xFF\xFE")) { + $this->BOM = 2; + $this->posByte = 2; + } + } + + public function rewind(): void { + parent::rewind(); + $this->posByte = $this->BOM; } public function nextCode() { @@ -144,7 +155,7 @@ abstract class UTF16 extends AbstractEncoding { $this->posByte -= $this->dirtyEOF; $this->dirtyEOF = 0; } - while ($distance > 0 && $this->posByte > 0) { + while ($distance > 0 && $this->posChar > 0) { $distance--; $this->posChar--; if ($this->posByte < 4) { diff --git a/lib/Encoding/UTF8.php b/lib/Encoding/UTF8.php index b86729f..8df7604 100644 --- a/lib/Encoding/UTF8.php +++ b/lib/Encoding/UTF8.php @@ -18,6 +18,21 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder { ]; protected $selfSynchronizing = true; + /** @var int The size of the string's byte order mark, if any */ + protected $BOM = 0; + + public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) { + parent::__construct($string, $fatal, $allowSurrogates); + if (substr($string, 0, 3) === "\xEF\xBB\xBF") { + $this->BOM = 3; + $this->posByte = 3; + } + } + + public function rewind(): void { + parent::rewind(); + $this->posByte = $this->BOM; + } public function nextCode() { // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder @@ -99,7 +114,7 @@ class UTF8 extends AbstractEncoding implements Coder, Decoder { /** Implements backward seeking $distance characters */ protected function seekBack(int $distance): int { - while ($distance > 0 && $this->posByte > 0) { + while ($distance > 0 && $this->posChar > 0) { $distance--; $this->posChar--; $b = ord(@$this->string[$this->posByte - 1]); diff --git a/tests/cases/Encoding/TestUTF16BE.php b/tests/cases/Encoding/TestUTF16BE.php index 6b7d117..df5688b 100644 --- 
a/tests/cases/Encoding/TestUTF16BE.php +++ b/tests/cases/Encoding/TestUTF16BE.php @@ -11,18 +11,19 @@ use MensBeam\Intl\Encoding\UTF16BE; class TestUTF16BE extends TestUTF16LE { protected $testedClass = UTF16BE::class; /* - Char 0 U+007A (2 byte) Offset 0 - Char 1 U+00A2 (2 bytes) Offset 2 - Char 2 U+6C34 (2 bytes) Offset 4 - Char 3 U+1D11E (4 bytes) Offset 6 - Char 4 U+F8FF (2 bytes) Offset 10 - Char 5 U+10FFFD (4 bytes) Offset 12 - Char 6 U+FFFE (2 bytes) Offset 16 - End of string at char 7, offset 18 + Byte Order Mark (2 bytes) Offset 0 + Char 0 U+007A (2 bytes) Offset 2 + Char 1 U+00A2 (2 bytes) Offset 4 + Char 2 U+6C34 (2 bytes) Offset 6 + Char 3 U+1D11E (4 bytes) Offset 8 + Char 4 U+F8FF (2 bytes) Offset 12 + Char 5 U+10FFFD (4 bytes) Offset 14 + Char 6 U+FFFE (2 bytes) Offset 18 + End of string at char 7, offset 20 */ - protected $seekString = "007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE"; + protected $seekString = "FEFF 007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE"; protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]; - protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18]; + protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20]; /* This string contains an invalid character sequence sandwiched between two null characters */ protected $brokenChar = "0000 DC00 0000"; /* This string conatins the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */ diff --git a/tests/cases/Encoding/TestUTF16LE.php b/tests/cases/Encoding/TestUTF16LE.php index ea9d50d..dd8b25a 100644 --- a/tests/cases/Encoding/TestUTF16LE.php +++ b/tests/cases/Encoding/TestUTF16LE.php @@ -11,18 +11,19 @@ use MensBeam\Intl\Encoding\UTF16LE; class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest { protected $testedClass = UTF16LE::class; /* - Char 0 U+007A (2 byte) Offset 0 - Char 1 U+00A2 (2 bytes) Offset 2 - Char 2 U+6C34 (2 bytes) Offset 4 - Char 3 U+1D11E (4 bytes) Offset 6 - Char 4 U+F8FF 
(2 bytes) Offset 10 - Char 5 U+10FFFD (4 bytes) Offset 12 - Char 6 U+FFFE (2 bytes) Offset 16 - End of string at char 7, offset 18 + Byte Order Mark (2 bytes) Offset 0 + Char 0 U+007A (2 bytes) Offset 2 + Char 1 U+00A2 (2 bytes) Offset 4 + Char 2 U+6C34 (2 bytes) Offset 6 + Char 3 U+1D11E (4 bytes) Offset 8 + Char 4 U+F8FF (2 bytes) Offset 12 + Char 5 U+10FFFD (4 bytes) Offset 14 + Char 6 U+FFFE (2 bytes) Offset 18 + End of string at char 7, offset 20 */ - protected $seekString = "7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF"; + protected $seekString = "FFFE 7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF"; protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]; - protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18]; + protected $seekOffsets = [2, 4, 6, 8, 12, 14, 18, 20]; /* This string contains an invalid character sequence sandwiched between two null characters */ protected $brokenChar = "0000 00DC 0000"; /* This string conatins the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */ diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index 80685eb..b673a16 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -13,18 +13,19 @@ use MensBeam\Intl\Encoding\EncoderException; class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest { protected $testedClass = UTF8::class; /* - Char 0 U+007A (1 byte) Offset 0 - Char 1 U+00A2 (2 bytes) Offset 1 - Char 2 U+6C34 (3 bytes) Offset 3 - Char 3 U+1D11E (4 bytes) Offset 6 - Char 4 U+F8FF (3 bytes) Offset 10 - Char 5 U+10FFFD (4 bytes) Offset 13 - Char 6 U+FFFE (3 bytes) Offset 17 - End of string at char 7, offset 20 + Byte Order Mark (3 bytes) Offset 0 + Char 0 U+007A (1 byte) Offset 3 + Char 1 U+00A2 (2 bytes) Offset 4 + Char 2 U+6C34 (3 bytes) Offset 6 + Char 3 U+1D11E (4 bytes) Offset 9 + Char 4 U+F8FF (3 bytes) Offset 13 + Char 5 U+10FFFD (4 bytes) Offset 16 
+ Char 6 U+FFFE (3 bytes) Offset 20 + End of string at char 7, offset 23 */ - protected $seekString = "7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE"; + protected $seekString = "EFBBBF 7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE"; protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]; - protected $seekOffsets = [0, 1, 3, 6, 10, 13, 17, 20]; + protected $seekOffsets = [3, 4, 6, 9, 13, 16, 20, 23]; /* This string contains an invalid character sequence sandwiched between two null characters */ protected $brokenChar = "00 FF 00"; /* This string conatins the ASCII characters "A" and "Z" followed by two arbitrary non-ASCII characters, followed by the two ASCII characters "0" and "9" */ diff --git a/tests/cases/TestEncoding.php b/tests/cases/TestEncoding.php index 218a60d..5494eba 100644 --- a/tests/cases/TestEncoding.php +++ b/tests/cases/TestEncoding.php @@ -8,6 +8,9 @@ namespace MensBeam\Intl\TestCase; use MensBeam\Intl\Encoding; use MensBeam\Intl\Encoding\Encoder; +use MensBeam\Intl\Encoding\UTF16BE; +use MensBeam\Intl\Encoding\UTF16LE; +use MensBeam\Intl\Encoding\UTF8; class TestEncoding extends \PHPUnit\Framework\TestCase { /** @dataProvider provideLabelData */ @@ -28,6 +31,11 @@ class TestEncoding extends \PHPUnit\Framework\TestCase { $this->assertInstanceOf($data['class'], Encoding::createDecoder(" $label\n\n\r\t", "")); } + /** @dataProvider provideBOMSniffings */ + public function testCreateADecoderWhileSniffingBOM(string $label, string $string, string $class) { + $this->assertInstanceOf($class, Encoding::createDecoder($label, $string)); + } + public function testFailToCreateADecoderFromALabel() { $this->assertNull(Encoding::createDecoder("Not a label", "")); } @@ -71,4 +79,14 @@ class TestEncoding extends \PHPUnit\Framework\TestCase { yield [(string) $label, ['label' => (string) $label, 'name' => $name, 'class' => $class, 'encoder' => $encoder]]; } } + + public function provideBOMSniffings() { + return [ + 'No BOM' => ["UTF-8", 
"Hello world!", UTF8::class], + 'UTF-8 BOM' => ["Shift_JIS", "\xEF\xBB\xBFA", UTF8::class], + 'UTF-16BE BOM' => ["UTF-8", "\xFE\xFF\x00A", UTF16BE::class], + 'UTF-16LE BOM' => ["UTF-8", "\xFF\xFEA\x00", UTF16LE::class], + 'GB18030 BOM' => ["UTF-8", "\x84\x31\x95\x33A", UTF8::class], + ]; + } } diff --git a/tests/lib/DecoderTest.php b/tests/lib/DecoderTest.php index 8591b9b..d3b1fba 100644 --- a/tests/lib/DecoderTest.php +++ b/tests/lib/DecoderTest.php @@ -8,6 +8,9 @@ namespace MensBeam\Intl\Test; use MensBeam\Intl\Encoding\DecoderException; use MensBeam\Intl\Encoding\ISO2022JP; +use MensBeam\Intl\Encoding\UTF16BE; +use MensBeam\Intl\Encoding\UTF16LE; +use MensBeam\Intl\Encoding\UTF8; abstract class DecoderTest extends \PHPUnit\Framework\TestCase { protected $random = "L51yGwEFuatjbZi7wgNC80qYncvauVm1Lh8vCSK/KJs6QxoynMU8TCamx5TNhbjeh5VpWqQ0Q1j/W6u4O/InxBDxk8g83azJFQHzU+L7Npk0bkdofFv2AHDI2SUlXotYeEOnkKa/c6eQiDk8NapS0LGnb64ypKASacAMp6s2wSUU03l6iVVapHsNBgYs0cD++vnG8ckgbGsV3KkE3Lh601u6jviDyeRwbTxLZcUfSS2uIzrvvGWFfw6D4/FOa3uTR1k2Ya6jT+T/F+OdMgWlUPouuAVgLuvFxj9v9ZBnI+FAFc0kX4aT/JoTuBGMm8YS4xPVvczdrPXCUijML5TZrU201uFqeB9LDDWULp1Ai9d41fcD/8GBFrzlpXPIV+hsSJ4HvWswXdDeVKWgSMrQ78pf+zwvD66TA4FjMiEsLLpf9bb+mPiS2Aa3BP0JpjPwi0gdBu8QipLXNGFUUGW/15jGlj3eNynELRAtvyYZnoYIYShsN1TIU+buw8hHOp9iKsKT+fqPaEuuLLtlJ/cqhcxaZhbaWRB6vCQW9mO7f8whl7cpbBOO+NwDDCJZCsULh7rINF2omkexfOZzQSt/LC3yw+Pzqrf5Pmp5YgpMvoNgHcY1FkpsHc48IHMsJ+gex2zltIG51TQBAhy/fWF0KIqd+IPT+qngVGYIw/WuXj0LaK7XIVp33tc6fzuXNv+GUzYwpv4k9ry8R/DW8EX572FXFA49HHxbytSIJLD/+KpE2CE1WOr3ONwOXm6WduUBmFi4bwlRrCKnHqnFtLztVdLwMOauFa8N822XoAnWvHs+8R1DLHtgUyZas3ktp/qjMp5oVsb2PO+VpPFHIighHySgljrPl+sKaPULh7P/rAHXOuS9p9zTZKHrQ4nccl8SnYZlHKdioWo1NK5LRZB0PXYH8Ytu8aWVBmb4lAlpAFbSTqtOhydUJ/lyM29STG5mTV3rbG6tWMsUXBpaX4PrGCnhj40RVdz0BzsgvzLu4PNI+s3TJ6ZKV4hGS5on040xMDC2423DpKHPNa7mbl7J036dFt0JcYeGu07maGxssJnwLbebg5cm36Ecea7cTBWEGFMqiFjLoBEu0Y2CfF/GEbwqOf55/p1ewaZMrunFKd/Mj89qyYU5bp6mwmXSwj10psAA+qtXYm3XzRrLHKfCuiukyPEtvI+RdjbQDtMP1vF5qkmjlQLHXvEDpviJMaqvIPkjG
rZkvAej1JX5yka50z0od9LLz8TIernjLLoVZ+cWtpd3kchO6w+zTpIOups4HdD66zaiPJrXIrJwi5bIgwTOWLhVs3ufZ0loFjlWWUh5FlTW+oWl1AD4h/yPBHWglqfMaTTqH75B4XEriy+Bw9k="; @@ -72,16 +75,22 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase { $input = $this->prepString($this->seekString); $off = $this->seekOffsets; $s = new $class($input); + $bom = [ + UTF8::class => 3, + UTF16BE::class => 2, + UTF16LE::class => 2, + ][$this->testedClass] ?? 0; + $this->assertSame(0, $s->posChar()); - $this->assertSame(0, $s->posByte()); + $this->assertSame($bom, $s->posByte()); $this->assertSame(0, $s->seek(0)); $this->assertSame(0, $s->posChar()); - $this->assertSame(0, $s->posByte()); + $this->assertSame($bom, $s->posByte()); $this->assertSame(1, $s->seek(-1)); $this->assertSame(0, $s->posChar()); - $this->assertSame(0, $s->posByte()); + $this->assertSame($bom, $s->posByte()); $this->assertSame(0, $s->seek(1)); $this->assertSame(1, $s->posChar()); @@ -109,15 +118,15 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase { $this->assertSame(6, $s->seek(-10)); $this->assertSame(0, $s->posChar()); - $this->assertSame(0, $s->posByte()); + $this->assertSame($bom, $s->posByte()); $this->assertSame(0, $s->seek(5)); $this->assertSame(5, $s->posChar()); $this->assertSame($off[5], $s->posByte()); - $s->rewind(0); + $s->rewind(); $this->assertSame(0, $s->posChar()); - $this->assertSame(0, $s->posByte()); + $this->assertSame($bom, $s->posByte()); } public function testTraversePastTheEndOfAString() {