J. King
6 years ago
10 changed files with 374 additions and 22 deletions
@ -0,0 +1,114 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
/**
 * Shared decoder for the UTF-16 encodings
 *
 * Concrete subclasses select the byte order by defining the boolean class
 * constant BE: a truthy value makes the first byte of each code unit the
 * high-order byte, a falsy value makes it the low-order byte.
 */
abstract class UTF16 implements Encoding {
    use GenericEncoding;

    /** The number of bytes (1 to 3) which made up a truncated final character; 0 until such an ending has been encountered */
    protected $dirtyEOF = 0;

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * Unpaired surrogates and characters cut short by the end of the string
     * are passed to err() along with the current error mode, and whatever
     * err() produces is returned
     *
     * @return int|bool
     */
    public function nextCode() {
        $lead_b = null;
        $lead_s = null;
        $this->posChar++;
        // reading past the end of the string yields the empty string, which terminates the loop
        while (($b = $this->string[$this->posByte++] ?? "") !== "") {
            $b = ord($b);
            if (is_null($lead_b)) {
                // first byte of a code unit: hold it until its partner arrives
                $lead_b = $b;
                continue;
            } else {
                // second byte of a code unit: assemble it in the subclass' byte order
                if (static::BE) {
                    $code = ($lead_b << 8) + $b;
                } else {
                    $code = ($b << 8) + $lead_b;
                }
                $lead_b = null;
                if (!is_null($lead_s)) {
                    if ($code >= 0xDC00 && $code <= 0xDFFF) {
                        // a complete surrogate pair
                        return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
                    } else {
                        // the lead surrogate is unpaired: un-read the code unit which followed it so it gets decoded afresh
                        $this->posByte -= 2;
                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
                    }
                } else {
                    if ($code >= 0xD800 && $code <= 0xDBFF) {
                        // lead surrogate: remember it and fetch the next code unit
                        $lead_s = $code;
                        continue;
                    } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
                        // trail surrogate with no preceding lead
                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
                    } else {
                        return $code;
                    }
                }
            }
        }
        // the read which ended the loop still advanced the byte position; undo that
        $this->posByte--;
        // test for null explicitly: a pending lead byte of 0x00 is a truncated character, not a clean ending
        if ($lead_b === null && $lead_s === null) {
            // clean EOF
            $this->posChar--;
            return false;
        } else {
            // dirty EOF; note how many bytes the last character had
            // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
            // a pending lead surrogate accounts for two bytes, a pending lead byte (of any value, including 0x00) for one
            $this->dirtyEOF = ($lead_s !== null ? 2 : 0) + ($lead_b !== null ? 1 : 0);
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string {
        // look at the byte at the current position without consuming it
        if (($this->string[$this->posByte] ?? "") === "") {
            // end of input: there is nothing to decode
            return "";
        }
        // otherwise return the serialization of the code point at the current position
        return UTF8::encode($this->nextCode());
    }

    /** Implements backward seeking $distance characters
     *
     * @param int $distance The number of characters to move back by
     * @return int The number of characters which could not be skipped because the start of the string was reached
     */
    protected function seekBack(int $distance): int {
        if ($this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
            // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
            $this->posByte -= $this->dirtyEOF;
            $distance--;
            $this->posChar--;
        }
        while ($distance > 0 && $this->posByte > 0) {
            $distance--;
            $this->posChar--;
            if ($this->posByte < 4) {
                // if we're less than four bytes into the string, the previous character is necessarily double-byte
                $this->posByte -= 2;
            } else {
                // otherwise go back four bytes and consume a character
                $start = $this->posByte;
                $this->posByte -= 4;
                // nextCode() increments the character position; compensate in advance
                $this->posChar--;
                $this->nextCode();
                if ($this->posByte == $start) {
                    // if we're back at our starting position the character was four bytes
                    $this->posByte -= 4;
                } else {
                    // otherwise we're already where we need to be
                }
            }
        }
        return $distance;
    }
}
@ -0,0 +1,13 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
/** Big-endian flavour of the UTF-16 decoder */
class UTF16BE extends UTF16 {
    /** The name reported for this encoding */
    const NAME = "UTF-16BE";
    /** The labels associated with this encoding */
    const LABELS = ["utf-16be"];
    /** Byte-order switch read by the parent class: the first byte of a code unit is the high-order byte */
    const BE = true;
}
@ -0,0 +1,13 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
/** Little-endian flavour of the UTF-16 decoder */
class UTF16LE extends UTF16 {
    /** The name reported for this encoding */
    const NAME = "UTF-16LE";
    /** The labels associated with this encoding; the bare "utf-16" label is listed here */
    const LABELS = ["utf-16", "utf-16le"];
    /** Byte-order switch read by the parent class: the first byte of a code unit is the low-order byte */
    const BE = false;
}
@ -0,0 +1,44 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
|
||||
|
/** Runs the UTF-16LE test suite against the big-endian decoder, with byte-swapped fixtures */
class TestUTF16BE extends TestUTF16LE {
    protected $testedClass = UTF16BE::class;
    /*
        Layout of the seek string below:

        Char 0  U+007A   (2 bytes)  Offset 0
        Char 1  U+00A2   (2 bytes)  Offset 2
        Char 2  U+6C34   (2 bytes)  Offset 4
        Char 3  U+1D11E  (4 bytes)  Offset 6
        Char 4  U+F8FF   (2 bytes)  Offset 10
        Char 5  U+10FFFD (4 bytes)  Offset 12
        Char 6  U+FFFE   (2 bytes)  Offset 16
        End of string at char 7, offset 18
    */
    protected $seekString = "007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];
    /* An invalid character sequence (a lone trail surrogate) with a null character on either side */
    protected $brokenChar = "0000 DC00 0000";
    protected $lowerA = "\x00a";

    /** Yields the parent's little-endian samples with each complete code unit converted to big-endian
     *
     * Only words of exactly four hexadecimal digits have their two bytes
     * exchanged; shorter words (truncated code units) pass through untouched
     */
    public function provideStrings() {
        foreach (parent::provideStrings() as $label => $case) {
            $swapped = [];
            foreach (explode(" ", $case[0]) as $unit) {
                if (strlen($unit) == 4) {
                    // exchange the two hex-encoded bytes of the code unit
                    $unit = substr($unit, 2, 2).substr($unit, 0, 2);
                }
                $swapped[] = $unit;
            }
            yield $label => [implode(" ", $swapped), $case[1]];
        }
    }
}
@ -0,0 +1,151 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\TestCase\Encoding; |
||||
|
|
||||
|
use MensBeam\Intl\Encoding\UTF16LE; |
||||
|
use MensBeam\Intl\Encoding\UTF16BE; |
||||
|
|
||||
|
/** Test suite for the little-endian UTF-16 decoder; every test delegates to the generic encoding test */
class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest {
    protected $testedClass = UTF16LE::class;
    /*
        Layout of the seek string below:

        Char 0  U+007A   (2 bytes)  Offset 0
        Char 1  U+00A2   (2 bytes)  Offset 2
        Char 2  U+6C34   (2 bytes)  Offset 4
        Char 3  U+1D11E  (4 bytes)  Offset 6
        Char 4  U+F8FF   (2 bytes)  Offset 10
        Char 5  U+10FFFD (4 bytes)  Offset 12
        Char 6  U+FFFE   (2 bytes)  Offset 16
        End of string at char 7, offset 18
    */
    protected $seekString = "7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];
    /* An invalid character sequence (a lone trail surrogate) with a null character on either side */
    protected $brokenChar = "0000 00DC 0000";
    protected $lowerA = "a\x00";

    /**
     * Placeholder which always passes, since UTF-16 has no encoder
     *
     * @dataProvider provideCodePoints
     * @coversNothing
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        // UTF-16 has no encoder
        $this->assertTrue(true);
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::seekBack
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @covers MensBeam\Intl\Encoding\UTF16::seek
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekChar
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekCode
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::len
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @covers MensBeam\Intl\Encoding\UTF16::err
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * Delegates to the parent test of the same name
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     * @covers MensBeam\Intl\Encoding\UTF16::chars
     * @covers MensBeam\Intl\Encoding\UTF16::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /** Supplies a single dummy data set, since UTF-16 has no encoder */
    public function provideCodePoints() {
        // UTF-16 has no encoder
        $dummy = [true, 0, ""];
        return [$dummy];
    }

    /** Supplies named pairs of space-separated hexadecimal input and the list of code points expected from it
     *
     * Expected code points are written in hexadecimal; 0xFFFD is the value
     * expected wherever the input is invalid or truncated
     */
    public function provideStrings() {
        $tests = [];
        // control samples
        $tests['empty string'] = ["", []];
        $tests['sanity check'] = ["6100 6200 6300 3100 3200 3300", [0x61, 0x62, 0x63, 0x31, 0x32, 0x33]];
        $tests['mixed sample'] = ["7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", [0x7A, 0xA2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE]];
        // unexpected EOF
        $tests['EOF in BMP character'] = ["FF", [0xFFFD]];
        $tests['EOF after lead surrogate'] = ["34D8", [0xFFFD]];
        $tests['EOF in trail surrogate'] = ["34D8 1E", [0xFFFD]];
        // invalid UTF-16 surrogates
        $tests['lead surrogate without trail'] = ["34D8 0000", [0xFFFD, 0]];
        $tests['trail surrogate without lead'] = ["1EDD 0000", [0xFFFD, 0]];
        $tests['double lead surrogate'] = ["34D8 34D8 1EDD", [0xFFFD, 0x1D11E]];
        $tests['double trail surrogate'] = ["34D8 1EDD 1EDD", [0x1D11E, 0xFFFD]];
        return $tests;
    }
}
Loading…
Reference in new issue