J. King
6 years ago
10 changed files with 374 additions and 22 deletions
@ -0,0 +1,114 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Shared decoder for the two UTF-16 byte orders
 *
 * Concrete subclasses select the byte order by defining the boolean
 * class constant BE (true for big-endian, false for little-endian).
 * String state ($string, $posByte, $posChar, $lenByte, $errMode) and the
 * err() helper are expected to come from the GenericEncoding trait.
 */
abstract class UTF16 implements Encoding {
    use GenericEncoding;

    /** @var int Size in bytes (1 to 3) of the truncated character found at the end of the string, or 0 if no truncated character has been read */
    protected $dirtyEOF = 0;

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * Invalid sequences (unpaired surrogates, or a character cut short by
     * the end of the string) yield whatever err() produces for the current
     * error mode
     *
     * @return int|bool
     */
    public function nextCode() {
        // first byte of the code unit currently being assembled
        $lead_b = null;
        // pending lead (high) surrogate awaiting its trail surrogate
        $lead_s = null;
        $this->posChar++;
        while (($b = @$this->string[$this->posByte++]) !== "") {
            $b = ord($b);
            if ($lead_b === null) {
                // only half a code unit so far; fetch the other half
                $lead_b = $b;
                continue;
            }
            // assemble the 16-bit code unit according to the byte order
            $code = static::BE ? ($lead_b << 8) + $b : ($b << 8) + $lead_b;
            $lead_b = null;
            if ($lead_s !== null) {
                if ($code >= 0xDC00 && $code <= 0xDFFF) {
                    // valid surrogate pair
                    return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
                }
                // the lead surrogate is unpaired: push back the code unit just read so it is decoded on its own next time
                $this->posByte -= 2;
                return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
            }
            if ($code >= 0xD800 && $code <= 0xDBFF) {
                // lead surrogate; a trail surrogate must follow
                $lead_s = $code;
                continue;
            }
            if ($code >= 0xDC00 && $code <= 0xDFFF) {
                // trail surrogate without a preceding lead surrogate
                return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
            }
            return $code;
        }
        // the failed read above still advanced the byte pointer; undo that
        $this->posByte--;
        // NOTE: the checks below must be null checks rather than truthiness checks,
        // because a dangling first byte may legitimately have the value 0x00
        if ($lead_b === null && $lead_s === null) {
            // clean EOF
            $this->posChar--;
            return false;
        }
        // dirty EOF; note how many bytes the last character had
        // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
        $this->dirtyEOF = ($lead_s !== null ? 2 : 0) + ($lead_b !== null ? 1 : 0);
        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string {
        // reading past the end of the string yields the empty string (the notice is suppressed)
        if (@$this->string[$this->posByte] === "") {
            // end of input: there is nothing to serialize
            return "";
        }
        // otherwise return the serialization of the code point at the current position
        return UTF8::encode($this->nextCode());
    }

    /** Implements backward seeking $distance characters
     *
     * @param int $distance The number of characters to step back
     * @return int The number of characters which could not be stepped over because the start of the string was reached
     */
    protected function seekBack(int $distance): int {
        if ($distance > 0 && $this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
            // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
            $this->posByte -= $this->dirtyEOF;
            $distance--;
            $this->posChar--;
        }
        while ($distance > 0 && $this->posByte > 0) {
            $distance--;
            $this->posChar--;
            if ($this->posByte < 4) {
                // if we're less than four bytes into the string, the previous character is necessarily double-byte
                $this->posByte -= 2;
                continue;
            }
            // otherwise go back four bytes and consume a character
            $start = $this->posByte;
            $this->posByte -= 4;
            // nextCode() increments the character position, so compensate beforehand
            $this->posChar--;
            $this->nextCode();
            if ($this->posByte == $start) {
                // if we're back at our starting position the character was four bytes
                $this->posByte -= 4;
            }
            // otherwise only two bytes were consumed and we're already where we need to be
        }
        return $distance;
    }
}
@ -0,0 +1,13 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Big-endian flavour of the UTF-16 decoder; all decoding logic is inherited from UTF16 */
class UTF16BE extends UTF16 {
    /** Canonical name of the encoding */
    const NAME = "UTF-16BE";

    /** Labels associated with the encoding (presumably used when looking an encoding up by label) */
    const LABELS = ["utf-16be"];

    /** Tells UTF16::nextCode() that the first byte of each code unit is the most significant one */
    const BE = true;
}
@ -0,0 +1,13 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\Encoding; |
|||
|
|||
/** Little-endian flavour of the UTF-16 decoder; all decoding logic is inherited from UTF16 */
class UTF16LE extends UTF16 {
    /** Canonical name of the encoding */
    const NAME = "UTF-16LE";

    /** Labels associated with the encoding (presumably used when looking an encoding up by label) */
    const LABELS = ["utf-16", "utf-16le"];

    /** Tells UTF16::nextCode() that the first byte of each code unit is the least significant one */
    const BE = false;
}
@ -0,0 +1,44 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\TestCase\Encoding; |
|||
|
|||
use MensBeam\Intl\Encoding\UTF16LE; |
|||
use MensBeam\Intl\Encoding\UTF16BE; |
|||
|
|||
/** Runs the UTF-16LE test suite against the big-endian decoder
 *
 * Fixtures written by hand are overridden below; the string samples of
 * the parent data provider are converted by swapping the bytes of every
 * code unit.
 */
class TestUTF16BE extends TestUTF16LE {
    protected $testedClass = UTF16BE::class;

    /*
        Layout of the seek fixture:

        Char 0  U+007A    (2 bytes)  Offset 0
        Char 1  U+00A2    (2 bytes)  Offset 2
        Char 2  U+6C34    (2 bytes)  Offset 4
        Char 3  U+1D11E   (4 bytes)  Offset 6
        Char 4  U+F8FF    (2 bytes)  Offset 10
        Char 5  U+10FFFD  (4 bytes)  Offset 12
        Char 6  U+FFFE    (2 bytes)  Offset 16
        End of string at char 7, offset 18
    */
    protected $seekString = "007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];

    /* An invalid sequence (a lone trail surrogate) placed between two null characters */
    protected $brokenChar = "0000 DC00 0000";

    /* Lower-case letter "a" as raw big-endian bytes */
    protected $lowerA = "\x00a";

    /** Yields the parent's samples with every complete code unit byte-swapped
     *
     * Only words of exactly four hexadecimal digits (one whole code unit)
     * are swapped; shorter words, which represent truncated input, are
     * passed through unchanged.
     */
    public function provideStrings() {
        foreach (parent::provideStrings() as $label => $sample) {
            $expected = $sample[1];
            $swapped = [];
            foreach (explode(" ", $sample[0]) as $unit) {
                // move the second hexadecimal byte in front of the first
                $swapped[] = (strlen($unit) == 4) ? substr($unit, 2, 2).substr($unit, 0, 2) : $unit;
            }
            yield $label => [implode(" ", $swapped), $expected];
        }
    }
}
@ -0,0 +1,151 @@ |
|||
<?php |
|||
/** @license MIT |
|||
* Copyright 2018 J. King et al. |
|||
* See LICENSE and AUTHORS files for details */ |
|||
|
|||
declare(strict_types=1); |
|||
namespace MensBeam\Intl\TestCase\Encoding; |
|||
|
|||
use MensBeam\Intl\Encoding\UTF16LE; |
|||
use MensBeam\Intl\Encoding\UTF16BE; |
|||
|
|||
/** Test suite for the little-endian UTF-16 decoder
 *
 * The test logic itself lives in the parent EncodingTest class; the
 * methods here only attach data providers and coverage annotations,
 * while the properties supply the UTF-16LE fixtures.
 */
class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest {
    protected $testedClass = UTF16LE::class;

    /*
        Layout of the seek fixture:

        Char 0  U+007A    (2 bytes)  Offset 0
        Char 1  U+00A2    (2 bytes)  Offset 2
        Char 2  U+6C34    (2 bytes)  Offset 4
        Char 3  U+1D11E   (4 bytes)  Offset 6
        Char 4  U+F8FF    (2 bytes)  Offset 10
        Char 5  U+10FFFD  (4 bytes)  Offset 12
        Char 6  U+FFFE    (2 bytes)  Offset 16
        End of string at char 7, offset 18
    */
    protected $seekString = "7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];

    /* An invalid sequence (a lone trail surrogate) placed between two null characters */
    protected $brokenChar = "0000 00DC 0000";

    /* Lower-case letter "a" as raw little-endian bytes */
    protected $lowerA = "a\x00";

    /**
     * Placeholder for the inherited encoder test: UTF-16 has no encoder
     *
     * @dataProvider provideCodePoints
     * @coversNothing
     */
    public function testEncodeCodePoints(bool $fatal, $input, $exp) {
        // nothing to exercise; assert trivially so the test is not reported as risky
        $this->assertTrue(true);
    }

    /**
     * Decodes each sample code point by code point
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
     */
    public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
    }

    /**
     * Decodes each sample character by character
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::__construct
     * @covers MensBeam\Intl\Encoding\UTF16::nextChar
     */
    public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
        return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
    }

    /**
     * Steps backwards through each sample
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::seekBack
     */
    public function testSTepBackThroughAString(string $input, array $exp) {
        return parent::testSTepBackThroughAString($input, $exp);
    }

    /**
     * Seeks through the seek fixture
     *
     * @covers MensBeam\Intl\Encoding\UTF16::seek
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     */
    public function testSeekThroughAString() {
        return parent::testSeekThroughAString();
    }

    /**
     * Reads beyond the end of a string
     *
     * @covers MensBeam\Intl\Encoding\UTF16::posChar
     * @covers MensBeam\Intl\Encoding\UTF16::posByte
     */
    public function testTraversePastTheEndOfAString() {
        return parent::testTraversePastTheEndOfAString();
    }

    /**
     * Peeks at upcoming characters
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekChar
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCharacters() {
        return parent::testPeekAtCharacters();
    }

    /**
     * Peeks at upcoming code points
     *
     * @covers MensBeam\Intl\Encoding\UTF16::peekCode
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testPeekAtCodePoints() {
        return parent::testPeekAtCodePoints();
    }

    /**
     * Measures the length of each sample
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::len
     * @covers MensBeam\Intl\Encoding\UTF16::stateSave
     * @covers MensBeam\Intl\Encoding\UTF16::stateApply
     */
    public function testGetStringLength(string $input, array $points) {
        return parent::testGetStringLength($input, $points);
    }

    /**
     * Exercises the error replacement modes
     *
     * @covers MensBeam\Intl\Encoding\UTF16::err
     */
    public function testReplacementModes() {
        return parent::testReplacementModes();
    }

    /**
     * Iterates over each sample
     *
     * @dataProvider provideStrings
     * @covers MensBeam\Intl\Encoding\UTF16::rewind
     * @covers MensBeam\Intl\Encoding\UTF16::chars
     * @covers MensBeam\Intl\Encoding\UTF16::codes
     */
    public function testIterateThroughAString(string $input, array $exp) {
        return parent::testIterateThroughAString($input, $exp);
    }

    /** Supplies a single dummy data set, since UTF-16 has no encoder */
    public function provideCodePoints() {
        $dummy = [true, 0, ""];
        return [$dummy];
    }

    /** Supplies named samples: a string of hexadecimal bytes paired with the code points it is expected to decode to */
    public function provideStrings() {
        // the value expected wherever the input is invalid (U+FFFD)
        $replacement = 65533;

        // control samples
        $control = [
            'empty string' => ["", []],
            'sanity check' => ["6100 6200 6300 3100 3200 3300", [97, 98, 99, 49, 50, 51]],
            'mixed sample' => ["7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
        ];

        // unexpected EOF
        $truncated = [
            'EOF in BMP character'     => ["FF", [$replacement]],
            'EOF after lead surrogate' => ["34D8", [$replacement]],
            'EOF in trail surrogate'   => ["34D8 1E", [$replacement]],
        ];

        // invalid UTF-16 surrogates
        $surrogates = [
            'lead surrogate without trail' => ["34D8 0000", [$replacement, 0]],
            'trail surrogate without lead' => ["1EDD 0000", [$replacement, 0]],
            'double lead surrogate'        => ["34D8 34D8 1EDD", [$replacement, 119070]],
            'double trail surrogate'       => ["34D8 1EDD 1EDD", [119070, $replacement]],
        ];

        // all keys are distinct strings, so merging keeps every entry in the order listed
        return array_merge($control, $truncated, $surrogates);
    }
}
Loading…
Reference in new issue