Browse Source

Implement UTF-16

span
J. King 6 years ago
parent
commit
dd9bed2e84
  1. 114
      lib/Encoding/UTF16.php
  2. 13
      lib/Encoding/UTF16BE.php
  3. 13
      lib/Encoding/UTF16LE.php
  4. 6
      tests/cases/Encoding/TestGB18030.php
  5. 26
      tests/cases/Encoding/TestSingleByte.php
  6. 44
      tests/cases/Encoding/TestUTF16BE.php
  7. 151
      tests/cases/Encoding/TestUTF16LE.php
  8. 8
      tests/cases/Encoding/TestUTF8.php
  9. 19
      tests/lib/EncodingTest.php
  10. 2
      tests/phpunit.xml

114
lib/Encoding/UTF16.php

@ -0,0 +1,114 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/**
 * Decoder logic shared by the UTF-16 encodings
 *
 * Concrete subclasses supply the BE constant, which selects whether the two
 * bytes of each 16-bit code unit are read most-significant byte first (true)
 * or least-significant byte first (false).
 */
abstract class UTF16 implements Encoding {
    use GenericEncoding;

    /**
     * Size in bytes (1, 2, or 3) of the truncated character found at the end of the string, or 0 if no truncated ending has been encountered
     *
     * @var int
     */
    protected $dirtyEOF = 0;

    /** Decodes the next character from the string and returns its code point number
     *
     * If the end of the string has been reached, false is returned
     *
     * Unpaired surrogates, and characters cut short by the end of the string, are reported through err() and whatever err() returns is passed back to the caller
     *
     * @return int|bool
     */
    public function nextCode() {
        // first byte of the code unit currently being assembled
        $lead_b = null;
        // lead surrogate still waiting for its trail surrogate
        $lead_s = null;
        $this->posChar++;
        // reading past the end of the string yields an empty string; the accompanying diagnostic is suppressed
        while (($b = @$this->string[$this->posByte++]) !== "") {
            $b = ord($b);
            if (is_null($lead_b)) {
                // a code unit needs two bytes; remember the first and fetch the second
                $lead_b = $b;
                continue;
            }
            // combine the two bytes according to the subclass's byte order
            $code = static::BE ? (($lead_b << 8) + $b) : (($b << 8) + $lead_b);
            $lead_b = null;
            if (!is_null($lead_s)) {
                if ($code >= 0xDC00 && $code <= 0xDFFF) {
                    // valid surrogate pair
                    return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
                }
                // the lead surrogate is unpaired: un-read the code unit which followed it so that it is decoded on the next call
                $this->posByte -= 2;
                return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
            } elseif ($code >= 0xD800 && $code <= 0xDBFF) {
                // lead surrogate; a trail surrogate must follow
                $lead_s = $code;
                continue;
            } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
                // trail surrogate with no preceding lead surrogate
                return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
            } else {
                return $code;
            }
        }
        // the failed read above still advanced the byte position; undo that
        $this->posByte--;
        // the pending values must be tested against null explicitly: a pending byte of 0x00 is falsy and adds nothing to a sum,
        // so loose checks would mistake a truncated character ending in a null byte for a clean end of input
        if ($lead_b === null && $lead_s === null) {
            // clean EOF
            $this->posChar--;
            return false;
        } else {
            // dirty EOF; note how many bytes the last character had
            // properly synchronizing UTF-16 is possible without retaining this information, but retaining it makes the task easier
            $this->dirtyEOF = ($lead_s !== null ? 2 : 0) + ($lead_b !== null ? 1 : 0);
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = @$this->string[$this->posByte];
        if ($b === "") {
            // if the byte is end of input, simply return it
            return "";
        } else {
            // otherwise return the serialization of the code point at the current position
            return UTF8::encode($this->nextCode());
        }
    }

    /** Implements backward seeking $distance characters
     *
     * @param int $distance The number of characters to move back by
     * @return int The number of characters which could not be skipped because the start of the string was reached
     */
    protected function seekBack(int $distance): int {
        if ($this->posByte >= $this->lenByte && $this->dirtyEOF > 0) {
            // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
            $this->posByte -= $this->dirtyEOF;
            $distance--;
            $this->posChar--;
        }
        while ($distance > 0 && $this->posByte > 0) {
            $distance--;
            $this->posChar--;
            if ($this->posByte < 4) {
                // if we're less than four bytes into the string, the previous character is necessarily double-byte
                $this->posByte -= 2;
            } else {
                // otherwise go back four bytes and consume a character
                // NOTE(review): nextCode() runs with the current error mode here; confirm that stepping back over invalid input behaves acceptably in every mode
                $start = $this->posByte;
                $this->posByte -= 4;
                // nextCode() will increment the character position again, so compensate beforehand
                $this->posChar--;
                $this->nextCode();
                if ($this->posByte == $start) {
                    // if we're back at our starting position the character was four bytes
                    $this->posByte -= 4;
                } else {
                    // otherwise we're already where we need to be
                }
            }
        }
        return $distance;
    }
}

13
lib/Encoding/UTF16BE.php

@ -0,0 +1,13 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/**
 * UTF-16 decoder for big-endian input
 *
 * All decoding behaviour is inherited from UTF16; this class only fixes the
 * byte order and the identifying metadata.
 */
class UTF16BE extends UTF16 {
    /** Name of the encoding */
    const NAME = "UTF-16BE";
    /** Labels by which this encoding is identified */
    const LABELS = ["utf-16be"];
    /** Byte-order switch read by the parent decoder: true means the high byte of each code unit comes first */
    const BE = true;
}

13
lib/Encoding/UTF16LE.php

@ -0,0 +1,13 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/**
 * UTF-16 decoder for little-endian input
 *
 * All decoding behaviour is inherited from UTF16; this class only fixes the
 * byte order and the identifying metadata. The bare "utf-16" label is
 * assigned to this class.
 */
class UTF16LE extends UTF16 {
    /** Name of the encoding */
    const NAME = "UTF-16LE";
    /** Labels by which this encoding is identified */
    const LABELS = ["utf-16", "utf-16le"];
    /** Byte-order switch read by the parent decoder: false means the low byte of each code unit comes first */
    const BE = false;
}

6
tests/cases/Encoding/TestGB18030.php

@ -9,7 +9,6 @@ namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\GBK;
use MensBeam\Intl\Encoding\GB18030;
use MensBeam\Intl\Encoding\EncoderException;
use MensBeam\Intl\Encoding\DecoderException;
class TestGB18030 extends \MensBeam\Intl\Test\EncodingTest {
protected $testedClass = GB18030::class;
@ -26,8 +25,9 @@ class TestGB18030 extends \MensBeam\Intl\Test\EncodingTest {
protected $seekString = "7A 81 30 84 34 CB AE 94 32 BE 34 84 30 81 30 E3 32 9A 33 84 31 A4 38";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 1, 5, 7, 11, 15, 19, 23];
/* This string contains a single invalid character sequence */
protected $brokenChar = "FF";
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
protected $lowerA = "a";
public function tearDown() {
$this->testedClass = GB18030::class;

26
tests/cases/Encoding/TestSingleByte.php

@ -8,7 +8,6 @@ namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\SingleByteEncoding;
use MensBeam\Intl\Encoding\EncoderException;
use MensBeam\Intl\Encoding\DecoderException;
class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
// maps taken from https://github.com/web-platform-tests/wpt/blob/d6c29bef8d4bcdfe4f689defca73360b07647d71/encoding/single-byte-decoder.html
@ -79,8 +78,9 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
protected $seekString = "30 31 32 33 34 35 36";
protected $seekCodes = [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36];
protected $seekOffsets = [0, 1, 2, 3, 4, 5, 6, 7];
/* This string is supposed to contain a single invalid character sequence; this is different for each single-byte encoding (and many do not have invalid characters) */
/* This string is supposed to contain an invalid character sequence sandwiched between two null characters; this is different for each single-byte encoding (and many do not have invalid characters) */
protected $brokenChar = "";
protected $lowerA = "a";
/**
* @dataProvider provideCodePoints
@ -182,17 +182,17 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
}
/**
* @dataProvider provideStrings
* @dataProvider provideBrokenStrings
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::err
*/
public function testReplacementModes(string $input = "", array $points = [], string $class = SingleByteEncoding::class) {
if (($bump = array_search(0xFFFD, $points, true)) === false) {
public function testReplacementModes(string $input = "", string $class = SingleByteEncoding::class) {
if (!$input) {
// if the encoding uses all 128 high byte values, this test is non-operative
$this->assertTrue(true);
return;
}
$this->testedClass = $class;
$this->brokenChar = bin2hex(chr($bump));
$this->brokenChar = $input;
return parent::testReplacementModes();
}
@ -262,6 +262,20 @@ class TestSingleByte extends \MensBeam\Intl\Test\EncodingTest {
}
}
/**
 * Derives, for every data set of provideStrings(), an input holding one undecodable byte between two null bytes
 *
 * The undecodable byte is the first position whose expected code point is
 * U+FFFD. Data sets without such a position yield an empty input string.
 * Each yielded set is [input, class], keyed by the original data set name.
 */
public function provideBrokenStrings() {
    foreach ($this->provideStrings() as $name => $test) {
        $class = $test[2];
        // strict search, so that a match at index 0 is not confused with "not found"
        $bump = array_search(0xFFFD, $test[1], true);
        if ($bump !== false) {
            $raw = chr($bump);
            $byte = strtoupper(bin2hex($raw));
            $input = "00 $byte 00";
        } else {
            // if the encoding uses all 128 high byte values, this test is non-operative
            $input = "";
        }
        yield $name => [$input, $class];
    }
}
/**
* @dataProvider provideInvalids
* @covers MensBeam\Intl\Encoding\SingleByteEncoding::encode

44
tests/cases/Encoding/TestUTF16BE.php

@ -0,0 +1,44 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
/**
 * Runs the UTF-16 test cases against the big-endian decoder
 *
 * Test methods are inherited from TestUTF16LE; only the fixtures differ,
 * each one being the byte-swapped counterpart of the little-endian fixture.
 */
class TestUTF16BE extends TestUTF16LE {
    protected $testedClass = UTF16BE::class;

    /*
        Layout of the seek string:

        Char 0  U+007A   (2 bytes)  Offset 0
        Char 1  U+00A2   (2 bytes)  Offset 2
        Char 2  U+6C34   (2 bytes)  Offset 4
        Char 3  U+1D11E  (4 bytes)  Offset 6
        Char 4  U+F8FF   (2 bytes)  Offset 10
        Char 5  U+10FFFD (4 bytes)  Offset 12
        Char 6  U+FFFE   (2 bytes)  Offset 16
        End of string at char 7, offset 18
    */
    protected $seekString = "007A 00A2 6C34 D834DD1E F8FF DBFFDFFD FFFE";
    protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
    protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];

    /* An invalid sequence (a lone trail surrogate) placed between two null characters */
    protected $brokenChar = "0000 DC00 0000";

    /* The letter "a" as big-endian UTF-16 */
    protected $lowerA = "\x00a";

    /**
     * Supplies the parent's data sets with the two bytes of every complete code unit exchanged
     *
     * Words which are not exactly four hexadecimal digits long (truncated
     * code units) are passed through untouched.
     */
    public function provideStrings() {
        foreach (parent::provideStrings() as $name => $test) {
            $swapped = [];
            foreach (explode(" ", $test[0]) as $unit) {
                if (strlen($unit) == 4) {
                    // move the second hex pair in front of the first
                    $unit = substr($unit, 2, 2).substr($unit, 0, 2);
                }
                $swapped[] = $unit;
            }
            yield $name => [implode(" ", $swapped), $test[1]];
        }
    }
}

151
tests/cases/Encoding/TestUTF16LE.php

@ -0,0 +1,151 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF16LE;
use MensBeam\Intl\Encoding\UTF16BE;
/**
* Test cases for the little-endian UTF-16 decoder
*
* Most methods below delegate straight to the parent test class; they are
* redeclared here so that data providers and coverage annotations naming the
* UTF16 class can be attached to them.
*/
class TestUTF16LE extends \MensBeam\Intl\Test\EncodingTest {
// the class instantiated by the inherited tests
protected $testedClass = UTF16LE::class;
/*
Char 0 U+007A (2 bytes) Offset 0
Char 1 U+00A2 (2 bytes) Offset 2
Char 2 U+6C34 (2 bytes) Offset 4
Char 3 U+1D11E (4 bytes) Offset 6
Char 4 U+F8FF (2 bytes) Offset 10
Char 5 U+10FFFD (4 bytes) Offset 12
Char 6 U+FFFE (2 bytes) Offset 16
End of string at char 7, offset 18
*/
protected $seekString = "7A00 A200 346C 34D81EDD FFF8 FFDBFDDF FEFF";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 2, 4, 6, 10, 12, 16, 18];
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "0000 00DC 0000";
/* The letter "a" as little-endian UTF-16 */
protected $lowerA = "a\x00";
/**
* Placeholder for the encoder test, which always passes
*
* @dataProvider provideCodePoints
* @coversNothing
*/
public function testEncodeCodePoints(bool $fatal, $input, $exp) {
// UTF-16 has no encoder
$this->assertTrue(true);
}
/**
* Delegates to the parent test using this class's string samples
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::__construct
* @covers MensBeam\Intl\Encoding\UTF16::nextCode
*/
public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) {
return parent::testDecodeMultipleCharactersAsCodePoints($input, $exp);
}
/**
* Delegates to the parent test using this class's string samples
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::__construct
* @covers MensBeam\Intl\Encoding\UTF16::nextChar
*/
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
return parent::testDecodeMultipleCharactersAsStrings($input, $exp);
}
/**
* Delegates to the parent test using this class's string samples
*
* The unusual capitalization of the method name matches the parent method being overridden.
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::seekBack
*/
public function testSTepBackThroughAString(string $input, array $exp) {
return parent::testSTepBackThroughAString($input, $exp);
}
/**
* Delegates to the parent test
*
* @covers MensBeam\Intl\Encoding\UTF16::seek
* @covers MensBeam\Intl\Encoding\UTF16::posChar
* @covers MensBeam\Intl\Encoding\UTF16::posByte
* @covers MensBeam\Intl\Encoding\UTF16::rewind
*/
public function testSeekThroughAString() {
return parent::testSeekThroughAString();
}
/**
* Delegates to the parent test
*
* @covers MensBeam\Intl\Encoding\UTF16::posChar
* @covers MensBeam\Intl\Encoding\UTF16::posByte
*/
public function testTraversePastTheEndOfAString() {
return parent::testTraversePastTheEndOfAString();
}
/**
* Delegates to the parent test
*
* @covers MensBeam\Intl\Encoding\UTF16::peekChar
* @covers MensBeam\Intl\Encoding\UTF16::stateSave
* @covers MensBeam\Intl\Encoding\UTF16::stateApply
*/
public function testPeekAtCharacters() {
return parent::testPeekAtCharacters();
}
/**
* Delegates to the parent test
*
* @covers MensBeam\Intl\Encoding\UTF16::peekCode
* @covers MensBeam\Intl\Encoding\UTF16::stateSave
* @covers MensBeam\Intl\Encoding\UTF16::stateApply
*/
public function testPeekAtCodePoints() {
return parent::testPeekAtCodePoints();
}
/**
* Delegates to the parent test using this class's string samples
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::len
* @covers MensBeam\Intl\Encoding\UTF16::stateSave
* @covers MensBeam\Intl\Encoding\UTF16::stateApply
*/
public function testGetStringLength(string $input, array $points) {
return parent::testGetStringLength($input, $points);
}
/**
* Delegates to the parent test
*
* @covers MensBeam\Intl\Encoding\UTF16::err
*/
public function testReplacementModes() {
return parent::testReplacementModes();
}
/**
* Delegates to the parent test using this class's string samples
*
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF16::rewind
* @covers MensBeam\Intl\Encoding\UTF16::chars
* @covers MensBeam\Intl\Encoding\UTF16::codes
*/
public function testIterateThroughAString(string $input, array $exp) {
return parent::testIterateThroughAString($input, $exp);
}
/**
* Supplies a single dummy data set so that the placeholder encoder test can run
*/
public function provideCodePoints() {
// UTF-16 has no encoder
return [[true, 0, ""]];
}
/**
* Supplies decoder samples as [input, expected code points]
*
* Inputs are written as space-separated hexadecimal in little-endian byte
* order; 65533 (U+FFFD) is expected wherever the input is invalid.
* NOTE(review): the hexadecimal is presumably converted to bytes by the parent class before decoding - confirm
*/
public function provideStrings() {
return [
// control samples
'empty string' => ["", []],
'sanity check' => ["6100 6200 6300 3100 3200 3300", [97, 98, 99, 49, 50, 51]],
'mixed sample' => ["7A00 A200 346C 34D8 1EDD FFF8 FFDB FDDF FEFF", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
// unexpected EOF
'EOF in BMP character' => ["FF", [65533]],
'EOF after lead surrogate' => ["34D8", [65533]],
'EOF in trail surrogate' => ["34D8 1E", [65533]],
// invalid UTF-16 surrogates
'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
];
}
}

8
tests/cases/Encoding/TestUTF8.php

@ -8,7 +8,6 @@ namespace MensBeam\Intl\TestCase\Encoding;
use MensBeam\Intl\Encoding\UTF8;
use MensBeam\Intl\Encoding\EncoderException;
use MensBeam\Intl\Encoding\DecoderException;
class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest {
protected $testedClass = UTF8::class;
@ -22,11 +21,12 @@ class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest {
Char 6 U+FFFE (3 bytes) Offset 17
End of string at char 7, offset 20
*/
protected $seekString = "7A C2 A2 E6 B0 B4 F0 9D 84 9E EF A3 BF F4 8F BF BD EF BF BE";
protected $seekString = "7A C2A2 E6B0B4 F09D849E EFA3BF F48FBFBD EFBFBE";
protected $seekCodes = [0x007A, 0x00A2, 0x6C34, 0x1D11E, 0xF8FF, 0x10FFFD, 0xFFFE];
protected $seekOffsets = [0, 1, 3, 6, 10, 13, 17, 20];
/* This string contains a single invalid charactersequence */
protected $brokenChar = "FF";
/* This string contains an invalid character sequence sandwiched between two null characters */
protected $brokenChar = "00 FF 00";
protected $lowerA = "a";
/**
* @dataProvider provideCodePoints

19
tests/lib/EncodingTest.php

@ -34,7 +34,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
$out[] = $p;
}
$this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
$this->assertSame(strlen($input), $s->posByte());
}
public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) {
@ -49,7 +49,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
$out[] = $p;
}
$this->assertSame($exp, $out);
$this->assertSame($s->posByte(), strlen($input));
$this->assertSame(strlen($input), $s->posByte());
}
public function testSTepBackThroughAString(string $input, array $exp) {
@ -118,29 +118,30 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
public function testTraversePastTheEndOfAString() {
$class = $this->testedClass;
$s = new $class("a");
$s = new $class($this->lowerA);
$l = strlen($this->lowerA);
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame("a", $s->nextChar());
$this->assertSame(1, $s->posChar());
$this->assertSame(1, $s->posByte());
$this->assertSame($l, $s->posByte());
$this->assertSame("", $s->nextChar());
$this->assertSame(1, $s->posChar());
$this->assertSame(1, $s->posByte());
$this->assertSame($l, $s->posByte());
$s = new $class("a");
$s = new $class($this->lowerA);
$this->assertSame(0, $s->posChar());
$this->assertSame(0, $s->posByte());
$this->assertSame(ord("a"), $s->nextCode());
$this->assertSame(1, $s->posChar());
$this->assertSame(1, $s->posByte());
$this->assertSame($l, $s->posByte());
$this->assertSame(false, $s->nextCode());
$this->assertSame(1, $s->posChar());
$this->assertSame(1, $s->posByte());
$this->assertSame($l, $s->posByte());
}
public function testPeekAtCharacters() {
@ -220,7 +221,7 @@ abstract class EncodingTest extends \PHPUnit\Framework\TestCase {
public function testReplacementModes() {
$class = $this->testedClass;
$input = $this->prepString("00".$this->brokenChar."00");
$input = $this->prepString($this->brokenChar);
// officially test replacement characters (already effectively tested by other tests)
$s = new $class($input, false);
$s->seek(1);

2
tests/phpunit.xml

@ -19,6 +19,8 @@
<testsuites>
<testsuite name="Encoding">
<file>cases/Encoding/TestUTF8.php</file>
<file>cases/Encoding/TestUTF16LE.php</file>
<file>cases/Encoding/TestUTF16BE.php</file>
<file>cases/Encoding/TestSingleByte.php</file>
<file>cases/Encoding/TestGB18030.php</file>
</testsuite>

Loading…
Cancel
Save