diff --git a/lib/UTF8.php b/lib/UTF8.php index 7998533..a7c6599 100644 --- a/lib/UTF8.php +++ b/lib/UTF8.php @@ -10,6 +10,7 @@ class UTF8 { protected $string; protected $posByte = 0; protected $posChar = 0; + protected $length = null; public function __construct(string $string) { @@ -165,6 +166,22 @@ class UTF8 { return $out; } + /** Calculates the length of the string in code points, caching the result in $this->length for later calls + * + * Note that the first call involves processing to the end of the string with nextChr(); posChar and posByte are saved beforehand and restored afterwards + */ + public function len(): int { + return $this->length ?? (function() { + $pC = $this->posChar; + $pB = $this->posByte; + while ($this->nextChr() !== ""); /* empty body: advancing until nextChr() returns "" is the only goal */ + $this->length = $this->posChar; + $this->posChar = $pC; + $this->posByte = $pB; + return $this->length; + })(); + } + /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */ protected function sync(int $pos) { $b = ord(@$this->string[$pos]); diff --git a/tests/cases/TestInstance.php b/tests/cases/TestInstance.php index 2b55ff4..3c02c34 100644 --- a/tests/cases/TestInstance.php +++ b/tests/cases/TestInstance.php @@ -17,6 +17,7 @@ class TestInstance extends \PHPUnit\Framework\TestCase { */ public function testDecodeMultipleCharactersAsCodePoints(string $input, array $exp) { $s = new UTF8($input); + $out = []; while (($p = $s->nextOrd()) !== false) { $out[] = $p ?? 0xFFFD; } @@ -29,6 +30,7 @@ class TestInstance extends \PHPUnit\Framework\TestCase { * @covers \MensBeam\UTF8\UTF8::nextChr */ public function testDecodeMultipleCharactersAsStrings(string $input, array $exp) { + $out = []; $exp = array_map(function ($v) { return \IntlChar::chr($v); }, $exp); @@ -46,6 +48,7 @@ class TestInstance extends \PHPUnit\Framework\TestCase { public function testSTepBackThroughAString(string $input, array $points) { $s = new UTF8($input); $a = 0; + $this->assertTrue(true); // always record one assertion: with the empty string data set the loop body may never run, and PHPUnit reports assertion-less tests as risky while (($p1 = $s->nextOrd() ?? 0xFFFD) !== false) { $this->assertSame(0, $s->seek(-1)); $p2 = $s->nextOrd() ?? 
0xFFFD; @@ -219,10 +222,26 @@ class TestInstance extends \PHPUnit\Framework\TestCase { $this->assertSame(5, $s->posChr()); $this->assertSame(13, $s->posByte()); } + + /** Checks that len() equals the number of expected code points and leaves posChr() and posByte() unchanged + * @dataProvider provideStrings + * @covers \MensBeam\UTF8\UTF8::len + */ + public function testGetStringLength(string $input, array $points) { + $s = new UTF8($input); + $s->seek(1); /* presumably steps off the start so the position check is made at a non-zero offset where the input allows; confirm against seek() */ + $posChar = $s->posChr(); + $posByte = $s->posByte(); + + $this->assertSame(sizeof($points), $s->len()); + $this->assertSame($posChar, $s->posChr()); + $this->assertSame($posByte, $s->posByte()); + } public function provideStrings() { return [ // control samples + 'empty string' => ["", []], 'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]], 'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]], 'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],