From b871c4f2fde9eedc016e6d907995f0d726022995 Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Sat, 21 Jul 2018 19:59:56 -0400
Subject: [PATCH] Implement seeking backward through a string

---
 lib/UTF8String.php        | 73 +++++++++++++++++++++++++++++++++++++++
 tests/cases/TestCodec.php | 17 +++++++++
 2 files changed, 90 insertions(+)

diff --git a/lib/UTF8String.php b/lib/UTF8String.php
index 27c6b92..1ec900e 100644
--- a/lib/UTF8String.php
+++ b/lib/UTF8String.php
@@ -16,6 +16,20 @@ class UTF8String {
         $this->string = $string;
     }
 
+    /** Returns the current byte offset into the string */
+    public function posByte(): int {
+        return $this->posByte;
+    }
+
+    /** Returns the current character position in the string */
+    public function posChar(): int {
+        return $this->posChar;
+    }
+
+    /** Retrieve the next character in the string
+     *
+     * The returned character may be a replacement character, or the empty string if the end of the string has already been reached
+     */
     public function nextChr(): string {
         // get the byte at the current position
         $b = @$this->string[$this->posByte];
@@ -30,6 +44,10 @@ class UTF8String {
         }
     }
 
+    /** Decodes the next UTF-8 character from the string and returns its code point number
+     *
+     * If a character could not be decoded, null is returned; if the end of the string has already been reached, false is returned
+     */
     public function nextOrd() {
         // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
         // though it differs from a slavish implementation because it operates on only a single
@@ -86,4 +104,59 @@ class UTF8String {
         }
         return $point;
     }
+
+    /** Advance $distance characters through the string
+     *
+     * If $distance is negative, the operation will be performed in reverse
+     *
+     * If the end (or beginning) of the string was reached before the end of the operation, false is returned; otherwise true is returned
+     */
+    public function seek(int $distance): bool {
+        if ($distance > 0) {
+            do {
+                // get the next code point; this automatically increments the character position
+                $p = $this->nextOrd();
+            } while (--$distance && $p !== false); // stop after we have skipped the desired number of characters, or reached EOF
+            // succeed only if every requested character was actually read; hitting EOF on the final read is still a failure
+            return (!$distance && $p !== false);
+        } elseif ($distance < 0) {
+            if (!$this->posByte) {
+                // if we're already at the start of the string, we can't go further back
+                return false;
+            }
+            $distance = abs($distance);
+            do {
+                $this->sync($this->posByte - 1);
+                // manually decrement the character position
+                $this->posChar--;
+            } while (--$distance && $this->posByte);
+            return !$distance;
+        } else {
+            return true;
+        }
+    }
+
+    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
+    protected function sync(int $pos) {
+        $b = ord(@$this->string[$pos]);
+        if ($b < 0x80) {
+            // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
+            $this->posByte = $pos;
+        } else {
+            $s = $pos;
+            // the $pos > 0 guard is required: a negative string offset reads from the end of the string in PHP 7.1 and later
+            while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+                $b = ord(@$this->string[--$pos]);
+            }
+            $this->posByte = $pos;
+            // decrement the character position because nextOrd() increments it
+            $this->posChar--;
+            // if decoding failed, or the decoded character ended at or before the original offset, the byte at the original offset stands alone
+            if (is_null($this->nextOrd()) || $this->posByte <= $s) {
+                $this->posByte = $s;
+            } else {
+                $this->posByte = $pos;
+            }
+        }
+    }
 }
diff --git a/tests/cases/TestCodec.php b/tests/cases/TestCodec.php
index a1bd38a..4795418 100644
--- a/tests/cases/TestCodec.php
+++ b/tests/cases/TestCodec.php
@@ -38,6 +38,23 @@ class TestConf extends \PHPUnit\Framework\TestCase {
         }
         $this->assertEquals($exp, $out);
     }
+
+    /**
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::seek
+     * @covers \MensBeam\UTF8\UTF8String::sync
+     * @covers \MensBeam\UTF8\UTF8String::posChar
+     */
+    public function testStepBackThroughAString(string $input, array $points) {
+        $s = new UTF8String($input);
+        $a = 0;
+        while (($p1 = $s->nextOrd() ?? 0xFFFD) !== false) {
+            $this->assertTrue($s->seek(-1));
+            $p2 = $s->nextOrd() ?? 0xFFFD;
+            $this->assertSame($p1, $p2, "Mismatch at character position $a");
+            $this->assertSame(++$a, $s->posChar(), "Character position should be $a");
+        }
+    }
 
     public function provideStrings() {
         return [