Implement seeking backward though a string

6 years ago · b871c4f2fd
2 changed files with 85 additions and 0 deletions
--- a/lib/UTF8String.php
+++ b/lib/UTF8String.php
@ -16,6 +16,18 @@ class UTF8String {
        $this->string = $string;
    }

+    public function posByte(): int {
+        return $this->posByte;
+    }
+
+    public function posChar(): int {
+        return $this->posChar;
+    }
+
+    /** Retrieve the next character in the string
+     * 
+     * The returned character may be a replacement character, or the empty string if the end of the string has already been reached
+     */
    public function nextChr(): string {
        // get the byte at the current position
        $b = @$this->string[$this->posByte];
@ -30,6 +42,10 @@ class UTF8String {
        }
    }

+    /** Decodes the next UTF-8 character from the string and returns its code point number
+     *
+     * If a character could not be decoded, null is returned; if the end of the string has already been reached, false is returned
+     */
    public function nextOrd() {
        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
        // though it differs from a slavish implementation because it operates on only a single
@ -86,4 +102,56 @@ class UTF8String {
        }
        return $point;
    }
+
+    /** Advance $distance characters through the string
+     *
+     * If $distance is negative, the operation will be performed in reverse
+     *
+     * If the end (or beginning) of the string was reached before the end of the operation, false is returned
+     */
+    public function seek(int $distance): bool {
+        if ($distance > 0) {
+            do {
+                // get the next code point; this automatically increments the character position
+                $p = $this->nextOrd();
+            } while (--$distance && $p !== false); // stop after we have skipped the desired number of characters, or reached EOF
+            return !$distance;
+        } elseif ($distance < 0) {
+            if (!$this->posByte) {
+                // if we're already at the start of the string, we can't go further back
+                return false;
+            }
+            $distance = abs($distance);
+            do {
+                $this->sync($this->posByte - 1);
+                // manually decrement the character position
+                $this->posChar--;
+            } while (--$distance && $this->posByte);
+            return !$distance;
+        } else {
+            return true;
+        }
+    }
+
+    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
+    protected function sync(int $pos) {
+        $b = ord(@$this->string[$pos]);
+        if ($b < 0x80) {
+            // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
+            $this->posByte = $pos;
+        } else {
+            $s = $pos;
+            while ($b >= 0x80 && $b <= 0xBF && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+                $b = ord(@$this->string[--$pos]);
+            }
+            $this->posByte = $pos;
+            // decrement the character position because nextOrd() increments it
+            $this->posChar--;
+            if (is_null($this->nextOrd())) {
+                $this->posByte = $s;
+            } else {
+                $this->posByte = $pos;
+            }
+        }
+    }
 }
--- a/tests/cases/TestCodec.php
+++ b/tests/cases/TestCodec.php
@ -38,6 +38,23 @@ class TestConf extends \PHPUnit\Framework\TestCase {
        }
        $this->assertEquals($exp, $out);
    }
+    
+    /** 
+     * @dataProvider provideStrings
+     * @covers \MensBeam\UTF8\UTF8String::seek
+     * @covers \MensBeam\UTF8\UTF8String::sync
+     * @covers \MensBeam\UTF8\UTF8String::posChar
+    */
+    public function testSTepBackThroughAString(string $input, array $points) {
+        $s = new UTF8String($input);
+        $a = 0;
+        while (($p1 = $s->nextOrd() ?? 0xFFFD) !== false) {
+            $this->assertTrue($s->seek(-1));
+            $p2 = $s->nextOrd() ?? 0xFFFD;
+            $this->assertSame($p1, $p2, "Mismatch at character position $a");
+            $this->assertSame(++$a, $s->posChar(), "Character position should be $a");
+        }
+    }

    public function provideStrings() {
        return [