From 1449fae90827ae39d96170084ecd822886df7f05 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Wed, 29 Aug 2018 23:39:56 -0400 Subject: [PATCH] Refactor UTF-8 seeking --- lib/Encoding/UTF8.php | 75 +++++++++---------------------- tests/cases/Encoding/TestUTF8.php | 2 +- 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/lib/Encoding/UTF8.php b/lib/Encoding/UTF8.php index 839e84a..69d500c 100644 --- a/lib/Encoding/UTF8.php +++ b/lib/Encoding/UTF8.php @@ -102,62 +102,31 @@ class UTF8 implements StatelessEncoding { return $bytes; } - /** Advance $distance characters through the string - * - * If $distance is negative, the operation will be performed in reverse - * - * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned - */ - public function seek(int $distance): int { - if ($distance > 0) { - if ($this->posByte == strlen($this->string)) { - // if we're already at the end of the string, we can't go further - return $distance; - } - do { - // get the next code point; this automatically increments the character position - $p = $this->nextCode(); - } while (--$distance && $p !== false); // stop after we have skipped the desired number of characters, or reached EOF - return $distance; - } elseif ($distance < 0) { - $distance = abs($distance); - if (!$this->posByte) { - // if we're already at the start of the string, we can't go further back - return $distance; - } - $mode = $this->errMode; - $this->errMode = self::MODE_NULL; - do { - $this->sync($this->posByte - 1); - // manually decrement the character position - $this->posChar--; - } while (--$distance && $this->posByte); - $this->errMode = $mode; - return $distance; - } else { - return 0; - } - } - - /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */ - protected function sync(int $pos) { - $b = ord(@$this->string[$pos]); - if ($b < 0x80) { - // if the byte is an ASCII byte or the end of input, then this is already a synchronized position - $this->posByte = $pos; - } else { - $s = $pos; - while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte - $b = ord(@$this->string[--$pos]); - } - $this->posByte = $pos; - // decrement the character position because nextCode() increments it + /** Implements backward seeking $distance characters */ + protected function seekBack(int $distance): int { + while ($distance > 0 && $this->posByte > 0) { + $distance--; $this->posChar--; - if (is_null($this->nextCode())) { - $this->posByte = $s; + $pos = $this->posByte - 1; + $b = ord(@$this->string[$pos]); + if ($b < 0x80) { + // if the byte is an ASCII byte or the end of input, then this is already a synchronized position + $this->posByte = $pos; } else { - $this->posByte = ($this->posByte > $s) ? $pos : $s; + $s = $pos; + while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte + $b = ord(@$this->string[--$pos]); + } + $this->posByte = $pos; + // decrement the character position because nextCode() increments it + $this->posChar--; + if (is_null($this->nextCode())) { + $this->posByte = $s; + } else { + $this->posByte = ($this->posByte > $s) ? $pos : $s; + } } } + return $distance; } } diff --git a/tests/cases/Encoding/TestUTF8.php b/tests/cases/Encoding/TestUTF8.php index a026809..19c472f 100644 --- a/tests/cases/Encoding/TestUTF8.php +++ b/tests/cases/Encoding/TestUTF8.php @@ -57,7 +57,7 @@ class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest { /** * @dataProvider provideStrings - * @covers MensBeam\Intl\Encoding\UTF8::sync + * @covers MensBeam\Intl\Encoding\UTF8::seekBack */ public function testSTepBackThroughAString(string $input, array $exp) { return parent::testSTepBackThroughAString($input, $exp);