Browse Source

Refactor UTF-8 seeking

span
J. King 6 years ago
parent
commit
1449fae908
  1. 75
      lib/Encoding/UTF8.php
  2. 2
      tests/cases/Encoding/TestUTF8.php

75
lib/Encoding/UTF8.php

@ -102,62 +102,31 @@ class UTF8 implements StatelessEncoding {
return $bytes;
}
/** Move the character pointer $distance characters through the string
*
* A positive $distance moves forward, a negative $distance moves backward, and zero does nothing
*
* The return value is whatever is left of the requested count when movement stops: zero when the full distance was covered, otherwise the number of requested characters still outstanding because the end (or beginning) of the string got in the way
*
* @param int $distance The number of characters to move; the sign selects the direction
* @return int The part of the request which could not be satisfied
*/
public function seek(int $distance): int {
    if ($distance > 0) {
        // forward: nothing can be skipped when the byte pointer already sits at the end of the string
        if (strlen($this->string) == $this->posByte) {
            return $distance;
        }
        for (;;) {
            // decoding a code point is what moves the pointer forward (and bumps the character position)
            $code = $this->nextCode();
            // the counter is decremented before the EOF result is examined, exactly once per pass
            // NOTE(review): this means a pass which only discovers EOF still consumes one unit of the counter; confirm against nextCode()'s EOF behaviour
            $distance--;
            if (!$distance || $code === false) {
                // either the requested number of characters was skipped, or the decoder reported EOF
                break;
            }
        }
        return $distance;
    }
    if ($distance < 0) {
        // backward: work with the magnitude of the request from here on
        $remaining = abs($distance);
        if (!$this->posByte) {
            // the start of the string is as far back as we can go
            return $remaining;
        }
        // the error mode is swapped for MODE_NULL while stepping back and restored afterwards
        // presumably so that malformed sequences met while resynchronizing are not reported; verify against sync()
        $savedMode = $this->errMode;
        $this->errMode = self::MODE_NULL;
        for (;;) {
            // resynchronize to the character containing the byte just before the current position
            $this->sync($this->posByte - 1);
            // the character position has to be stepped back by hand
            $this->posChar--;
            $remaining--;
            if (!$remaining || !$this->posByte) {
                // stop once the request is satisfied or the start of the string has been reached
                break;
            }
        }
        $this->errMode = $savedMode;
        return $remaining;
    }
    // a zero distance is a no-op
    return 0;
}
/** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
protected function sync(int $pos) {
$b = ord(@$this->string[$pos]);
if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
$this->posByte = $pos;
} else {
$s = $pos;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]);
}
$this->posByte = $pos;
// decrement the character position because nextCode() increments it
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;
if (is_null($this->nextCode())) {
$this->posByte = $s;
$pos = $this->posByte - 1;
$b = ord(@$this->string[$pos]);
if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
$this->posByte = $pos;
} else {
$this->posByte = ($this->posByte > $s) ? $pos : $s;
$s = $pos;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]);
}
$this->posByte = $pos;
// decrement the character position because nextCode() increments it
$this->posChar--;
if (is_null($this->nextCode())) {
$this->posByte = $s;
} else {
$this->posByte = ($this->posByte > $s) ? $pos : $s;
}
}
}
return $distance;
}
}

2
tests/cases/Encoding/TestUTF8.php

@ -57,7 +57,7 @@ class TestUTF8 extends \MensBeam\Intl\Test\EncodingTest {
/**
* @dataProvider provideStrings
* @covers MensBeam\Intl\Encoding\UTF8::sync
* @covers MensBeam\Intl\Encoding\UTF8::seekBack
*/
public function testSTepBackThroughAString(string $input, array $exp) {
return parent::testSTepBackThroughAString($input, $exp);

Loading…
Cancel
Save