Browse Source

Fix UTF-8 seeking through truncated sequences

multi-byte
J. King 4 years ago
parent
commit
1f007b88f1
  1. 18
      lib/Encoding/UTF8.php

18
lib/Encoding/UTF8.php

@@ -95,24 +95,22 @@ class UTF8 extends AbstractEncoding implements StatelessEncoding {
while ($distance > 0 && $this->posByte > 0) { while ($distance > 0 && $this->posByte > 0) {
$distance--; $distance--;
$this->posChar--; $this->posChar--;
$pos = $this->posByte - 1; $b = ord(@$this->string[$this->posByte - 1]);
$b = ord(@$this->string[$pos]);
if ($b < 0x80) { if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
$this->posByte = $pos; $this->posByte--;
} else { } else {
$s = $pos; $s = $this->posByte;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte $pos = $s - 1;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 4) { // go back at most four bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]); $b = ord(@$this->string[--$pos]);
} }
$this->posByte = $pos; $this->posByte = $pos;
// decrement the character position because nextCode() increments it // decrement the character position because nextCode() increments it
$this->posChar--; $this->posChar--;
if (is_null($this->nextCode())) { // check for overlong sequences: if the sequence is overlong consuming the character will yield an earlier position than where we started
$this->posByte = $s; $this->nextCode();
} else { $this->posByte = ($this->posByte < $s) ? $s - 1 : $pos;
$this->posByte = ($this->posByte > $s) ? $pos : $s;
}
} }
} }
return $distance; return $distance;

Loading…
Cancel
Save