Browse Source

Fix UTF-8 seeking through truncated sequences

span
J. King 4 years ago
parent
commit
dc11f98c4c
  1. 18
      lib/Encoding/UTF8.php

18
lib/Encoding/UTF8.php

@@ -95,24 +95,22 @@ class UTF8 extends AbstractEncoding implements StatelessEncoding {
while ($distance > 0 && $this->posByte > 0) {
$distance--;
$this->posChar--;
$pos = $this->posByte - 1;
$b = ord(@$this->string[$pos]);
$b = ord(@$this->string[$this->posByte - 1]);
if ($b < 0x80) {
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
$this->posByte = $pos;
$this->posByte--;
} else {
$s = $pos;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$s = $this->posByte;
$pos = $s - 1;
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 4) { // go back at most four bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]);
}
$this->posByte = $pos;
// decrement the character position because nextCode() increments it
$this->posChar--;
if (is_null($this->nextCode())) {
$this->posByte = $s;
} else {
$this->posByte = ($this->posByte > $s) ? $pos : $s;
}
// check for overlong sequences: if the sequence is overlong consuming the character will yield an earlier position than where we started
$this->nextCode();
$this->posByte = ($this->posByte < $s) ? $s - 1 : $pos;
}
}
return $distance;

Loading…
Cancel
Save