Fix UTF-8 seeking through truncated sequences

4 years ago · dc11f98c4c
1 changed file with 8 additions and 10 deletions
--- a/lib/Encoding/UTF8.php
+++ b/lib/Encoding/UTF8.php
@@ -95,24 +95,22 @@ class UTF8 extends AbstractEncoding implements StatelessEncoding {
        while ($distance > 0 && $this->posByte > 0) {
            $distance--;
            $this->posChar--;
-            $pos = $this->posByte - 1;
-            $b = ord(@$this->string[$pos]);
+            $b = ord(@$this->string[$this->posByte - 1]);
            if ($b < 0x80) {
                // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
-                $this->posByte = $pos;
+                $this->posByte--;
            } else {
-                $s = $pos;
-                while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+                $s = $this->posByte;
+                $pos = $s - 1;
+                while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 4) { // go back at most four bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
                    $b = ord(@$this->string[--$pos]);
                }
                $this->posByte = $pos;
                // decrement the character position because nextCode() increments it
                $this->posChar--;
-                if (is_null($this->nextCode())) {
-                    $this->posByte = $s;
-                } else {
-                    $this->posByte = ($this->posByte > $s) ? $pos : $s;
-                }
+                // check for overlong sequences: if the sequence is overlong consuming the character will yield an earlier position than where we started
+                $this->nextCode();
+                $this->posByte = ($this->posByte < $s) ? $s - 1 : $pos;
            }
        }
        return $distance;