From 1f007b88f1b7a35b2a1b9271f6551303e2a3d862 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Fri, 2 Oct 2020 16:17:03 -0400
Subject: [PATCH] Fix UTF-8 seeking through truncated sequences

---
 lib/Encoding/UTF8.php | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/lib/Encoding/UTF8.php b/lib/Encoding/UTF8.php
index a91cb3e..e2f9e8b 100644
--- a/lib/Encoding/UTF8.php
+++ b/lib/Encoding/UTF8.php
@@ -95,24 +95,22 @@ class UTF8 extends AbstractEncoding implements StatelessEncoding {
         while ($distance > 0 && $this->posByte > 0) {
             $distance--;
             $this->posChar--;
-            $pos = $this->posByte - 1;
-            $b = ord(@$this->string[$pos]);
+            $b = ord(@$this->string[$this->posByte - 1]);
             if ($b < 0x80) {
                 // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
-                $this->posByte = $pos;
+                $this->posByte--;
             } else {
-                $s = $pos;
-                while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+                $s = $this->posByte;
+                $pos = $s - 1;
+                while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 4) { // go back at most four bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
                     $b = ord(@$this->string[--$pos]);
                 }
                 $this->posByte = $pos;
                 // decrement the character position because nextCode() increments it
                 $this->posChar--;
-                if (is_null($this->nextCode())) {
-                    $this->posByte = $s;
-                } else {
-                    $this->posByte = ($this->posByte > $s) ? $pos : $s;
-                }
+                // check for overlong sequences: if the sequence is overlong, consuming the character will yield an earlier position than where we started
+                $this->nextCode();
+                $this->posByte = ($this->posByte < $s) ? $s - 1 : $pos;
             }
         }
         return $distance;