From fd8c333a68819590102a66ef3ab2209e4916b11e Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Tue, 10 Apr 2018 17:58:09 -0400
Subject: [PATCH] Split off UTF-8 processing into its own class, greatly
 expanded

Also simplified some parts of the algorithm implementation

Part of this simplification involves the use of goto statements
---
 lib/URI.php  | 100 ++++---------------------
 lib/UTF8.php | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 220 insertions(+), 87 deletions(-)
 create mode 100644 lib/UTF8.php

diff --git a/lib/URI.php b/lib/URI.php
index ea44399..4b7afbd 100644
--- a/lib/URI.php
+++ b/lib/URI.php
@@ -426,10 +426,7 @@ class URI {
         # Let the @ flag, [] flag, and passwordTokenSeenFlag be unset.
         $flagAtSign = $flagSquareBracket = $flagPasswordTokenSeen = false;
         # Let pointer be a pointer to first code point in input.
-        // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer;
-        // $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
-        $pointer = 0;
-        $posPrev = $pos;
+        // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer
         $pos = 0;
         # Keep running the following state machine by switching on state.
         # If after a run pointer points to the EOF code point, go to the next step.
@@ -437,10 +434,12 @@ class URI {
         // Note: the state machine is designed to run once even with an empty string
         do {
             # Within a parser algorithm that uses a pointer variable, c references the code point the pointer variable points to.
-            // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; 
+            // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; 
             // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
-            // $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
-            list($c, $posNext) = $this->getChar($input, $pos);
+            $c = UTF8::get($input, $pos, $posNext);
+            // when the algorithm specifies to decrease the pointer by one, the result is to reprocess the current character; we
+            // accomplish this by going back to this label, which skips the increment at the end of each iteration
+            processChar:
             // switch on state
             switch ($state) {
                 # scheme start state
@@ -452,8 +451,7 @@ class URI {
                     } elseif (!$stateOverride) {
                         # Otherwise, if state override is not given, set state to no scheme state, and decrease pointer by one.
                         $state = self::ST_NO_SCHEME;
-                        $pos = $posPrev; 
-                        $pointer--;
+                        goto processChar;
                     } else {
                         # Otherwise, validation error, return failure.
                         # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute.
@@ -527,7 +525,7 @@ class URI {
                         $state = self::ST_NO_SCHEME;
                         $pos = 0;
                         $pointer = 0;
-                        continue 2;
+                        continue 2; // start over: re-enter the loop so the character at the reset offset is read anew, rather than reprocessing the stale $c
                     } else {
                         # Otherwise, validation error, return failure.
                         # NOTE: This indication of failure is used exclusively by Location object’s protocol attribute. Furthermore, the non-failure termination earlier in this state is an intentional difference for defining that attribute.
@@ -560,13 +558,11 @@ class URI {
                     } elseif ($base->scheme != "file") {
                         # Otherwise, if base’s scheme is not "file", set state to relative state and decrease pointer by one.
                         $state = self::ST_RELATIVE;
-                        $pos = $posPrev;
-                        $pointer--;
+                        goto processChar;
                     } else {
                         # Otherwise, set state to file state and decrease pointer by one.
                         $state = self::ST_FILE;
-                        $pos = $posPrev;
-                        $pointer--;
+                        goto processChar;
                     }
                     break;
                 # special relative or authority state
@@ -577,8 +573,7 @@ class URI {
                     } else {
                         # Otherwise, validation error, set state to relative state and decrease pointer by one.
                         $state = self::ST_RELATIVE;
-                        $pos = $posPrev;
-                        $pointer--;
+                        goto processChar;
                     }
                     break;
                 # path or authority state
@@ -589,8 +584,7 @@ class URI {
                     } else {
                         # Otherwise, set state to path state, and decrease pointer by one.
                         $state = self::ST_PATH;
-                        $pos = $posPrev;
-                        $pointer--;
+                        goto processChar;
                     }
                     break;
                 // invalid or unimplemented state
@@ -600,10 +594,8 @@ class URI {
             }
             # If after a run pointer points to the EOF code point, go to the next step.
             # Otherwise, increase pointer by one and continue with the state machine.
-            // DEVIATION: we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; 
+            // we operate on byte strings: $pos is the byte offset of the character referred to by $pointer; 
             // $posNext is the start of "remaining" i.e. the offset of the next UTF-8 character
-            // $posPrev is the byte offset of the start of the previous character i.e. ($pointer - 1)
-            $posPrev = $pos;
             $pos = $posNext;
             $pointer++;
         } while ($pos <= $eof);
@@ -632,70 +624,4 @@ class URI {
                 throw new \Exception;
         }
     }
-
-    /** Returns the UTF-8 character at byte offset $pos (which could possibly be a replacement charcter) along with the byte offset of the next character */
-    protected function getChar(string $input, int $pos, bool $throwOnError = false, $replacementChar = "\u{FFFD}"): array {
-        // get the byte at the specified position
-        $b = ($pos < strlen($input)) ? $input[$pos] : "";
-        if ($b < "\x80" || $b=="") {
-            // if the byte is an ASCII character or end of input, simply return it
-            return [$b, $pos + 1];
-        } else {
-            // otherwise determine the byte-length of the UTF-8 character
-            $l = $this->getCharLength($b);
-            if (!$l && $throwOnError) {
-                // if the byte is invalid and we're supposed to halt, halt
-                throw new \Exception;
-            } elseif (!$l) {
-                // if the byte is invalid and we're supposed to continue, skip any further invalid bytes and return a replacement character instead
-                do {
-                    $l = $this->getCharLength($input[++$pos]);
-                } while (!$l);
-                return [$replacementChar, $pos];
-            } else {
-                // otherwise collect valid mid-sequence bytes into a buffer until the whole character is retrieved or an invalid byte is encountered
-                $buffer = $b;
-                do {
-                    $b = (++$pos < strlen($input)) ? $input[$pos] : "";
-                    if ($b >= "\x80" && $b <= "\xBF") {
-                        // if the byte is valid, add it to the buffer
-                        $buffer .= $b;
-                    } elseif ($throwOnError) {
-                        // if the byte is invalid and we're supposed to halt, halt
-                        throw new \Exception;
-                    } else {
-                        // if the byte is invalid and we're supposed to continue, go back one byte and skip any bytes which are not sequence-start bytes, then return a replacement character
-                        $pos--;
-                        do {
-                            $l = $this->getCharLength($input[++$pos]);
-                        } while (!$l);
-                        return [$replacementChar, $pos];
-                    }
-                } while (strlen($buffer) < $l);
-                // return the filled buffer and the position of the next byte
-                return [$buffer, $pos + 1];
-            }
-        }
-    }
-
-    /** 
-     * Returns the total expected length of the UTF-8 character starting with byte $b 
-     * 
-     * If the byte is not the start of a UTF-8 sequence, 0 is returned
-     */
-    protected function getCharLength(string $b): int {
-        if ($b >= "\xC0" && $b <= "\xDF") { // two-byte character
-            return 2;
-        } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character
-            return 3;
-        } elseif ($b >= "\xF0" && $b <= "\xF7") { // four-byte character
-            return 4;
-        } elseif ($b < "\x80") { // ASCII byte: one-byte character
-            return 1;
-        } elseif ($b == "") { // end of input: pretend it's a valid single-byte character
-            return 1;
-        } else { // invalid byte
-            return 0;
-        }
-    }
 }
diff --git a/lib/UTF8.php b/lib/UTF8.php
new file mode 100644
index 0000000..33bfc88
--- /dev/null
+++ b/lib/UTF8.php
@@ -0,0 +1,277 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\URI;
+
+/**
+ * Helpers for walking through UTF-8 byte strings one character at a time
+ *
+ * All positions are byte offsets. Ill-formed input is handled according to an
+ * error mode: M_REPLACE substitutes $replacementChar for each ill-formed
+ * sequence, M_SKIP passes over it silently, and M_HALT throws an exception
+ */
+abstract class UTF8 {
+    /** The string returned in place of an ill-formed sequence in M_REPLACE mode */
+    public static $replacementChar = "\u{FFFD}";
+    /** The error mode used by any method which is not passed one explicitly */
+    public static $errMode = self::M_REPLACE;
+
+    const M_REPLACE = 0;
+    const M_SKIP = 1;
+    const M_HALT = 2;
+
+    /** Retrieve a character from $string starting at byte offset $pos
+     *
+     * $next is a variable in which to store the next byte offset at which a character starts
+     *
+     * The returned character may be a replacement character, or the empty string if no character remains at or after $pos;
+     * in the latter case $next is set to an offset past the end of the string
+     *
+     * Overlong forms, surrogates, and code points beyond U+10FFFF are treated as ill-formed
+     *
+     * @throws \Exception if an ill-formed sequence is encountered in M_HALT mode
+     */
+    public static function get(string $string, int $pos, &$next = null, int $errMode = null): string {
+        $errMode = $errMode ?? self::$errMode;
+        $end = strlen($string);
+        while ($pos < $end) {
+            $b = $string[$pos];
+            if ($b < "\x80") {
+                // ASCII bytes are always complete characters
+                $next = $pos + 1;
+                return $b;
+            }
+            list($len, $valid) = self::scan($string, $pos);
+            if ($valid) {
+                $next = $pos + $len;
+                return substr($string, $pos, $len);
+            } elseif ($errMode == self::M_SKIP) {
+                // pass over the ill-formed bytes and try again from the byte which follows them
+                $pos += $len;
+            } elseif ($errMode == self::M_REPLACE) {
+                // stand in a replacement character for the ill-formed bytes
+                $next = $pos + $len;
+                return self::$replacementChar;
+            } else {
+                throw new \Exception("Ill-formed UTF-8 sequence at byte offset $pos");
+            }
+        }
+        // end of input
+        $next = $pos + 1;
+        return "";
+    }
+
+    /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character
+     *
+     * If $num is negative, the operation will be performed in reverse
+     *
+     * If $pos is omitted, the start of the string will be used for a forward (or zero-length) seek, and the end for a reverse seek
+     *
+     * A forward seek which runs out of input returns the offset of the end of the string (or $pos itself, if that
+     * was already beyond the end), and a reverse seek which runs out of input returns the offset of the start of the string
+     *
+     * @throws \Exception if an ill-formed sequence is encountered in M_HALT mode
+     */
+    public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int {
+        $errMode = $errMode ?? self::$errMode;
+        $end = strlen($string);
+        if ($num > 0) {
+            $pos = $pos ?? 0;
+            for ($i = 0; $i < $num; $i++) {
+                self::get($string, $pos, $next, $errMode);
+                if ($next > $end) {
+                    // only the end of input yields an offset past the end of the string: stop there rather than stepping beyond it
+                    return max($pos, $end);
+                }
+                $pos = $next;
+            }
+            return $pos;
+        } elseif ($num < 0) {
+            // a starting offset past the end of the string cannot be part of any character
+            $pos = min($pos ?? $end, $end);
+            for ($i = $num; $i < 0 && $pos > 0; $i++) {
+                // the nearest character start at or before the preceding byte is the start of the previous character
+                $pos = self::sync($string, $pos - 1, $errMode);
+            }
+            return $pos;
+        } else {
+            // seeking zero characters is equivalent to a sync
+            return self::sync($string, $pos ?? 0, $errMode);
+        }
+    }
+
+    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos
+     *
+     * Offsets at or before the start of the string, or at or past its end, are returned unchanged
+     *
+     * In M_REPLACE mode each ill-formed sequence counts as one character, matching what get() would return;
+     * in M_SKIP mode ill-formed sequences are passed over in search of an earlier well-formed character,
+     * yielding 0 if there is none
+     *
+     * @throws \Exception if $pos does not fall within a well-formed character in M_HALT mode
+     */
+    public static function sync(string $string, int $pos, int $errMode = null): int {
+        $errMode = $errMode ?? self::$errMode;
+        $end = strlen($string);
+        while ($pos > 0 && $pos < $end) {
+            // remember where this attempt started
+            $origin = $pos;
+            // back up over continuation bytes to find a candidate lead byte; a well-formed character
+            // has at most three of them, but in skip mode any number of strays may need to be crossed
+            $steps = 0;
+            while ($pos > 0 && ($steps < 3 || $errMode == self::M_SKIP) && $string[$pos] >= "\x80" && $string[$pos] <= "\xBF") {
+                $pos--;
+                $steps++;
+            }
+            // measure the sequence starting at the candidate, and check whether it reaches the byte we started from
+            list($len, $valid) = self::scan($string, $pos);
+            $covers = ($origin < $pos + $len);
+            if ($errMode == self::M_SKIP) {
+                if ($valid || !$pos) {
+                    // either this is the nearest well-formed character (any bytes between it and the
+                    // origin being strays), or we have run out of string and must give up
+                    return $pos;
+                }
+                // otherwise pass over the candidate and search again from the byte before it
+                $pos--;
+            } elseif ($errMode == self::M_REPLACE) {
+                // a sequence which stops short of the origin leaves the origin byte as a replacement character in its own right
+                return $covers ? $pos : $origin;
+            } elseif ($valid && $covers) {
+                return $pos;
+            } else {
+                throw new \Exception("Ill-formed UTF-8 sequence at byte offset $origin");
+            }
+        }
+        return $pos;
+    }
+
+    /** Count the characters of $string which start at or after byte offset $start and before byte offset $end
+     *
+     * If $end is omitted, the end of the string is used; a negative $start is treated as 0
+     *
+     * In M_REPLACE mode each ill-formed sequence counts as one character; in M_SKIP mode it is not counted
+     *
+     * @throws \Exception if an ill-formed sequence is encountered in M_HALT mode
+     */
+    public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int {
+        $errMode = $errMode ?? self::$errMode;
+        $size = strlen($string);
+        $end = min($end ?? $size, $size);
+        $count = 0;
+        $pos = max($start, 0);
+        while ($pos < $end) {
+            $c = self::get($string, $pos, $next, $errMode);
+            if ($next > $size) {
+                // end of input, which in skip mode may come early if only ill-formed bytes remained
+                break;
+            }
+            // in skip mode the returned character may start later than $pos, so work out its true start from its size
+            $first = ($errMode == self::M_SKIP) ? $next - strlen($c) : $pos;
+            if ($first >= $end) {
+                break;
+            }
+            $count++;
+            $pos = $next;
+        }
+        return $count;
+    }
+
+    /** Retrieve up to $length characters of $str, starting at byte offset $start
+     *
+     * If $length is omitted, all characters up to the end of the string are retrieved; if it is
+     * zero or negative the empty string is returned
+     *
+     * $next is a variable in which to store the byte offset of the character following the last one
+     * retrieved; when $length is zero or negative it receives the result of synchronizing $start instead
+     *
+     * @throws \Exception if an ill-formed sequence is encountered in M_HALT mode
+     */
+    public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string {
+        $errMode = $errMode ?? self::$errMode;
+        if ($length !== null && $length <= 0) {
+            $next = self::sync($str, $start, $errMode);
+            return "";
+        }
+        $end = strlen($str);
+        $pos = $start;
+        $buffer = "";
+        for ($i = 0; $length === null || $i < $length; $i++) {
+            $c = self::get($str, $pos, $after, $errMode);
+            if ($after > $end) {
+                // end of input: report the end of the string rather than an offset beyond it
+                $pos = max($pos, $end);
+                break;
+            }
+            $buffer .= $c;
+            $pos = $after;
+        }
+        $next = $pos;
+        return $buffer;
+    }
+
+    /** Measure the UTF-8 sequence which starts at byte offset $pos, which must lie within $string
+     *
+     * Returns an array of two members: the number of bytes in the sequence (always at least 1), and
+     * whether the sequence is a complete, well-formed character. For an ill-formed sequence the count
+     * takes in the first byte and any acceptable bytes after it, so decoding can resume right after the counted bytes
+     */
+    private static function scan(string $string, int $pos): array {
+        $lead = $string[$pos];
+        $len = self::l($lead);
+        if (!$len) {
+            // continuation bytes, and bytes which never occur in UTF-8, cannot start a character
+            return [1, false];
+        }
+        // after certain lead bytes the first continuation byte has a narrower range than usual;
+        // this is what excludes overlong forms, surrogates, and code points beyond U+10FFFF
+        $lower = "\x80";
+        $upper = "\xBF";
+        if ($lead === "\xE0") {
+            $lower = "\xA0";
+        } elseif ($lead === "\xED") {
+            $upper = "\x9F";
+        } elseif ($lead === "\xF0") {
+            $lower = "\x90";
+        } elseif ($lead === "\xF4") {
+            $upper = "\x8F";
+        }
+        $end = strlen($string);
+        for ($n = 1; $n < $len; $n++) {
+            $b = ($pos + $n < $end) ? $string[$pos + $n] : "";
+            if ($b === "" || $b < $lower || $b > $upper) {
+                // the sequence is cut short by end of input or by a byte which cannot continue it
+                return [$n, false];
+            }
+            // all further continuation bytes use the full range
+            $lower = "\x80";
+            $upper = "\xBF";
+        }
+        return [$len, true];
+    }
+
+    /**
+     * Returns the expected byte length of a UTF-8 character starting with byte $b
+     *
+     * The empty string, standing for end of input, is treated as a one-byte character
+     *
+     * If the byte is not the start of a UTF-8 sequence, 0 is returned; this includes the
+     * bytes C0, C1, and F5 through FF, which never occur in well-formed UTF-8
+     */
+    protected static function l(string $b): int {
+        if ($b < "\x80") { // ASCII byte (or end of input): one-byte character
+            return 1;
+        } elseif ($b >= "\xC2" && $b <= "\xDF") { // two-byte character
+            return 2;
+        } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character
+            return 3;
+        } elseif ($b >= "\xF0" && $b <= "\xF4") { // four-byte character
+            return 4;
+        } else { // continuation byte or invalid byte
+            return 0;
+        }
+    }
+}