code point > bytes) to normalize overlong sequences return self::chr($p); } elseif ($errMode==self::M_REPLACE) { // if the byte is invalid and we're supposed to replace, return a replacement character return self::$replacementChar; } elseif ($errMode==self::M_SKIP) { // if the character is invalid and we're supposed to skip invalid characters, advance the position and start over $pos = $next; goto start; } else { // if the byte is invalid and we're supposed to halt, halt throw new \Exception; } } } /** Starting from byte offset $pos, advance $num characters through $string and return the byte offset of the found character * * If $num is negative, the operation will be performed in reverse * * If $pos is omitted, the start of the string will be used for a forward seek, and the end for a reverse seek */ public static function seek(string $string, int $num, int $pos = null, int $errMode = null): int { $errMode = $errMode ?? self::$errMode; if ($num > 0) { $pos = $pos ?? 0; do { $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference } while (--$num && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF return $pos; } elseif ($num < 0) { $pos = $pos ?? strlen($string); if (!$pos) { // if we're already at the start of the string, we can't go further back return $pos; } $num = abs($num); do { $pos = self::sync($string, $pos -1, $errMode); $num--; } while ($num && $pos); return $pos; } else { // seeking zero characters is equivalent to a sync return self::sync($string, $pos, $errMode); } } /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */ public static function sync(string $string, int $pos, int $errMode = null): int { $errMode = $errMode ?? self::$errMode; start: if (!$pos || $pos >= strlen($string)) { // if we're at the start of the string or past its end, then this is the character start return $pos; } // save the start position for later, and increment before the coming decrement loop $s = $pos++; // examine the current byte and skip up to three continuation bytes, going backward and counting the number of examined bytes (between 1 and 4) $t = 0; do { $pos--; $t++; $b = ($pos < strlen($string)) ? $string[$pos] : ""; } while ( $b >= "\x80" && $b <= "\xBF" && // continuation bytes ($t < 4 || $errMode==self::M_SKIP) && // stop after four bytes, unless we're skipping invalid sequences $pos > 0 // stop once the start of the string has been reached ); // attempt to extract a code point at the current position $p = self::ord($string, $pos, $n, self::M_REPLACE); // if the position of the character after the one we just consumed is earlier than our start position, // then there was at least one invalid sequence between the consumed character and the start position if ($n < $s) { if ($errMode==self::M_SKIP) { // if we're supposed to skip invalid sequences, there is no need to do anything } elseif ($errMode==self::M_REPLACE) { // if we're supposed to replace invalid sequences, return the starting offset: it is itself a character return $s; } else { // otherwise if the character is invalid and we're expected to halt, halt throw new \Exception; } } // if the consumed character is valid, return the current position if (is_int($p)) { return $pos; } elseif ($errMode==self::M_SKIP) { // if we're supposed to skip invalid sequences: if ($pos < 1) { // if we're already at the start of the string, give up return $pos; } else { // otherwise skip over the last examined byte and start over $pos--; goto start; } } elseif ($errMode==self::M_REPLACE) { // if we're supposed to replace invalid sequences, return the current offset: we've synchronized return $pos; } else { // otherwise if the character is invalid and we're expected to halt, halt throw new \Exception; } } public static function len(string $string, int $start = 0, int $end = null, int $errMode = null): int { $errMode = $errMode ?? self::$errMode; $end = $end ?? strlen($string); if (substr($string, $start, ($end - $start)) =="") { return 0; } $count = 0; $pos = $start; do { $c = self::get($string, $pos, $pos, $errMode); } while ($c != "" && ++$count && $pos < $end); return $count; } public static function substr(string $str, int $start = 0, int $length = null, &$next = null, int $errMode = null): string { $errMode = $errMode ?? self::$errMode; if ($length > 0) { $pos = $start; $buffer = ""; do { $c = self::get($string, $pos, $pos, $errMode); // the current position is getting overwritten with the next position, by reference $buffer .= $c; } while (--$length && $c != ""); // stop after we have skipped the desired number of characters, or reached EOF $next = $pos; return $buffer; } else { $next = self::sync($string, $start, $errMode); return ""; } } /** Decodes the first UTF-8 character from a byte sequence into a numeric code point, starting at byte offset $pos * * Upon success, returns the numeric code point of the character, an integer between 0 and 1114111 * * Upon error, returns false; if $char is the empty string or $pos is beyond the end of the string, null is returned * * $next is a variable in which to store the next byte offset at which a character starts */ public static function ord(string $string, int $pos = 0, &$next = null, int $errMode = null) { // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder // though it differs from a slavish implementation because it operates on only a single // character rather than a whole stream $eof = strlen($string); start: $point = null; $seen = 0; $needed = 0; $next = $pos + 1; $lower = "\x80"; $upper = "\xBF"; while ($pos < $eof && !($needed && $seen >= $needed)) { $b = $string[$pos++]; $next = $pos; $seen++; if(!$needed) { $needed = self::l($b); switch($needed) { case 1: $point = ord($b); break; case 2: $point = ord($b) & 0x1F; break; case 3: if ($b=="\xE0") { $lower = "\xA0"; } elseif ($b=="\xED") { $upper = "\x9F"; } $point = ord($b) & 0xF; break; case 4: if ($b=="\xF0") { $lower = "\x90"; } elseif ($b=="\xF4") { $upper = "\x8F"; } $point = ord($b) & 0x7; break; case 0: switch ($errMode ?? self::$errMode) { case self::M_SKIP: goto start; case self::M_REPLACE: return false; default: throw new \Exception; } break; } } elseif ($b < $lower || $b > $upper) { switch ($errMode ?? self::$errMode) { case self::M_SKIP: goto start; case self::M_REPLACE: return false; default: throw new \Exception; } } else { $lower = "\x80"; $upper = "\xBF"; $point = ($point << 6) | (ord($b) & 0x3F); } } if ($seen < $needed) { switch ($errMode ?? self::$errMode) { case self::M_SKIP: goto start; case self::M_REPLACE: return false; default: throw new \Exception; } } else { return $point; } } /** Returns the UTF-8 encoding of $codePoint * * If $codePoint is less than 0 or greater than 1114111, an empty string is returned */ public static function chr(int $codePoint): string { // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder if ($codePoint < 0 || $codePoint > 0x10FFFF) { return ""; } elseif ($codePoint < 128) { return chr($codePoint); } elseif ($codePoint < 0x800) { $count = 1; $offset = 0xC0; } elseif ($codePoint < 0x10000) { $count = 2; $offset = 0xE0; } else { $count = 3; $offset = 0xF0; } $bytes = chr(($codePoint >> (6 * $count)) + $offset); while ($count > 0) { $temp = $codePoint >> (6 * ($count - 1)); $bytes .= chr(0x80 | ($temp & 0x3F)); $count--; } return $bytes; } /** * Returns the expected byte length of a UTF-8 character starting with byte $b * * If the byte is not the start of a UTF-8 sequence, 0 is returned */ protected static function l(string $b): int { if ($b >= "\xC2" && $b <= "\xDF") { // two-byte character return 2; } elseif ($b >= "\xE0" && $b <= "\xEF") { // three-byte character return 3; } elseif ($b >= "\xF0" && $b <= "\xF4") { // four-byte character return 4; } elseif ($b < "\x80") { // ASCII byte: one-byte character return 1; } elseif ($b == "") { // end of input: pretend it's a valid single-byte character return 1; } else { // invalid byte return 0; } } }