|
|
|
<?php
|
|
|
|
/** @license MIT
|
|
|
|
* Copyright 2018 J. King et al.
|
|
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace MensBeam\UTF8;
|
|
|
|
|
|
|
|
class UTF8String {
|
|
|
|
protected $string;
|
|
|
|
protected $posByte = 0;
|
|
|
|
protected $posChar = 0;
|
|
|
|
|
|
|
|
|
|
|
|
public function __construct(string $string) {
|
|
|
|
$this->string = $string;
|
|
|
|
}
|
|
|
|
|
|
|
|
public function posByte(): int {
|
|
|
|
return $this->posByte;
|
|
|
|
}
|
|
|
|
|
|
|
|
public function posChar(): int {
|
|
|
|
return $this->posChar;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Retrieve the next character in the string
|
|
|
|
*
|
|
|
|
* The returned character may be a replacement character, or the empty string if the end of the string has already been reached
|
|
|
|
*/
|
|
|
|
public function nextChr(): string {
|
|
|
|
// get the byte at the current position
|
|
|
|
$b = @$this->string[$this->posByte];
|
|
|
|
if (ord($b) < 0x80) {
|
|
|
|
// if the byte is an ASCII character or end of input, simply return it
|
|
|
|
$this->posChar++;
|
|
|
|
$this->posByte++;
|
|
|
|
return $b;
|
|
|
|
} else {
|
|
|
|
// otherwise return the serialization of the code point at the current position
|
|
|
|
return UTF8::chr($this->nextOrd() ?? 0xFFFD);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Decodes the next UTF-8 character from the string and returns its code point number
|
|
|
|
*
|
|
|
|
* If a character could not be decoded, null is returned; if the end of the string has already been reached, false is returned
|
|
|
|
*/
|
|
|
|
public function nextOrd() {
|
|
|
|
// this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
|
|
|
|
// though it differs from a slavish implementation because it operates on only a single
|
|
|
|
// character rather than a whole stream
|
|
|
|
$this->posChar++;
|
|
|
|
// optimization for ASCII characters
|
|
|
|
$b = @$this->string[$this->posByte];
|
|
|
|
if ($b=="") {
|
|
|
|
$this->posByte++;
|
|
|
|
return false;
|
|
|
|
} elseif (($b = ord($b)) < 0x80) {
|
|
|
|
$this->posByte++;
|
|
|
|
return $b;
|
|
|
|
}
|
|
|
|
$point = 0;
|
|
|
|
$seen = 0;
|
|
|
|
$needed = 1;
|
|
|
|
$lower = 0x80;
|
|
|
|
$upper = 0xBF;
|
|
|
|
while ($seen < $needed) {
|
|
|
|
$b = ord(@$this->string[$this->posByte++]);
|
|
|
|
if (!$seen) {
|
|
|
|
if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
|
|
|
|
$needed = 2;
|
|
|
|
$point = $b & 0x1F;
|
|
|
|
} elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
|
|
|
|
$needed = 3;
|
|
|
|
if ($b==0xE0) {
|
|
|
|
$lower = 0xA0;
|
|
|
|
} elseif ($b==0xED) {
|
|
|
|
$upper = 0x9F;
|
|
|
|
}
|
|
|
|
$point = $b & 0xF;
|
|
|
|
} elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
|
|
|
|
$needed = 4;
|
|
|
|
if ($b==0xF0) {
|
|
|
|
$lower = 0x90;
|
|
|
|
} elseif ($b==0xF4) {
|
|
|
|
$upper = 0x8F;
|
|
|
|
}
|
|
|
|
$point = $b & 0x7;
|
|
|
|
} else { // invalid byte
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
} elseif ($b < $lower || $b > $upper) {
|
|
|
|
$this->posByte--;
|
|
|
|
return null;
|
|
|
|
} else {
|
|
|
|
$lower = 0x80;
|
|
|
|
$upper = 0xBF;
|
|
|
|
$point = ($point << 6) | ($b & 0x3F);
|
|
|
|
}
|
|
|
|
$seen++;
|
|
|
|
}
|
|
|
|
return $point;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Advance $distance characters through the string
|
|
|
|
*
|
|
|
|
* If $distance is negative, the operation will be performed in reverse
|
|
|
|
*
|
|
|
|
* If the end (or beginning) of the string was reached before the end of the operation, false is returned
|
|
|
|
*/
|
|
|
|
public function seek(int $distance): bool {
|
|
|
|
if ($distance > 0) {
|
|
|
|
do {
|
|
|
|
// get the next code point; this automatically increments the character position
|
|
|
|
$p = $this->nextOrd();
|
|
|
|
} while (--$distance && $p !== false); // stop after we have skipped the desired number of characters, or reached EOF
|
|
|
|
return !$distance;
|
|
|
|
} elseif ($distance < 0) {
|
|
|
|
if (!$this->posByte) {
|
|
|
|
// if we're already at the start of the string, we can't go further back
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
$distance = abs($distance);
|
|
|
|
do {
|
|
|
|
$this->sync($this->posByte - 1);
|
|
|
|
// manually decrement the character position
|
|
|
|
$this->posChar--;
|
|
|
|
} while (--$distance && $this->posByte);
|
|
|
|
return !$distance;
|
|
|
|
} else {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
|
|
|
|
protected function sync(int $pos) {
|
|
|
|
$b = ord(@$this->string[$pos]);
|
|
|
|
if ($b < 0x80) {
|
|
|
|
// if the byte is an ASCII byte or the end of input, then this is already a synchronized position
|
|
|
|
$this->posByte = $pos;
|
|
|
|
} else {
|
|
|
|
$s = $pos;
|
|
|
|
while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
|
|
|
|
$b = ord(@$this->string[--$pos]);
|
|
|
|
}
|
|
|
|
$this->posByte = $pos;
|
|
|
|
// decrement the character position because nextOrd() increments it
|
|
|
|
$this->posChar--;
|
|
|
|
if (is_null($this->nextOrd())) {
|
|
|
|
$this->posByte = $s;
|
|
|
|
} else {
|
|
|
|
$this->posByte = ($this->posByte > $s ) ? $pos : $s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|