<?php
/** @license MIT
 * Copyright 2018 J. King et al.
 * See LICENSE and AUTHORS files for details */

declare(strict_types=1);
namespace MensBeam\UTF8;

/** A forward- and backward-seekable cursor over a byte string interpreted as UTF-8
 *
 * The cursor tracks both a byte offset and a character offset. Invalid or
 * truncated sequences are counted as a single character each. The cursor never
 * moves past the end of the string: reading at end of input consumes nothing.
 */
class UTF8String {
    /** @var string The raw bytes being decoded */
    protected $string;
    /** @var int Byte offset of the next character to be read */
    protected $posByte = 0;
    /** @var int Number of characters which precede the current position */
    protected $posChar = 0;


    /**
     * @param string $string The byte string to traverse; it is not validated up front
     */
    public function __construct(string $string) {
        $this->string = $string;
    }

    /** Returns the current byte offset into the string */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character offset into the string */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Retrieve the next character in the string
     *
     * The returned character may be a replacement character (when the bytes at
     * the current position could not be decoded), or the empty string if the
     * end of the string has already been reached. Nothing is consumed at end
     * of input, so the reported positions never exceed the string's bounds.
     *
     * @return string The UTF-8 serialization of the next character, or ""
     */
    public function nextChr(): string {
        // get the byte at the current position; an out-of-range offset yields the empty string
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input: do not advance either position
            return "";
        } elseif (ord($b) < 0x80) {
            // an ASCII byte is its own serialization
            $this->posChar++;
            $this->posByte++;
            return $b;
        } else {
            // otherwise return the serialization of the code point at the current position,
            // substituting U+FFFD when the sequence could not be decoded
            return UTF8::chr($this->nextOrd() ?? 0xFFFD);
        }
    }

    /** Decodes the next UTF-8 character from the string and returns its code point number
     *
     * If a character could not be decoded, null is returned and the invalid
     * sequence is consumed as one character; if the end of the string has
     * already been reached, false is returned and nothing is consumed.
     *
     * @return int|null|false
     */
    public function nextOrd() {
        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
        // though it differs from a slavish implementation because it operates on only a single
        // character rather than a whole stream
        $b = $this->string[$this->posByte] ?? "";
        if ($b === "") {
            // end of input: leave both positions untouched so the cursor stays within the string
            return false;
        }
        // from here on exactly one character (valid or not) is consumed
        $this->posChar++;
        // the lead byte is always consumed, even if it turns out to be invalid
        $this->posByte++;
        $b = ord($b);
        if ($b < 0x80) {
            // optimization for ASCII characters
            return $b;
        }
        // permitted range for the next continuation byte; some lead bytes narrow the
        // range of the first continuation to exclude overlong forms, surrogates,
        // and code points beyond U+10FFFF
        $lower = 0x80;
        $upper = 0xBF;
        if ($b >= 0xC2 && $b <= 0xDF) { // two-byte character
            $needed = 2;
            $point = $b & 0x1F;
        } elseif ($b >= 0xE0 && $b <= 0xEF) { // three-byte character
            $needed = 3;
            if ($b === 0xE0) {
                $lower = 0xA0;
            } elseif ($b === 0xED) {
                $upper = 0x9F;
            }
            $point = $b & 0xF;
        } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
            $needed = 4;
            if ($b === 0xF0) {
                $lower = 0x90;
            } elseif ($b === 0xF4) {
                $upper = 0x8F;
            }
            $point = $b & 0x7;
        } else { // invalid byte
            return null;
        }
        for ($seen = 1; $seen < $needed; $seen++) {
            // reading past the end yields a NUL byte, which is never a valid continuation
            $b = ord($this->string[$this->posByte] ?? "\0");
            if ($b < $lower || $b > $upper) {
                // the offending byte is not consumed: it may start the next character
                return null;
            }
            $this->posByte++;
            // only the first continuation byte has a restricted range
            $lower = 0x80;
            $upper = 0xBF;
            $point = ($point << 6) | ($b & 0x3F);
        }
        return $point;
    }

    /** Advance $distance characters through the string
     *
     * If $distance is negative, the operation will be performed in reverse
     *
     * If the end (or beginning) of the string was reached before the end of the operation, false is returned
     *
     * @param int $distance The number of characters to skip; negative values move backward
     * @return bool Whether the full distance was covered
     */
    public function seek(int $distance): bool {
        if ($distance > 0) {
            // only count characters which were actually skipped; hitting the end of
            // the string stops the loop without decrementing the remaining distance
            while ($distance > 0 && $this->nextOrd() !== false) {
                $distance--;
            }
            return $distance === 0;
        } elseif ($distance < 0) {
            if ($this->posByte === 0) {
                // if we're already at the start of the string, we can't go further back
                return false;
            }
            $distance = -$distance;
            do {
                $this->sync($this->posByte - 1);
                // manually decrement the character position
                $this->posChar--;
            } while (--$distance && $this->posByte);
            return !$distance;
        } else {
            return true;
        }
    }

    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos
     *
     * @param int $pos A byte offset; callers in this class pass a non-negative offset
     */
    protected function sync(int $pos) {
        // reading past the end yields a NUL byte, which is treated like any ASCII byte
        $b = ord($this->string[$pos] ?? "\0");
        if ($b < 0x80) {
            // if the byte is an ASCII byte or the end of input, then this is already a synchronized position
            $this->posByte = $pos;
        } else {
            $s = $pos;
            while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
                $b = ord($this->string[--$pos] ?? "\0");
            }
            $this->posByte = $pos;
            // decrement the character position because nextOrd() increments it
            $this->posChar--;
            if (is_null($this->nextOrd())) {
                // decoding from the candidate start failed: treat the original byte as a character of its own
                $this->posByte = $s;
            } else {
                // the candidate start is correct only if the decoded character extends past the original offset
                $this->posByte = ($this->posByte > $s) ? $pos : $s;
            }
        }
    }
}