Define interfaces for encodings

6 years ago · 8c97b42303
5 changed files with 256 additions and 141 deletions
--- a/lib/Encoding/Encoding.php
+++ b/lib/Encoding/Encoding.php
@ -0,0 +1,77 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Intl\Encoding;
+
+/** Common contract for string decoders
+ *
+ * A decoder wraps a byte string and keeps a cursor into it, reporting the cursor both as a byte offset and as a character offset
+ */
+interface Encoding {
+    // Error-handling modes, as interpreted by GenericEncoding::err()
+    // MODE_NULL: return null instead of a replacement; used internally during backward seeking
+    const MODE_NULL = 0;
+    // MODE_REPLACE: substitute U+FFFD for the invalid sequence
+    const MODE_REPLACE = 1;
+    // MODE_HTML: substitute an HTML numeric character reference ("&#N;")
+    const MODE_HTML = 2;
+    // MODE_FATAL_DEC: throw a DecoderException
+    const MODE_FATAL_DEC = 3;
+    // MODE_FATAL_ENC: throw an EncoderException
+    const MODE_FATAL_ENC = 4;
+
+    // Exception codes
+    // NOTE(review): presumably used for code points outside the Unicode range; it is not referenced by GenericEncoding -- confirm against the encoders
+    const E_INVALID_CODE_POINT = 1;
+    // Passed by GenericEncoding::err() with the exceptions thrown in both fatal modes
+    const E_INVALID_BYTE = 2;
+    // Passed by GenericEncoding::err() when it is given an unknown MODE_* value
+    const E_INVALID_MODE = 3;
+
+    /** Constructs a new decoder
+     *
+     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
+     *
+     * @param string $string The byte string to decode
+     * @param bool $fatal Whether invalid code sequences raise an exception rather than being replaced
+     */
+    public function __construct(string $string, bool $fatal = false);
+
+    /** Returns the current byte position of the decoder */
+    public function posByte(): int;
+
+    /** Returns the current character position of the decoder */
+    public function posChar(): int;
+
+    /** Retrieves the next character in the string, in UTF-8 encoding
+     *
+     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
+     */
+    public function nextChar(): string;
+
+    /** Decodes the next character from the string and returns its code point number
+     *
+     * If the end of the string has been reached, false is returned
+     *
+     * No native return type is declared because the result is either an integer or false
+     *
+     * @return int|false
+     */
+    public function nextCode();
+
+    /** Advances $distance characters through the string
+     *
+     * If $distance is negative, the operation will be performed in reverse
+     *
+     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
+     *
+     * @param int $distance The number of characters to move; negative values move backward
+     */
+    public function seek(int $distance): int;
+
+    /** Seeks to the start of the string
+     *
+     * This is usually faster than using the seek method for the same purpose
+     */
+    public function rewind();
+
+    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
+     *
+     * @param int $num The maximum number of characters to retrieve
+     */
+    public function peekChar(int $num = 1): string;
+
+    /** Retrieves the next $num code points from the string, without advancing the character pointer
+     *
+     * @param int $num The maximum number of code points to retrieve
+     */
+    public function peekCode(int $num = 1): array;
+
+    /** Calculates the length of the string in code points
+     *
+     * Note that this may involve processing to the end of the string
+     */
+    public function len(): int;
+
+    /** Generates an iterator which steps through each character in the string */
+    public function chars(): \Generator;
+
+    /** Generates an iterator which steps through each code point in the string */
+    public function codes(): \Generator;
+}
--- a/lib/Encoding/GenericEncoding.php
+++ b/lib/Encoding/GenericEncoding.php
@ -0,0 +1,141 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Intl\Encoding;
+
+/** Common implementation of the constructor, position accessors, peeking, length calculation, iteration and error handling for decoders
+ *
+ * The trait calls nextChar() and nextCode() and refers to the MODE_* and E_* constants through self:: without defining any of them; the using class must provide them (the Encoding interface declares all of them)
+ */
+trait GenericEncoding {
+
+    /** The byte string being decoded */
+    protected $string;
+    /** Current position in the string, in bytes */
+    protected $posByte = 0;
+    /** Current position in the string, in characters */
+    protected $posChar = 0;
+    /** Length of the string in bytes; set by the constructor */
+    protected $lenByte = null;
+    /** Length of the string in code points; null until len() has computed it */
+    protected $lenChar = null;
+    /** One of the MODE_* constants; set by the constructor */
+    protected $errMode = self::MODE_REPLACE;
+
+    /** Constructs a new decoder
+     *
+     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
+     *
+     * @param string $string The byte string to decode
+     * @param bool $fatal Whether to use MODE_FATAL_DEC rather than MODE_REPLACE
+     */
+    public function __construct(string $string, bool $fatal = false) {
+        $this->string = $string;
+        $this->lenByte = strlen($string);
+        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
+    }
+
+    /** Returns the current byte position of the decoder */
+    public function posByte(): int {
+        return $this->posByte;
+    }
+
+    /** Returns the current character position of the decoder */
+    public function posChar(): int {
+        return $this->posChar;
+    }
+
+    /** Seeks to the start of the string
+     *
+     * This is usually faster than using the seek method for the same purpose
+     */
+    public function rewind() {
+        $this->posByte = 0;
+        $this->posChar = 0;
+    }
+
+    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
+     *
+     * Fewer than $num characters are returned if the end of the string is reached first; a $num of zero or less yields the empty string
+     */
+    public function peekChar(int $num = 1): string {
+        $out = "";
+        $state = $this->stateSave();
+        try {
+            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
+                $out .= $b;
+            }
+        } finally {
+            // restore the position even if nextChar() throws
+            $this->stateApply($state);
+        }
+        return $out;
+    }
+
+    /** Retrieves the next $num code points from the string, without advancing the character pointer
+     *
+     * Fewer than $num code points are returned if the end of the string is reached first; a $num of zero or less yields an empty array
+     */
+    public function peekCode(int $num = 1): array {
+        $out = [];
+        $state = $this->stateSave();
+        try {
+            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
+                $out[] = $b;
+            }
+        } finally {
+            // restore the position even if nextCode() throws
+            $this->stateApply($state);
+        }
+        return $out;
+    }
+
+    /** Calculates the length of the string in code points
+     *
+     * Note that this may involve processing to the end of the string; the result is cached, so this happens at most once per decoder
+     *
+     * The decoder's position is left unchanged, including when decoding throws
+     */
+    public function len(): int {
+        if ($this->lenChar === null) {
+            $state = $this->stateSave();
+            try {
+                // consume the rest of the string; the character position at the end is the total length
+                while ($this->nextCode() !== false);
+                $this->lenChar = $this->posChar;
+            } finally {
+                // as in peekChar() and peekCode(), never leave the decoder stranded mid-string if nextCode() throws
+                $this->stateApply($state);
+            }
+        }
+        return $this->lenChar;
+    }
+
+    /** Generates an iterator which steps through each character in the string
+     *
+     * Each key is the character position of the yielded character; iterating advances the decoder
+     */
+    public function chars(): \Generator {
+        while (($c = $this->nextChar()) !== "") {
+            // posChar already points past the character just read
+            yield ($this->posChar - 1) => $c;
+        }
+    }
+
+    /** Generates an iterator which steps through each code point in the string
+     *
+     * Each key is the character position of the yielded code point; iterating advances the decoder
+     */
+    public function codes(): \Generator {
+        while (($c = $this->nextCode()) !== false) {
+            // posChar already points past the code point just read
+            yield ($this->posChar - 1) => $c;
+        }
+    }
+
+    /** Returns a copy of the decoder's state to keep in memory
+     *
+     * The keys of the returned array are property names, as required by stateApply()
+     */
+    protected function stateSave(): array {
+        return [
+            'posChar' => $this->posChar,
+            'posByte' => $this->posByte,
+        ];
+    }
+
+    /** Sets the decoder's state to the values specified
+     *
+     * @param array $state Property names mapped to the values to assign, as produced by stateSave()
+     */
+    protected function stateApply(array $state) {
+        foreach ($state as $key => $value) {
+            $this->$key = $value;
+        }
+    }
+
+    /** Handles decoding and encoding errors
+     *
+     * @param int $mode One of the MODE_* constants
+     * @param mixed $data Context for the error: the code point for MODE_HTML and MODE_FATAL_ENC, or a [character offset, byte offset] pair for MODE_FATAL_DEC
+     * @return int|string|null Null for MODE_NULL, 0xFFFD for MODE_REPLACE, or a character reference string for MODE_HTML
+     * @throws DecoderException For MODE_FATAL_DEC, or when $mode is not a known mode
+     * @throws EncoderException For MODE_FATAL_ENC
+     */
+    protected static function err(int $mode, $data = null) {
+        switch ($mode) {
+            case self::MODE_NULL:
+                // used internally during backward seeking
+                return null;
+            case self::MODE_REPLACE:
+                // standard "replace" mode
+                return 0xFFFD;
+            case self::MODE_HTML: // @codeCoverageIgnore
+                // the "html" replacement mode; not applicable to Unicode transformation formats
+                return "&#".(string) $data.";"; // @codeCoverageIgnore
+            case self::MODE_FATAL_DEC:
+                // fatal replacement mode for decoders
+                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
+            case self::MODE_FATAL_ENC: // @codeCoverageIgnore
+                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
+                // NOTE(review): E_INVALID_BYTE is reused here although the failure concerns a code point -- confirm whether a different code is intended
+                throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore
+            default:
+                // indicative of internal bug; should never be triggered
+                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
+        }
+    }
+}
--- a/lib/Encoding/StatefulEncoding.php
+++ b/lib/Encoding/StatefulEncoding.php
@ -0,0 +1,18 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Intl\Encoding;
+
+/** An encoding whose encoder takes a whole sequence of code points in one call
+ *
+ * NOTE(review): presumably the sequence is required because encoder state carries over from one code point to the next -- confirm against implementations
+ */
+interface StatefulEncoding extends Encoding {
+
+    /** Returns the encoding of the code points in $codePoints as a byte string
+     *
+     * If any code point is less than 0 or greater than 1114111 (0x10FFFF), an exception is thrown
+     *
+     * If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
+     *
+     * @param array $codePoints The code points to encode
+     * @param bool $fatal Whether unencodable code points raise an exception rather than being replaced
+     */
+    public static function encode(array $codePoints, bool $fatal = true): string;
+}
--- a/lib/Encoding/StatelessEncoding.php
+++ b/lib/Encoding/StatelessEncoding.php
@ -0,0 +1,18 @@
+<?php
+/** @license MIT
+ * Copyright 2018 J. King et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace MensBeam\Intl\Encoding;
+
+/** An encoding whose encoder takes one code point per call */
+interface StatelessEncoding extends Encoding {
+
+    /** Returns the encoding of $codePoint as a byte string
+     *
+     * If $codePoint is less than 0 or greater than 1114111 (0x10FFFF), an exception is thrown
+     *
+     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
+     *
+     * @param int $codePoint The code point to encode
+     * @param bool $fatal Whether an unencodable code point raises an exception rather than being replaced
+     */
+    public static function encode(int $codePoint, bool $fatal = true): string;
+}
--- a/lib/Encoding/UTF8.php
+++ b/lib/Encoding/UTF8.php
@ -6,47 +6,12 @@
 declare(strict_types=1);
 namespace MensBeam\Intl\Encoding;

-class UTF8 {
-    const MODE_NULL = 0;
-    const MODE_REPLACE = 1;
-    const MODE_HTML = 2;
-    const MODE_FATAL_DEC = 3;
-    const MODE_FATAL_ENC = 4;
-
-    const E_INVALID_CODE_POINT = 1;
-    const E_INVALID_BYTE = 2;
-    const E_INVALID_MODE = 3;
+class UTF8 implements StatelessEncoding {
+    use GenericEncoding;

    const NAME = "UTF-8";
    const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];

-    protected $string;
-    protected $posByte = 0;
-    protected $posChar = 0;
-    protected $lenByte = null;
-    protected $lenChar = null;
-    protected $errMode = self::MODE_REPLACE;
-
-    /** Constructs a new decoder
-     * 
-     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
-     */
-    public function __construct(string $string, bool $fatal = false) {
-        $this->string = $string;
-        $this->lenByte = strlen($string);
-        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
-    }
-
-    /** Returns the current byte position of the decoder */
-    public function posByte(): int {
-        return $this->posByte;
-    }
-
-    /** Returns the current character position of the decoder */
-    public function posChar(): int {
-        return $this->posChar;
-    }
-
    /** Retrieve the next character in the string, in UTF-8 encoding
     *
     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
@ -194,71 +159,6 @@ class UTF8 {
        }
    }

-    /** Seeks to the start of the string
-     *
-     * This is usually faster than using the seek method for the same purpose
-    */
-    public function rewind() {
-        $this->posByte = 0;
-        $this->posChar = 0;
-    }
-
-    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
-    public function peekChar(int $num = 1): string {
-        $out = "";
-        $state = $this->stateSave();
-        try {
-            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
-                $out .= $b;
-            }
-        } finally {
-            $this->stateApply($state);
-        }
-        return $out;
-    }
-
-    /** Retrieves the next $num code points from the string, without advancing the character pointer */
-    public function peekCode(int $num = 1): array {
-        $out = [];
-        $state = $this->stateSave();
-        try {
-            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
-                $out[] = $b;
-            }
-        } finally {
-            $this->stateApply($state);
-        }
-        return $out;
-    }
-
-    /** Calculates the length of the string in code points
-     *
-     * Note that this may involve processing to the end of the string
-    */
-    public function len(): int {
-        return $this->lenChar ?? (function() {
-            $state = $this->stateSave();
-            while ($this->nextCode() !== false);
-            $this->lenChar = $this->posChar;
-            $this->stateApply($state);
-            return $this->lenChar;
-        })();
-    }
-
-    /** Generates an iterator which steps through each character in the string */
-    public function chars(): \Generator {
-        while (($c = $this->nextChar()) !== "") {
-            yield ($this->posChar - 1) => $c;
-        }
-    }
-
-    /** Generates an iterator which steps through each code point in the string  */
-    public function codes(): \Generator {
-        while (($c = $this->nextCode()) !== false) {
-            yield ($this->posChar - 1) => $c;
-        }
-    }
-
    /** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
    protected function sync(int $pos) {
        $b = ord(@$this->string[$pos]);
@ -280,43 +180,4 @@ class UTF8 {
            }
        }
    }
-
-    /** Returns a copy of the decoder's state to keep in memory */
-    protected function stateSave(): array {
-        return [
-            'posChar' => $this->posChar,
-            'posByte' => $this->posByte,
-        ];
-    }
-
-    /** Sets the decoder's state to the values specified */
-    protected function stateApply(array $state) {
-        foreach ($state as $key => $value) {
-            $this->$key = $value;
-        }
-    }
-
-    /** Handles decoding and encoding errors */
-    protected static function err(int $mode, $data = null) {
-        switch ($mode) {
-            case self::MODE_NULL:
-                // used internally during backward seeking
-                return null;
-            case self::MODE_REPLACE:
-                // standard "replace" mode
-                return 0xFFFD;
-            case self::MODE_HTML: // @codeCoverageIgnore
-                // the "html" replacement mode; not applicable to Unicode transformation formats
-                return "&#".(string) $data.";"; // @codeCoverageIgnore
-            case self::MODE_FATAL_DEC:
-                // fatal replacement mode for decoders
-                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
-            case self::MODE_FATAL_ENC: // @codeCoverageIgnore
-                // fatal replacement mode for decoders; not applicable to Unicode transformation formats
-                throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore
-            default:
-                // indicative of internal bug; should never be triggered
-                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
-        }
-    }
 }