Browse Source

Define interfaces for encodings

labels
J. King 6 years ago
parent
commit
8c97b42303
  1. 77
      lib/Encoding/Encoding.php
  2. 141
      lib/Encoding/GenericEncoding.php
  3. 18
      lib/Encoding/StatefulEncoding.php
  4. 18
      lib/Encoding/StatelessEncoding.php
  5. 143
      lib/Encoding/UTF8.php

77
lib/Encoding/Encoding.php

@ -0,0 +1,77 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Contract implemented by string decoders
*
* Defines the error-handling modes and error codes used by implementations, and the methods for stepping through a string one character at a time
*/
interface Encoding {
/** Error mode in which errors yield null; the generic error handler notes that it is used internally during backward seeking */
const MODE_NULL = 0;
/** Error mode in which invalid input is substituted with the replacement character U+FFFD */
const MODE_REPLACE = 1;
/** Error mode in which a numeric character reference of the form "&#N;" is substituted */
const MODE_HTML = 2;
/** Error mode in which a DecoderException is thrown; selected when a decoder is constructed with $fatal set to true */
const MODE_FATAL_DEC = 3;
/** Error mode in which an EncoderException is thrown */
const MODE_FATAL_ENC = 4;
/** Error code; presumably signals a code point outside the valid range (not referenced in the code visible here; confirm against implementations) */
const E_INVALID_CODE_POINT = 1;
/** Error code attached to the exceptions thrown by the two fatal error modes */
const E_INVALID_BYTE = 2;
/** Error code attached to the exception thrown when an unrecognised error mode is requested */
const E_INVALID_MODE = 3;
/** Constructs a new decoder
*
* If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
*
* @param string $string The string to decode
* @param bool $fatal Whether invalid code sequences should raise an exception instead of being replaced
*/
public function __construct(string $string, bool $fatal = false);
/** Returns the current byte position of the decoder
*
* @return int
*/
public function posByte(): int;
/** Returns the current character position of the decoder
*
* @return int
*/
public function posChar(): int;
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
*
* @return string
*/
public function nextChar(): string;
/** Decodes the next character from the string and returns its code point number
*
* If the end of the string has been reached, false is returned
*
* @return int|false The code point, or false once the end of the string has been reached
*/
public function nextCode();
/** Advance $distance characters through the string
*
* If $distance is negative, the operation will be performed in reverse
*
* If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
*
* @param int $distance The number of characters to move; negative values move backward
* @return int The number of requested characters left over when the end (or beginning) of the string was reached
*/
public function seek(int $distance): int;
/** Seeks to the start of the string
*
* This is usually faster than using the seek method for the same purpose
*/
public function rewind();
/** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
*
* @param int $num The maximum number of characters to retrieve
* @return string The characters retrieved, concatenated
*/
public function peekChar(int $num = 1): string;
/** Retrieves the next $num code points from the string, without advancing the character pointer
*
* @param int $num The maximum number of code points to retrieve
* @return array The code points retrieved, in string order
*/
public function peekCode(int $num = 1): array;
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string
*
* @return int
*/
public function len(): int;
/** Generates an iterator which steps through each character in the string
*
* @return \Generator
*/
public function chars(): \Generator;
/** Generates an iterator which steps through each code point in the string
*
* @return \Generator
*/
public function codes(): \Generator;
}

141
lib/Encoding/GenericEncoding.php

@ -0,0 +1,141 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Shared implementation of the position tracking, peeking, length calculation and error handling of a decoder
 *
 * The using class must supply nextChar() and nextCode(), as well as the MODE_* and E_* constants referenced here (for instance by implementing the Encoding interface)
 */
trait GenericEncoding {
    /** @var string The byte string being decoded */
    protected $string;
    /** @var int The decoder's current offset into the string, in bytes */
    protected $posByte = 0;
    /** @var int The decoder's current offset into the string, in characters */
    protected $posChar = 0;
    /** @var int|null The length of the string in bytes; set by the constructor */
    protected $lenByte = null;
    /** @var int|null The length of the string in characters; null until computed by len() */
    protected $lenChar = null;
    /** @var int The error-handling mode (one of the MODE_* constants); set by the constructor */
    protected $errMode = self::MODE_REPLACE;

    /** Constructs a new decoder
     *
     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
     *
     * @param string $string The string to decode
     * @param bool $fatal Whether to select the fatal decoder error mode rather than the replacement mode
     */
    public function __construct(string $string, bool $fatal = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Seeks to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
     *
     * @param int $num The maximum number of characters to retrieve
     * @return string The characters retrieved, concatenated; shorter than requested if the end of the string is reached
     */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($char = $this->nextChar()) !== "") {
                $out .= $char;
            }
        } finally {
            // put the decoder back where it was even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Retrieves the next $num code points from the string, without advancing the character pointer
     *
     * @param int $num The maximum number of code points to retrieve
     * @return array The code points retrieved; shorter than requested if the end of the string is reached
     */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($code = $this->nextCode()) !== false) {
                $out[] = $code;
            }
        } finally {
            // put the decoder back where it was even if decoding threw
            $this->stateApply($state);
        }
        return $out;
    }

    /** Calculates the length of the string in code points
     *
     * Note that this may involve processing to the end of the string; the result is cached for subsequent calls
     *
     * Any exception raised by nextCode() while scanning propagates to the caller, but the decoder's position is restored first, consistent with peekChar() and peekCode()
     */
    public function len(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                // consume the remainder of the string; the character position at the end is the length
                while ($this->nextCode() !== false) {
                    // nothing to do: only the position matters
                }
                $this->lenChar = $this->posChar;
            } finally {
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Generates an iterator which steps through each character in the string
     *
     * Each key is the character position of the yielded character; iteration advances the decoder
     */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates an iterator which steps through each code point in the string
     *
     * Each key is the character position of the yielded code point; iteration advances the decoder
     */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Returns a copy of the decoder's state to keep in memory
     *
     * @return array The character and byte positions, keyed by property name
     */
    protected function stateSave(): array {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
        ];
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state Property values keyed by property name, as produced by stateSave()
     */
    protected function stateApply(array $state) {
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding and encoding errors
     *
     * @param int $mode One of the MODE_* constants
     * @param mixed $data Context for the error: for MODE_FATAL_DEC an array holding the character offset at index 0 and the byte offset at index 1; for MODE_HTML and MODE_FATAL_ENC the offending code point
     * @return int|string|null null for MODE_NULL, 0xFFFD for MODE_REPLACE, or a character reference string for MODE_HTML
     * @throws DecoderException for MODE_FATAL_DEC, or if $mode is not a recognised mode
     * @throws EncoderException for MODE_FATAL_ENC
     */
    protected static function err(int $mode, $data = null) {
        switch ($mode) {
            case self::MODE_NULL:
                // used internally during backward seeking
                return null;
            case self::MODE_REPLACE:
                // standard "replace" mode
                return 0xFFFD;
            case self::MODE_HTML: // @codeCoverageIgnore
                // the "html" replacement mode; not applicable to Unicode transformation formats
                return "&#".(string) $data.";"; // @codeCoverageIgnore
            case self::MODE_FATAL_DEC:
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC: // @codeCoverageIgnore
                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                // NOTE(review): the code passed is E_INVALID_BYTE although the failure concerns a code point; confirm whether this is intended
                throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore
            default:
                // indicative of internal bug; should never be triggered
                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
        }
    }
}

18
lib/Encoding/StatefulEncoding.php

@ -0,0 +1,18 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Encoding whose encoder accepts a whole sequence of code points in a single call */
interface StatefulEncoding extends Encoding {
/** Returns the encoding of the code points in $codePoints as a byte string
*
* If any of the code points is less than 0 or greater than 1114111 (0x10FFFF), an exception is thrown
*
* If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
*
* @param array $codePoints The code points to encode; presumably a list of integers, by analogy with StatelessEncoding::encode() (confirm against implementations)
* @param bool $fatal Whether an unencodable code point raises an exception instead of being substituted
* @return string The encoded byte string
*/
public static function encode(array $codePoints, bool $fatal = true): string;
}

18
lib/Encoding/StatelessEncoding.php

@ -0,0 +1,18 @@
<?php
/** @license MIT
* Copyright 2018 J. King et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
/** Encoding whose encoder accepts one code point per call */
interface StatelessEncoding extends Encoding {
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111 (0x10FFFF), an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
*
* @param int $codePoint The code point to encode
* @param bool $fatal Whether an unencodable code point raises an exception instead of being substituted
* @return string The encoded byte string
*/
public static function encode(int $codePoint, bool $fatal = true): string;
}

143
lib/Encoding/UTF8.php

@ -6,47 +6,12 @@
declare(strict_types=1);
namespace MensBeam\Intl\Encoding;
class UTF8 {
const MODE_NULL = 0;
const MODE_REPLACE = 1;
const MODE_HTML = 2;
const MODE_FATAL_DEC = 3;
const MODE_FATAL_ENC = 4;
const E_INVALID_CODE_POINT = 1;
const E_INVALID_BYTE = 2;
const E_INVALID_MODE = 3;
class UTF8 implements StatelessEncoding {
use GenericEncoding;
const NAME = "UTF-8";
const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];
protected $string;
protected $posByte = 0;
protected $posChar = 0;
protected $lenByte = null;
protected $lenChar = null;
protected $errMode = self::MODE_REPLACE;
/** Creates a decoder over the supplied byte string
*
* With $fatal set to true the decoder uses the fatal error mode, so an invalid code sequence results in an exception; otherwise it uses the replacement mode and substitutes replacement characters
*/
public function __construct(string $string, bool $fatal = false) {
    $this->string = $string;
    // byte length is recorded once up front
    $this->lenByte = strlen($this->string);
    if ($fatal) {
        $this->errMode = self::MODE_FATAL_DEC;
    } else {
        $this->errMode = self::MODE_REPLACE;
    }
}
/** Reports the decoder's current position within the string, measured in bytes */
public function posByte(): int {
    return $this->posByte;
}
/** Reports the decoder's current position within the string, measured in characters */
public function posChar(): int {
    return $this->posChar;
}
/** Retrieve the next character in the string, in UTF-8 encoding
*
* The returned character may be a replacement character, or the empty string if the end of the string has been reached
@ -194,71 +159,6 @@ class UTF8 {
}
}
/** Returns the decoder to the beginning of the string
*
* Both position counters are simply reset, which is usually faster than seeking backward
*/
public function rewind() {
    $this->posByte = $this->posChar = 0;
}
/** Looks ahead at up to $num characters (in UTF-8 encoding) while leaving the character pointer where it was */
public function peekChar(int $num = 1): string {
    $saved = $this->stateSave();
    $peeked = "";
    try {
        for ($remaining = $num; $remaining > 0; $remaining--) {
            $char = $this->nextChar();
            if ($char === "") {
                // end of string reached before $num characters were read
                break;
            }
            $peeked .= $char;
        }
    } finally {
        // the position is restored whether or not decoding threw
        $this->stateApply($saved);
    }
    return $peeked;
}
/** Looks ahead at up to $num code points while leaving the character pointer where it was */
public function peekCode(int $num = 1): array {
    $saved = $this->stateSave();
    $peeked = [];
    try {
        for ($remaining = $num; $remaining > 0; $remaining--) {
            $code = $this->nextCode();
            if ($code === false) {
                // end of string reached before $num code points were read
                break;
            }
            $peeked[] = $code;
        }
    } finally {
        // the position is restored whether or not decoding threw
        $this->stateApply($saved);
    }
    return $peeked;
}
/** Calculates the length of the string in code points
*
* Note that this may involve processing to the end of the string; the result is cached for subsequent calls
*
* Any exception raised by nextCode() while scanning propagates to the caller, but the decoder's position is restored first, consistent with peekChar() and peekCode()
*/
public function len(): int {
    if ($this->lenChar === null) {
        $state = $this->stateSave();
        try {
            // consume the remainder of the string; the character position at the end is the length
            while ($this->nextCode() !== false) {
                // nothing to do: only the position matters
            }
            $this->lenChar = $this->posChar;
        } finally {
            $this->stateApply($state);
        }
    }
    return $this->lenChar;
}
/** Yields every remaining character of the string in turn, keyed by its character position */
public function chars(): \Generator {
    for ($char = $this->nextChar(); $char !== ""; $char = $this->nextChar()) {
        // the pointer has already moved past the character, hence the offset of one
        yield ($this->posChar - 1) => $char;
    }
}
/** Yields every remaining code point of the string in turn, keyed by its character position */
public function codes(): \Generator {
    while (true) {
        $code = $this->nextCode();
        if ($code === false) {
            return;
        }
        // the pointer has already moved past the code point, hence the offset of one
        yield ($this->posChar - 1) => $code;
    }
}
/** Synchronize to the byte offset of the start of the nearest character at or before byte offset $pos */
protected function sync(int $pos) {
$b = ord(@$this->string[$pos]);
@ -280,43 +180,4 @@ class UTF8 {
}
}
}
/** Captures the decoder's character and byte positions so they can be restored later */
protected function stateSave(): array {
    $snapshot = [];
    $snapshot['posChar'] = $this->posChar;
    $snapshot['posByte'] = $this->posByte;
    return $snapshot;
}
/** Restores decoder properties from an array keyed by property name */
protected function stateApply(array $state) {
    foreach ($state as $property => $saved) {
        $this->{$property} = $saved;
    }
}
/** Handles decoding and encoding errors
*
* Depending on $mode this either returns a substitute value or throws an exception
*
* @param int $mode One of the MODE_* constants
* @param mixed $data Context for the error: for MODE_FATAL_DEC an array holding the character offset at index 0 and the byte offset at index 1; for MODE_HTML and MODE_FATAL_ENC the offending code point
* @return int|string|null null for MODE_NULL, 0xFFFD for MODE_REPLACE, or a character reference string for MODE_HTML
* @throws DecoderException for MODE_FATAL_DEC, or if $mode is not a recognised mode
* @throws EncoderException for MODE_FATAL_ENC
*/
protected static function err(int $mode, $data = null) {
switch ($mode) {
case self::MODE_NULL:
// used internally during backward seeking
return null;
case self::MODE_REPLACE:
// standard "replace" mode
return 0xFFFD;
case self::MODE_HTML: // @codeCoverageIgnore
// the "html" replacement mode; not applicable to Unicode transformation formats
return "&#".(string) $data.";"; // @codeCoverageIgnore
case self::MODE_FATAL_DEC:
// fatal replacement mode for decoders
throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
case self::MODE_FATAL_ENC: // @codeCoverageIgnore
// fatal replacement mode for encoders; not applicable to Unicode transformation formats
// NOTE(review): the code passed is E_INVALID_BYTE although the failure concerns a code point; confirm whether this is intended
throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore
default:
// indicative of internal bug; should never be triggered
throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
}
}
}

Loading…
Cancel
Save