J. King
6 years ago
5 changed files with 256 additions and 141 deletions
@ -0,0 +1,77 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface Encoding {
    /** Error mode which yields no replacement at all; GenericEncoding::err() returns null for it and notes it is used internally during backward seeking */
    const MODE_NULL = 0;
    /** Error mode which substitutes the replacement character U+FFFD for invalid input */
    const MODE_REPLACE = 1;
    /** Error mode which substitutes an HTML numeric character reference (applies to encoding) */
    const MODE_HTML = 2;
    /** Error mode which makes decoding errors raise an exception */
    const MODE_FATAL_DEC = 3;
    /** Error mode which makes encoding errors raise an exception */
    const MODE_FATAL_ENC = 4;

    /** Exception code; presumably signals a code point outside the valid range (confirm against the implementations) */
    const E_INVALID_CODE_POINT = 1;
    /** Exception code signalling an invalid code sequence in the input */
    const E_INVALID_BYTE = 2;
    /** Exception code signalling an unrecognised error mode */
    const E_INVALID_MODE = 3;

    /** Creates a decoder for the supplied string
     *
     * @param string $string The encoded string to decode
     * @param bool $fatal Whether an invalid code sequence should raise an exception (true) or be substituted with a replacement character (false)
     */
    public function __construct(string $string, bool $fatal = false);

    /** Reports the decoder's current position, counted in bytes */
    public function posByte(): int;

    /** Reports the decoder's current position, counted in characters */
    public function posChar(): int;

    /** Fetches the next character of the string as UTF-8
     *
     * @return string The character (possibly a replacement character), or the empty string once the end of the string has been reached
     */
    public function nextChar(): string;

    /** Decodes the next character of the string into its code point number
     *
     * @return int|bool The code point, or false once the end of the string has been reached
     */
    public function nextCode();

    /** Moves $distance characters through the string
     *
     * A negative $distance moves backwards instead of forwards
     *
     * @param int $distance The number of characters to move by
     * @return int The number of requested characters which could not be covered because the end (or beginning) of the string was reached first
     */
    public function seek(int $distance): int;

    /** Returns to the start of the string
     *
     * This is usually faster than seeking backwards to achieve the same result
     */
    public function rewind();

    /** Reads the next $num characters as UTF-8 without moving the character pointer
     *
     * @param int $num The maximum number of characters to read
     */
    public function peekChar(int $num = 1): string;

    /** Reads the next $num code points without moving the character pointer
     *
     * @param int $num The maximum number of code points to read
     */
    public function peekCode(int $num = 1): array;

    /** Reports the length of the string in code points
     *
     * Determining this may require processing the string through to its end
     */
    public function len(): int;

    /** Produces an iterator over each character of the string */
    public function chars(): \Generator;

    /** Produces an iterator over each code point of the string */
    public function codes(): \Generator;
}
@ -0,0 +1,141 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
trait GenericEncoding {
    /** The encoded string being decoded, as passed to the constructor */
    protected $string;
    /** The current position in the string, in bytes */
    protected $posByte = 0;
    /** The current position in the string, in characters */
    protected $posChar = 0;
    /** The length of the string in bytes; set by the constructor */
    protected $lenByte = null;
    /** The length of the string in code points; null until computed by len() */
    protected $lenChar = null;
    /** The error mode applied when decoding; one of the MODE_* constants */
    protected $errMode = self::MODE_REPLACE;

    /** Constructs a new decoder
     *
     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
     *
     * @param string $string The encoded string to decode
     * @param bool $fatal Whether to use the fatal decoding error mode rather than the replacement mode
     */
    public function __construct(string $string, bool $fatal = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
    }

    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

    /** Seeks to the start of the string
     *
     * This is usually faster than using the seek method for the same purpose
     */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
     *
     * Fewer than $num characters are returned if the end of the string is reached first
     *
     * @param int $num The maximum number of characters to retrieve
     */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextChar()) !== "") {
                $out .= $b;
            }
        } finally {
            // restore the position even if decoding throws
            $this->stateApply($state);
        }
        return $out;
    }

    /** Retrieves the next $num code points from the string, without advancing the character pointer
     *
     * Fewer than $num code points are returned if the end of the string is reached first
     *
     * @param int $num The maximum number of code points to retrieve
     */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
        try {
            while ($num-- > 0 && ($b = $this->nextCode()) !== false) {
                $out[] = $b;
            }
        } finally {
            // restore the position even if decoding throws
            $this->stateApply($state);
        }
        return $out;
    }

    /** Calculates the length of the string in code points
     *
     * Note that this may involve processing to the end of the string; the result is cached, so only the first successful call pays that cost
     *
     * If decoding throws part-way through, the decoder's position is restored before the exception propagates and no length is cached
     */
    public function len(): int {
        if ($this->lenChar === null) {
            $state = $this->stateSave();
            try {
                // consume the remainder of the string; the final character position is the length
                while ($this->nextCode() !== false);
                $this->lenChar = $this->posChar;
            } finally {
                // as with the peek methods, never leave the decoder stranded mid-string
                $this->stateApply($state);
            }
        }
        return $this->lenChar;
    }

    /** Generates an iterator which steps through each character in the string
     *
     * Keys are the character position of each yielded character; iterating advances the decoder itself
     */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            // the position has already moved past the character, hence the adjustment
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Generates an iterator which steps through each code point in the string
     *
     * Keys are the character position of each yielded code point; iterating advances the decoder itself
     */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            // the position has already moved past the character, hence the adjustment
            yield ($this->posChar - 1) => $c;
        }
    }

    /** Returns a copy of the decoder's state to keep in memory
     *
     * @return array The character and byte positions, keyed by property name
     */
    protected function stateSave(): array {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
        ];
    }

    /** Sets the decoder's state to the values specified
     *
     * @param array $state An array of property values keyed by property name, as produced by stateSave()
     */
    protected function stateApply(array $state) {
        foreach ($state as $key => $value) {
            $this->$key = $value;
        }
    }

    /** Handles decoding and encoding errors
     *
     * @param int $mode One of the MODE_* constants
     * @param mixed $data For MODE_FATAL_DEC, an array holding the character offset and byte offset of the error; for MODE_HTML and MODE_FATAL_ENC, the offending code point
     * @return int|string|null Null for MODE_NULL, the code point 0xFFFD for MODE_REPLACE, or a numeric character reference for MODE_HTML
     * @throws DecoderException for MODE_FATAL_DEC and for unrecognised modes
     * @throws EncoderException for MODE_FATAL_ENC
     */
    protected static function err(int $mode, $data = null) {
        switch ($mode) {
            case self::MODE_NULL:
                // used internally during backward seeking
                return null;
            case self::MODE_REPLACE:
                // standard "replace" mode
                return 0xFFFD;
            case self::MODE_HTML: // @codeCoverageIgnore
                // the "html" replacement mode; not applicable to Unicode transformation formats
                return "&#".(string) $data.";"; // @codeCoverageIgnore
            case self::MODE_FATAL_DEC:
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC: // @codeCoverageIgnore
                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                // NOTE(review): this reuses E_INVALID_BYTE although the failure concerns a code point; left unchanged because callers may rely on the code
                throw new EncoderException("Code point $data not available in target encoding", self::E_INVALID_BYTE); // @codeCoverageIgnore
            default:
                // indicative of internal bug; should never be triggered
                throw new DecoderException("Invalid replacement mode {$mode}", self::E_INVALID_MODE); // @codeCoverageIgnore
        }
    }
}
@ -0,0 +1,18 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface StatefulEncoding extends Encoding {
    /** Returns the encoding of the sequence $codePoints as a byte string
     *
     * If any code point is less than 0 or greater than 1114111, an exception is thrown
     *
     * If $fatal is true, an exception will be thrown if a code point cannot be encoded into a character; otherwise HTML character references will be substituted
     *
     * @param array $codePoints The code points to encode, in order; presumably a list of integers, though the signature does not enforce the element type
     * @param bool $fatal Whether unencodable code points raise an exception (true) or are replaced by HTML character references (false)
     * @return string The encoded byte string
     */
    public static function encode(array $codePoints, bool $fatal = true): string;
}
@ -0,0 +1,18 @@ |
|||||
|
<?php |
||||
|
/** @license MIT |
||||
|
* Copyright 2018 J. King et al. |
||||
|
* See LICENSE and AUTHORS files for details */ |
||||
|
|
||||
|
declare(strict_types=1); |
||||
|
namespace MensBeam\Intl\Encoding; |
||||
|
|
||||
|
interface StatelessEncoding extends Encoding {
    /** Encodes a single code point into its byte-string representation
     *
     * An exception is thrown when $codePoint lies outside the range 0 to 1114111 inclusive
     *
     * @param int $codePoint The code point to encode
     * @param bool $fatal Whether a code point which cannot be encoded into a character raises an exception (true) or is replaced by an HTML character reference (false)
     * @return string The encoded byte string
     */
    public static function encode(int $codePoint, bool $fatal = true): string;
}
Loading…
Reference in new issue