Browse Source

Fill out API documentation

multi-byte
J. King 4 years ago
parent
commit
600379a4dd
  1. 35
      lib/Encoding.php
  2. 5
      lib/Encoding/AbstractEncoding.php
  3. 4
      lib/Encoding/Big5.php
  4. 5
      lib/Encoding/Coder.php
  5. 2
      lib/Encoding/Decoder.php
  6. 8
      lib/Encoding/EUCJP.php
  7. 2
      lib/Encoding/EUCKR.php
  8. 21
      lib/Encoding/Encoder.php
  9. 2
      lib/Encoding/GBCommon.php
  10. 21
      lib/Encoding/ISO2022JP.php
  11. 8
      lib/Encoding/ModalCoder.php

35
lib/Encoding.php

File diff suppressed because one or more lines are too long

5
lib/Encoding/AbstractEncoding.php

@ -34,6 +34,11 @@ abstract class AbstractEncoding implements Decoder {
public $posErr = 0;
/** Seeks backwards through the string the specified number of characters.
* If the beginning of the string is reached before the requested number
* of characters has been skipped over, the number of remaining characters
* is returned.
*/
abstract protected function seekBack(int $distance): int;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {

4
lib/Encoding/Big5.php

File diff suppressed because one or more lines are too long

5
lib/Encoding/Coder.php

@ -13,9 +13,8 @@ interface Coder {
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
* @param int $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown
* @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
*/
public static function encode(int $codePoint, bool $fatal = true): string;
}

2
lib/Encoding/Decoder.php

@ -14,6 +14,8 @@ interface Decoder {
const E_INVALID_BYTE = 2;
/** Constructs a new decoder
*
* @param string $string The string to decode
* @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
* @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
*/

8
lib/Encoding/EUCJP.php

File diff suppressed because one or more lines are too long

2
lib/Encoding/EUCKR.php

File diff suppressed because one or more lines are too long

21
lib/Encoding/Encoder.php

@ -17,6 +17,13 @@ class Encoder {
protected $fatal = true;
protected $mode = self::MODE_ASCII;
/** Constructs a new encoder for the specified $label
*
* @param string $label One of the encoding labels listed in the specification e.g. "utf-8", "Latin1", "shift_JIS"
* @param bool $fatal If true (the default) exceptions will be thrown when a character cannot be represented in the target encoding; if false HTML character references will be substituted instead
*
* @see https://encoding.spec.whatwg.org#names-and-labels
*/
public function __construct(string $label, bool $fatal = true) {
$l = Matcher::matchLabel($label);
if (!$l || !$l['encoder']) {
@ -27,6 +34,10 @@ class Encoder {
}
}
/** Encodes a series of code point numbers into a string
*
* @param iterable $codePoints An iterable set of integers representing code points in the Unicode range
*/
public function encode(iterable $codePoints): string {
$out = "";
switch ($this->name) {
@ -220,6 +231,12 @@ class Encoder {
return $out;
}
/** Encodes a single character into a string
*
* When using this method to encode a string, the finalize() method should be called to terminate the string
*
* @param int $codePoint An integer representing the Unicode code point number to encode
*/
public function encodeChar(int $codePoint): string {
switch ($this->name) {
case "UTF-8":
@ -299,6 +316,10 @@ class Encoder {
}
} // @codeCoverageIgnore
/** Finalizes a string, returning any terminal bytes to append to the output
*
* For the ISO-2022-JP encoding, this method must be called after the last character is encoded to correctly encode a string; for other encodings this is a no-op
*
* NOTE(review): the call below is unconditional; presumably it yields an empty string while the encoder mode is still ASCII — confirm against ISO2022JP::encode
*/
public function finalize(): string {
// A null code point signals end-of-file to the modal encoder
$terminator = ISO2022JP::encode(null, $this->fatal, $this->mode);
return $terminator;
}

2
lib/Encoding/GBCommon.php

@ -12,6 +12,7 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
const TABLE_RANGES = [0,36,38,45,50,81,89,95,96,100,103,104,105,109,126,133,148,172,175,179,208,306,307,308,309,310,311,312,313,341,428,443,544,545,558,741,742,749,750,805,819,820,7922,7924,7925,7927,7934,7943,7944,7945,7950,8062,8148,8149,8152,8164,8174,8236,8240,8262,8264,8374,8380,8381,8384,8388,8390,8392,8393,8394,8396,8401,8406,8416,8419,8424,8437,8439,8445,8482,8485,8496,8521,8603,8936,8946,9046,9050,9063,9066,9076,9092,9100,9108,9111,9113,9131,9162,9164,9218,9219,11329,11331,11334,11336,11346,11361,11363,11366,11370,11372,11375,11389,11682,11686,11687,11692,11694,11714,11716,11723,11725,11730,11736,11982,11989,12102,12336,12348,12350,12384,12393,12395,12397,12510,12553,12851,12962,12973,13738,13823,13919,13933,14080,14298,14585,14698,15583,15847,16318,16434,16438,16481,16729,17102,17122,17315,17320,17402,17418,17859,17909,17911,17915,17916,17936,17939,17961,18664,18703,18814,18962,19043,33469,33470,33471,33484,33485,33490,33497,33501,33505,33513,33520,33536,33550,37845,37921,37948,38029,38038,38064,38065,38066,38069,38075,38076,38078,39108,39109,39113,39114,39115,39116,39265,39394,39420,189000,1237576];
const TABLE_OFFSETS = [128,165,169,178,184,216,226,235,238,244,248,251,253,258,276,284,300,325,329,334,364,463,465,467,469,471,473,475,477,506,594,610,712,716,730,930,938,962,970,1026,1104,1106,8209,8215,8218,8222,8231,8241,8244,8246,8252,8365,8452,8454,8458,8471,8482,8556,8570,8596,8602,8713,8720,8722,8726,8731,8737,8740,8742,8748,8751,8760,8766,8777,8781,8787,8802,8808,8816,8854,8858,8870,8896,8979,9322,9372,9548,9588,9616,9622,9634,9652,9662,9672,9676,9680,9702,9735,9738,9793,9795,11906,11909,11913,11917,11928,11944,11947,11951,11956,11960,11964,11979,12284,12292,12312,12319,12330,12351,12436,12447,12535,12543,12586,12842,12850,12964,13200,13215,13218,13253,13263,13267,13270,13384,13428,13727,13839,13851,14617,14703,14801,14816,14964,15183,15471,15585,16471,16736,17208,17325,17330,17374,17623,17997,18018,18212,18218,18301,18318,18760,18811,18814,18820,18823,18844,18848,18872,19576,19620,19738,19887,40870,59244,59336,59367,59413,59417,59423,59431,59437,59443,59452,59460,59478,59493,63789,63866,63894,63976,63986,64016,64018,64021,64025,64034,64037,64042,65074,65093,65107,65112,65127,65132,65375,65510,null,65536,1114112];
/** @var array $pointerCache A cached result of flipping the pointer-to-code-point table */
protected static $pointerCache;
public function nextCode() {
@ -136,7 +137,6 @@ abstract class GBCommon extends AbstractEncoding implements Coder, Decoder {
}
}
/** Implements backward seeking $distance characters */
protected function seekBack(int $distance): int {
while ($distance > 0 && $this->posByte > 0) {
$distance--;

21
lib/Encoding/ISO2022JP.php

@ -23,16 +23,21 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
const ESCAPE_START_STATE = 5;
const ESCAPE_STATE = 6;
/** @var array $modeStack The stack of previous decoding modes and their effective byte positions; the current mode is kept off the stack */
protected $modeStack = [];
/** @var int $mode The current decoding mode, a subset of the possible states defined in the specification */
protected $mode = self::ASCII_STATE;
/** @var int $modeMark The byte position marking the position where the current mode first applied; it is the position of the first byte AFTER the escape sequence */
protected $modeMark = \PHP_INT_MIN;
protected $modeStack = [];
protected $dirtyEOF = 0;
/** @var bool $trailingEscape Whether the string ends in a valid escape sequence */
protected $trailingEscape = false;
/** @var array $pointerCache A cached result of flipping the pointer-to-code-point table */
protected static $pointerCache;
public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
parent::__construct($string, $fatal, $allowSurrogates);
$this->stateProps[] = "dirtyEOF";
$this->stateProps[] = "trailingEscape";
}
public function nextChar(): string {
@ -130,7 +135,7 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
unset($lead);
// if we're at the end of the string, mark the string as dirty
if ($this->posByte === $this->lenByte) {
$this->dirtyEOF = 3;
$this->trailingEscape = true;
}
continue;
}
@ -146,10 +151,10 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
}
protected function seekBack(int $distance): int {
if ($this->dirtyEOF && $this->posByte === $this->lenByte) {
if ($this->trailingEscape && $this->posByte === $this->lenByte) {
list($this->modeMark, $this->mode) = array_pop($this->modeStack);
$this->posByte -= $this->dirtyEOF;
$this->dirtyEOF = 0;
$this->posByte -= 3;
$this->trailingEscape = false;
}
while ($distance > 0 && $this->posByte > 0) {
$this->posChar--;
@ -192,7 +197,7 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
$this->modeStack = [];
$this->modeMark = \PHP_INT_MIN;
$this->mode = self::ASCII_STATE;
$this->dirtyEOF = 0;
$this->trailingEscape = false;
parent::rewind();
}

8
lib/Encoding/ModalCoder.php

@ -13,9 +13,9 @@ interface ModalCoder {
/** Returns the encoding of $codePoint as a byte string
*
* If $codePoint is less than 0 or greater than 1114111, an exception is thrown; if $codePoint is null this signals end-of-file
*
* If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
* @param int $codePoint The Unicode code point to encode. If less than 0 or greater than 1114111, an exception is thrown; if $codePoint is null this signals end-of-file
* @param bool $fatal Whether an exception will be thrown if the code point cannot be encoded into a character; if false HTML character references will be substituted
* @param mixed &$mode A reference keeping track of the current encoder mode. An uninitialized variable should be passed on first invocation, and that variable used for further invocations.
*/
public static function encode(?int $codePoint, bool $fatal = true, &$mode = self::MODE_ASCII): string;
public static function encode(?int $codePoint, bool $fatal = true, &$mode = null): string;
}

Loading…
Cancel
Save