posChar++; while (($b = @$this->string[$this->posByte++]) !== "") { $b = ord($b); if ($first === 0) { if ($b < 0x80) { return $b; } elseif ($b === 0x80) { return 0x20AC; } elseif ($b > 0x80 && $b < 0xFF) { $first = $b; continue; } else { return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } elseif ($second === 0) { if ($b > 0x2F && $b < 0x3A) { $second = $b; continue; } else { if (($b > 0x3A && $b < 0x7F) || ($b > 0x7F && $b < 0xFF)) { $offset = ($b < 0x7F) ? 0x40 : 0x41; $pointer = ($first - 0x81) * 190 + ($b - $offset); return self::TABLE_GBK[$pointer]; } elseif ($b < 0x80) { return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]); } else { return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } } elseif ($third === 0) { if ($b > 0x80 && $b < 0xFF) { $third = $b; continue; } else { $this->posByte -= 2; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } else { if ($b > 0x2F && $b < 0x3A) { // look up code point $pointer = (($first - 0x81) * (10 * 126 * 10)) + (($second - 0x30) * (10 * 126)) + (($third - 0x81) * 10) + $b - 0x30; if ($pointer === 7457) { return 0xE7C7; } for ($a = 1; $a < sizeof(self::TABLE_RANGES); $a++) { if ($pointer < self::TABLE_RANGES[$a]) { $offset = self::TABLE_RANGES[$a - 1]; $codePointOffset = self::TABLE_OFFSETS[$a - 1]; break; } } if (isset($codePointOffset)) { return $codePointOffset + $pointer - $offset; } else { return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } else { $this->posByte -= 3; return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]); } } } $this->posByte--; if (($first + $second + $third) == 0) { // clean EOF $this->posChar--; return false; } else { // dirty EOF; note how many bytes the last character had $this->dirtyEOF = ($third ? 3 : ($second ? 
2 : 1));
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

    /** Returns the encoding of $codePoint as a byte string
     *
     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
     *
     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
     *
     * @param int $codePoint The Unicode code point to encode
     * @param bool $fatal Selects the error mode handed to self::err() for unencodable code points: MODE_FATAL_ENC when true, MODE_HTML when false
     * @return string One byte for ASCII (and for U+20AC when static::GBK is set), two bytes for code points found in TABLE_GBK, otherwise four bytes; for unencodable code points, whatever self::err() returns
     * @throws EncoderException if $codePoint is outside the Unicode range
     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        // reverse (code point => pointer) view of the two-byte table; built on first use and then
        // reused, rather than flipping the whole table again for every encoded character
        static $pointerByCodePoint = null;

        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
        } elseif ($codePoint < 128) {
            // ASCII maps to itself
            return chr($codePoint);
        } elseif ($codePoint === 0xE5E5) {
            // this code point is always reported as unencodable
            return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
        } elseif (static::GBK && $codePoint === 0x20AC) {
            // in GBK mode U+20AC has a dedicated single-byte form
            return "\x80";
        }

        if ($pointerByCodePoint === null) {
            $pointerByCodePoint = array_flip(self::TABLE_GBK);
        }
        $pointer = $pointerByCodePoint[$codePoint] ?? null;
        if ($pointer !== null) {
            // two-byte form: 190 trail positions per lead byte; trail bytes skip over 0x7F
            $lead = (int) ($pointer / 190) + 0x81;
            $trail = $pointer % 190;
            $offset = ($trail < 0x3F) ? 0x40 : 0x41;
            return chr($lead).chr($trail + $offset);
        } elseif (static::GBK) {
            // GBK mode has no four-byte forms, so anything outside the table is unencodable
            return self::err($fatal ? self::MODE_FATAL_ENC : self::MODE_HTML, $codePoint);
        }

        // four-byte form: find the pointer via the ranges tables
        if ($codePoint === 0xE7C7) {
            // this code point has a fixed pointer rather than one derived from the ranges
            $pointer = 7457;
        } else {
            // find the last range whose first code point is not greater than $codePoint
            // NOTE(review): the scan only terminates if TABLE_OFFSETS ends with an entry greater than 0x10FFFF; confirm against the table definition
            $index = 0;
            while ($codePoint >= self::TABLE_OFFSETS[$index + 1]) {
                $index++;
            }
            $offset = self::TABLE_OFFSETS[$index];
            $pointer_offset = self::TABLE_RANGES[$index];
            $pointer = $pointer_offset + $codePoint - $offset;
        }
        // split the pointer into its four positional components
        $byte1 = (int) ($pointer / (10 * 126 * 10)) + 0x81;
        $pointer %= (10 * 126 * 10);
        $byte2 = (int) ($pointer / (10 * 126)) + 0x30;
        $pointer %= (10 * 126);
        $byte3 = (int) ($pointer / 10) + 0x81;
        $byte4 = ($pointer % 10) + 0x30;
        return chr($byte1).chr($byte2).chr($byte3).chr($byte4);
    }

    /** Implements backward seeking $distance characters
     *
     * Each iteration moves $this->posByte back to the start of the previous character and decrements $this->posChar. The return value is the part of $distance which could not be covered because the start of the string was reached
     */
    protected function seekBack(int $distance): int {
        while ($distance > 0 && $this->posByte > 0) {
            $distance--;
            $this->posChar--;
            if ($this->posByte == $this->lenByte && $this->dirtyEOF > 0) {
                // if we are at the end of the string and it did not terminate cleanly, go back the correct number of dirty bytes to seek through the last character
                $this->posByte -= $this->dirtyEOF;
                continue;
            }
            // go back one byte
            $b1 = ord(@$this->string[--$this->posByte]);
            if ($b1 < 0x30 || $b1 == 0x7F || $this->posByte == 0) {
                // these bytes never appear in sequences, and the first byte is necessarily the start of a sequence
                // the byte is a character
                continue;
            }
            // go back a second byte
            $b2 = ord(@$this->string[--$this->posByte]);
            if ($b2 < 0x81 || $b2 == 0xFF) {
                // these bytes never appear second-to-last in a sequence
                // the first byte was a character
                $this->posByte += 1;
                continue;
            } elseif ($b1 < 0x40 && $this->posByte < 2) {
                // byte values indicate a four-byte character, but there are insufficient bytes in the string
                // the first byte was a character
                $this->posByte += 1;
                continue;
            } elseif ($b1 > 0x39) {
                // the second byte is part of a two-byte sequence, but it's unclear if it's the lead or trail byte
                $start = $this->posByte + 2;
                // go back bytes until one which cannot be a lead byte, or the start of the string
                while ($this->posByte > 0 && !($b2 < 0x81 || $b2 == 0xFF)) {
                    $b2 = ord(@$this->string[--$this->posByte]);
                }
                // the byte we stopped on belongs to the ambiguous run only if it could itself be a lead byte
                // this has to be checked even when we stopped at offset 0: counting a non-lead first byte
                // (e.g. an ASCII character) would flip the parity below and leave the cursor mid-character
                if ($b2 < 0x81 || $b2 == 0xFF) {
                    $this->posByte++;
                }
                // if the number of ambiguous bytes is odd, the character is a single-byte character, otherwise it is double-byte
                $this->posByte = $start - (($start - $this->posByte) % 2 ? 1 : 2);
                continue;
            }
            // go back a third byte
            $b3 = ord(@$this->string[--$this->posByte]);
            if ($b3 > 0x39 || $b3 < 0x30) {
                // these bytes never appear in the second position of a four-byte sequence
                // the first byte was a character
                $this->posByte += 2;
                continue;
            }
            // go back a fourth byte
            $b4 = ord(@$this->string[--$this->posByte]);
            if ($b4 < 0x81 || $b4 == 0xFF) {
                // these bytes never appear first in a four-byte sequence
                // the first byte was a character
                $this->posByte += 3;
                continue;
            }
            // otherwise this is a four-byte character and posByte already points at its first byte
        }
        return $distance;
    }
}