From bf81571ce4615d35b58ba06a7370e878842455b3 Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Fri, 12 Mar 2021 18:29:07 -0500
Subject: [PATCH] Prototype strspn equivalent

---
 lib/Encoding/AbstractEncoding.php | 28 +++++++++++++++++++++
 lib/Encoding/Decoder.php          | 22 ++++++++++++++++
 lib/Encoding/ISO2022JP.php        | 42 +++++++++++++++++++++++++++++++
 lib/Encoding/Replacement.php      |  8 ++++++
 lib/Encoding/UTF16.php            | 38 ++++++++++++++++++++++++++++
 5 files changed, 138 insertions(+)

diff --git a/lib/Encoding/AbstractEncoding.php b/lib/Encoding/AbstractEncoding.php
index c5fc06f..c5e28bf 100644
--- a/lib/Encoding/AbstractEncoding.php
+++ b/lib/Encoding/AbstractEncoding.php
@@ -11,6 +11,9 @@ abstract class AbstractEncoding implements Decoder {
     protected const MODE_REPLACE = 1;
     protected const MODE_FATAL = 2;
 
+    /** Every byte value from 0x80 to 0xFF, i.e. all the bytes which can never be an ASCII character */
+    protected const HIGH_BYTES = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF";
+
     /** @var string $string The string being decoded */
     protected $string;
     /** @var int $posByte The current byte position in the string */
@@ -162,6 +165,31 @@ abstract class AbstractEncoding implements Decoder {
         }
     }
 
+    public function asciiSpan(string $mask, int $length = null): string {
+        // non-ASCII bytes can never be part of an ASCII span, so they are stripped from the mask
+        $mask = preg_replace('/[\x80-\xFF]/', "", $mask);
+        // strspn() only accepts a null length from PHP 8 onwards, so "no limit" is expressed by omitting the argument
+        $len = ($length === null)
+            ? strspn($this->string, $mask, $this->posByte)
+            : strspn($this->string, $mask, $this->posByte, $length);
+        $out = substr($this->string, $this->posByte, $len);
+        $this->posByte += $len;
+        $this->posChar += $len;
+        return $out;
+    }
+
+    public function asciiSpanNot(string $mask, int $length = null): string {
+        // every non-ASCII byte is added to the stop set, so the span always ends before the first one
+        $mask .= self::HIGH_BYTES;
+        $len = ($length === null)
+            ? strcspn($this->string, $mask, $this->posByte)
+            : strcspn($this->string, $mask, $this->posByte, $length);
+        $out = substr($this->string, $this->posByte, $len);
+        $this->posByte += $len;
+        $this->posChar += $len;
+        return $out;
+    }
+
     /** Returns a copy of the decoder's state to keep in memory */
     protected function stateSave(): array {
         $out = ['errCount' => sizeof($this->errStack)];
diff --git a/lib/Encoding/Decoder.php b/lib/Encoding/Decoder.php
index 12274b3..0dc643c 100644
--- a/lib/Encoding/Decoder.php
+++ b/lib/Encoding/Decoder.php
@@ -80,4 +80,26 @@ interface Decoder {
 
     /** Generates an iterator which steps through each code point in the string */
     public function codes(): \Generator;
+
+    /** Fast-forwards through a span of ASCII characters matching the supplied mask, returning any consumed characters
+     *
+     * The mask must consist only of ASCII characters.
+     *
+     * Note that if the empty string is returned, this does not necessarily signal the end of the string
+     *
+     * @param string $mask The set of ASCII characters to match
+     * @param int|null $length The maximum number of characters to advance by; null imposes no limit
+     */
+    public function asciiSpan(string $mask, int $length = null): string;
+
+    /** Fast-forwards through a span of ASCII characters not matching the supplied mask, returning any consumed characters
+     *
+     * The mask must consist only of ASCII characters.
+     *
+     * Note that if the empty string is returned, this does not necessarily signal the end of the string
+     *
+     * @param string $mask The set of ASCII characters to not match
+     * @param int|null $length The maximum number of characters to advance by; null imposes no limit
+     */
+    public function asciiSpanNot(string $mask, int $length = null): string;
 }
diff --git a/lib/Encoding/ISO2022JP.php b/lib/Encoding/ISO2022JP.php
index 40ec71b..1ad84e9 100644
--- a/lib/Encoding/ISO2022JP.php
+++ b/lib/Encoding/ISO2022JP.php
@@ -184,6 +184,48 @@ class ISO2022JP extends AbstractEncoding implements ModalCoder, Decoder {
         return $distance;
     }
 
+    public function asciiSpan(string $mask, int $length = null): string {
+        if ($this->mode === self::ASCII_STATE) {
+            $exc = '/[\x0E\x0F\x1B\x80-\xFF]/';
+        } elseif ($this->mode === self::ROMAN_STATE) {
+            // in Roman mode 0x5C and 0x7E are excluded as well
+            $exc = '/[\x0E\x0F\x1B\x5C\x7E\x80-\xFF]/';
+        } else {
+            // in other modes ASCII characters are never returned
+            return "";
+        }
+        // bytes which may not be consumed in the current mode are removed from the mask
+        $mask = preg_replace($exc, "", $mask);
+        // strspn() only accepts a null length from PHP 8 onwards, so "no limit" is expressed by omitting the argument
+        $len = ($length === null)
+            ? strspn($this->string, $mask, $this->posByte)
+            : strspn($this->string, $mask, $this->posByte, $length);
+        $out = substr($this->string, $this->posByte, $len);
+        $this->posByte += $len;
+        $this->posChar += $len;
+        return $out;
+    }
+
+    public function asciiSpanNot(string $mask, int $length = null): string {
+        if ($this->mode === self::ASCII_STATE) {
+            $mask .= "\x0E\x0F\x1B";
+        } elseif ($this->mode === self::ROMAN_STATE) {
+            $mask .= "\x0E\x0F\x1B\x5C\x7E";
+        } else {
+            // in other modes ASCII characters are never returned
+            return "";
+        }
+        // the scan must also stop before any non-ASCII byte
+        $mask .= self::HIGH_BYTES;
+        $len = ($length === null)
+            ? strcspn($this->string, $mask, $this->posByte)
+            : strcspn($this->string, $mask, $this->posByte, $length);
+        $out = substr($this->string, $this->posByte, $len);
+        $this->posByte += $len;
+        $this->posChar += $len;
+        return $out;
+    }
+
     protected function stateSave(): array {
         $out = parent::stateSave();
         $out['modeCount'] = sizeof($this->modeStack);
diff --git a/lib/Encoding/Replacement.php b/lib/Encoding/Replacement.php
index 37c8b60..7684833 100644
--- a/lib/Encoding/Replacement.php
+++ b/lib/Encoding/Replacement.php
@@ -122,4 +122,12 @@ class Replacement implements Decoder {
             yield 0 => $this->nextCode();
         }
     }
+
+    public function asciiSpan(string $mask, int $length = null): string {
+        return "";
+    }
+
+    public function asciiSpanNot(string $mask, int $length = null): string {
+        return "";
+    }
 }
diff --git a/lib/Encoding/UTF16.php b/lib/Encoding/UTF16.php
index b01c756..e6876a5 100644
--- a/lib/Encoding/UTF16.php
+++ b/lib/Encoding/UTF16.php
@@ -84,6 +84,44 @@ abstract class UTF16 extends AbstractEncoding {
         }
     }
 
+    public function asciiSpan(string $mask, int $length = null): string {
+        return $this->asciiScan($mask, $length, true);
+    }
+
+    public function asciiSpanNot(string $mask, int $length = null): string {
+        return $this->asciiScan($mask, $length, false);
+    }
+
+    /** Consumes consecutive ASCII characters for as long as their membership in the mask equals $member
+     *
+     * UTF-16 has no single-byte ASCII characters, so each two-byte code unit is examined by hand.
+     *
+     * @param string $mask The set of ASCII characters to test against
+     * @param int|null $length The maximum number of characters to consume; null imposes no limit, and anything below 1 consumes nothing
+     * @param bool $member Whether consumed characters must be present in the mask (true) or absent from it (false)
+     */
+    private function asciiScan(string $mask, ?int $length, bool $member): string {
+        $out = "";
+        // the last byte position at which a complete two-byte code unit can start
+        $last = strlen($this->string) - 2;
+        // static:: rather than self:: so that a BE constant declared or overridden by a subclass is the one consulted
+        $hi = static::BE ? 0 : 1;
+        while ($this->posByte <= $last && ($length === null || strlen($out) < $length)) {
+            $c = $this->string[$this->posByte + 1 - $hi];
+            // an ASCII character has a zero high byte and a low byte below 0x80
+            if ($this->string[$this->posByte + $hi] !== "\x00" || ord($c) >= 0x80) {
+                break;
+            }
+            if ((strpos($mask, $c) !== false) !== $member) {
+                break;
+            }
+            $out .= $c;
+            $this->posByte += 2;
+            $this->posChar++;
+        }
+        return $out;
+    }
+
     /** Implements backward seeking $distance characters */
     protected function seekBack(int $distance): int {
         if ($this->dirtyEOF && $distance) {