Optionally allow surrogates

Also removed unnecessary docblocks
2019-12-18 14:57:54 -05:00 · 2019-12-18 14:57:54 -05:00 · 200a310f72
commit 200a310f72
parent 2e47fde774
18 changed files with 133 additions and 141 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+Version 0.6.0 (2019-12-18)
+==========================
+
+New features:
+- Added $allowSurrogates parameter to Encoding constructor
+- Added posErr public instance property to Encoding
+
 Version 0.5.0 (2019-12-13)
 ==========================

--- a/lib/Encoding/Big5.php
+++ b/lib/Encoding/Big5.php
@ -24,12 +24,6 @@ class Big5 implements StatelessEncoding {
    protected $bufferedCode = 0;


-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        $this->posChar++;
        if ($this->bufferedCode > 0) {
@ -70,8 +64,10 @@ class Big5 implements StatelessEncoding {
                    return $code;
                } else {
                    if ($b < 0x80) {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
                    } else {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
                    }
                }
@ -85,16 +81,11 @@ class Big5 implements StatelessEncoding {
        } else {
            // dirty EOF
            $this->dirtyEOF = 1;
+            $this->posErr = $this->posChar;
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

-    /** Returns the encoding of $codePoint as a byte string
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
-     *
-     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
-     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
--- a/lib/Encoding/EUCKR.php
+++ b/lib/Encoding/EUCKR.php
@ -27,12 +27,6 @@ class EUCKR implements StatelessEncoding {
    protected $dirtyEOF = 0;


-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        $this->posChar++;
        $lead = 0x00;
@ -42,6 +36,7 @@ class EUCKR implements StatelessEncoding {
                if ($b < 0x80) {
                    return $b;
                } elseif ($b == 0x80 || $b == 0xFF) {
+                    $this->posErr = $this->posChar;
                    return self::err($this->errMode, [$this->posChar -1, $this->posByte - 1]);
                } else {
                    $lead = $b;
@ -57,8 +52,10 @@ class EUCKR implements StatelessEncoding {
                    return $code;
                } else {
                    if ($b < 0x80) {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar -1, --$this->posByte - 1]);
                    } else {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar -1, $this->posByte - 2]);
                    }
                }
@ -72,16 +69,11 @@ class EUCKR implements StatelessEncoding {
        } else {
            // dirty EOF
            $this->dirtyEOF = 1;
+            $this->posErr = $this->posChar;
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

-    /** Returns the encoding of $codePoint as a byte string
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
-     *
-     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
-     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
--- a/lib/Encoding/Encoding.php
+++ b/lib/Encoding/Encoding.php
@ -19,10 +19,10 @@ interface Encoding {
    const E_UNAVAILABLE_CODE_POINT = 4;

    /** Constructs a new decoder
-     *
-     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
+     * @param bool $fatal If true, throw exceptions when encountering invalid input. If false, substitute U+FFFD REPLACEMENT CHARACTER instead
+     * @param bool $allowSurrogates If true, treats surrogate characters as valid input; this only affects UTF-8 and UTF-16 encodings
     */
-    public function __construct(string $string, bool $fatal = false);
+    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false);

    /** Returns the current byte position of the decoder */
    public function posByte(): int;
@ -40,15 +40,15 @@ interface Encoding {
     *
     * If the end of the string has been reached, false is returned
     *
-     * @return int|bool
+     * @return int|false
     */
    public function nextCode();

    /** Advance $distance characters through the string
-     *
-     * If $distance is negative, the operation will be performed in reverse
     *
     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
+     * 
+     * @param int $distance The number of characters to advance. If negative, the operation will seek back toward the beginning of the string
     */
    public function seek(int $distance): int;

@ -58,10 +58,16 @@ interface Encoding {
    */
    public function rewind();

-    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
+    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer
+     * 
+     * @param int $num The number of characters to retrieve
+     */
    public function peekChar(int $num = 1): string;

-    /** Retrieves the next $num code points from the string, without advancing the character pointer */
+    /** Retrieves the next $num code points from the string, without advancing the character pointer
+     * 
+     * @param int $num The number of code points to retrieve
+     */
    public function peekCode(int $num = 1): array;

    /** Calculates the length of the string in bytes */
--- a/lib/Encoding/GBCommon.php
+++ b/lib/Encoding/GBCommon.php
@ -15,12 +15,6 @@ abstract class GBCommon implements StatelessEncoding {

    protected $dirtyEOF = 0;

-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        $first = 0;
        $second = 0;
@ -37,6 +31,7 @@ abstract class GBCommon implements StatelessEncoding {
                    $first = $b;
                    continue;
                } else {
+                    $this->posErr = $this->posChar;
                    return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
                }
            } elseif ($second === 0) {
@ -49,8 +44,10 @@ abstract class GBCommon implements StatelessEncoding {
                        $pointer = ($first - 0x81) * 190 + ($b - $offset);
                        return self::TABLE_GBK[$pointer];
                    } elseif ($b < 0x80) {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar - 1, --$this->posByte]);
                    } else {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
                    }
                }
@ -60,6 +57,7 @@ abstract class GBCommon implements StatelessEncoding {
                    continue;
                } else {
                    $this->posByte -= 2;
+                    $this->posErr = $this->posChar;
                    return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
                }
            } else {
@ -79,10 +77,12 @@ abstract class GBCommon implements StatelessEncoding {
                    if (isset($codePointOffset)) {
                        return $codePointOffset + $pointer - $offset;
                    } else {
+                        $this->posErr = $this->posChar;
                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
                    }
                } else {
                    $this->posByte -= 3;
+                    $this->posErr = $this->posChar;
                    return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 1]);
                }
            }
@ -95,16 +95,11 @@ abstract class GBCommon implements StatelessEncoding {
        } else {
            // dirty EOF; note how many bytes the last character had
            $this->dirtyEOF = ($third ? 3 : ($second ? 2 : 1));
+            $this->posErr = $this->posChar;
            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - $this->dirtyEOF]);
        }
    }

-    /** Returns the encoding of $codePoint as a byte string
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
-     *
-     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
-     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
--- a/lib/Encoding/GenericEncoding.php
+++ b/lib/Encoding/GenericEncoding.php
@ -13,40 +13,30 @@ trait GenericEncoding {
    protected $lenByte = null;
    protected $lenChar = null;
    protected $errMode = self::MODE_REPLACE;
+    protected $allowSurrogates = false;

-    /** Constructs a new decoder
-     *
-     * If $fatal is true, an exception will be thrown whenever an invalid code sequence is encountered; otherwise replacement characters will be substituted
-     */
-    public function __construct(string $string, bool $fatal = false) {
+    public $posErr = 0;
+
+    public function __construct(string $string, bool $fatal = false, bool $allowSurrogates = false) {
        $this->string = $string;
        $this->lenByte = strlen($string);
        $this->errMode = $fatal ? self::MODE_FATAL_DEC : self::MODE_REPLACE;
+        $this->allowSurrogates = $allowSurrogates;
    }

-    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posByte;
    }

-    /** Returns the current character position of the decoder */
    public function posChar(): int {
        return $this->posChar;
    }

-    /** Seeks to the start of the string
-     *
-     * This is usually faster than using the seek method for the same purpose
-    */
    public function rewind() {
        $this->posByte = 0;
        $this->posChar = 0;
    }

-    /** Retrieve the next character in the string, in UTF-8 encoding
-     *
-     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
-     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = @$this->string[$this->posByte];
@ -64,12 +54,6 @@ trait GenericEncoding {
        }
    }

-    /** Advance $distance characters through the string
-     *
-     * If $distance is negative, the operation will be performed in reverse
-     *
-     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
-     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            if ($this->posByte == strlen($this->string)) {
@ -94,7 +78,6 @@ trait GenericEncoding {
        }
    }

-    /** Retrieves the next $num characters (in UTF-8 encoding) from the string without advancing the character pointer */
    public function peekChar(int $num = 1): string {
        $out = "";
        $state = $this->stateSave();
@ -108,7 +91,6 @@ trait GenericEncoding {
        return $out;
    }

-    /** Retrieves the next $num code points from the string, without advancing the character pointer */
    public function peekCode(int $num = 1): array {
        $out = [];
        $state = $this->stateSave();
@ -122,15 +104,10 @@ trait GenericEncoding {
        return $out;
    }

-    /** Calculates the length of the string in bytes */
    public function lenByte(): int {
        return $this->lenByte;
    }

-    /** Calculates the length of the string in code points
-     *
-     * Note that this may involve processing to the end of the string
-    */
    public function lenChar(): int {
        return $this->lenChar ?? (function() {
            $state = $this->stateSave();
@ -141,19 +118,16 @@ trait GenericEncoding {
        })();
    }

-    /** Returns whether the character pointer is at the end of the string */
    public function eof(): bool {
        return $this->posByte >= $this->lenByte;
    }

-    /** Generates an iterator which steps through each character in the string */
    public function chars(): \Generator {
        while (($c = $this->nextChar()) !== "") {
            yield ($this->posChar - 1) => $c;
        }
    }

-    /** Generates an iterator which steps through each code point in the string  */
    public function codes(): \Generator {
        while (($c = $this->nextCode()) !== false) {
            yield ($this->posChar - 1) => $c;
@ -165,6 +139,7 @@ trait GenericEncoding {
        return [
            'posChar' => $this->posChar,
            'posByte' => $this->posByte,
+            'posErr'  => $this->posErr,
        ];
    }

@ -191,7 +166,7 @@ trait GenericEncoding {
                // fatal replacement mode for decoders
                throw new DecoderException("Invalid code sequence at character offset {$data[0]} (byte offset {$data[1]})", self::E_INVALID_BYTE);
            case self::MODE_FATAL_ENC:
-                // fatal replacement mode for decoders; not applicable to Unicode transformation formats
+                // fatal replacement mode for encoders; not applicable to Unicode transformation formats
                throw new EncoderException("Code point $data not available in target encoding", self::E_UNAVAILABLE_CODE_POINT);
            default:
                // indicative of internal bug; should never be triggered
--- a/lib/Encoding/SingleByteEncoding.php
+++ b/lib/Encoding/SingleByteEncoding.php
@ -9,10 +9,6 @@ namespace MensBeam\Intl\Encoding;
 abstract class SingleByteEncoding implements StatelessEncoding {
    use GenericEncoding;

-    /** Retrieve the next character in the string, in UTF-8 encoding
-     *
-     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
-     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = @$this->string[$this->posChar];
@ -29,12 +25,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
        }
    }

-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        // get the byte at the current position
        $b = @$this->string[$this->posChar];
@ -51,12 +41,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
        }
    }

-    /** Returns the encoding of $codePoint as a byte string
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
-     *
-     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted
-     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
            throw new EncoderException("Encountered code point outside Unicode range ($codePoint)", self::E_INVALID_CODE_POINT);
@ -67,12 +51,6 @@ abstract class SingleByteEncoding implements StatelessEncoding {
        }
    }

-    /** Advance $distance characters through the string
-     *
-     * If $distance is negative, the operation will be performed in reverse
-     *
-     * If the end (or beginning) of the string was reached before the end of the operation, the remaining number of requested characters is returned
-     */
    public function seek(int $distance): int {
        if ($distance > 0) {
            while ($this->posChar < $this->lenByte && $distance > 0) {
@ -92,20 +70,14 @@ abstract class SingleByteEncoding implements StatelessEncoding {
        }
    }

-    /** Returns the current byte position of the decoder */
    public function posByte(): int {
        return $this->posChar;
    }

-    /** Calculates the length of the string in code points
-     *
-     * Note that this may involve processing to the end of the string
-    */
    public function lenChar(): int {
        return $this->lenByte;
    }

-    /** Returns whether the character pointer is at the end of the string */
    public function eof(): bool {
        return $this->posChar >= $this->lenByte;
    }
--- a/lib/Encoding/UTF16.php
+++ b/lib/Encoding/UTF16.php
@ -11,12 +11,6 @@ abstract class UTF16 implements Encoding {
    
    protected $dirtyEOF = 0;

-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        $lead_b = null;
        $lead_s = null;
@ -36,6 +30,9 @@ abstract class UTF16 implements Encoding {
                if (!is_null($lead_s)) {
                    if ($code >= 0xDC00 && $code <= 0xDFFF) {
                        return 0x10000 + (($lead_s - 0xD800) << 10) + ($code - 0xDC00);
+                    } elseif ($this->allowSurrogates) {
+                        $this->posByte -= 2;
+                        return $lead_s;
                    } else {
                        $this->posByte -= 2;
                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
@ -45,7 +42,11 @@ abstract class UTF16 implements Encoding {
                        $lead_s = $code;
                        continue;
                    } elseif ($code >= 0xDC00 && $code <= 0xDFFF) {
-                        return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
+                        if ($this->allowSurrogates) {
+                            return $code;
+                        } else {
+                            return self::err($this->errMode, [$this->posChar - 1, $this->posByte - 2]);
+                        }
                    } else {
                        return $code;
                    }
@ -65,10 +66,6 @@ abstract class UTF16 implements Encoding {
        }
    }

-    /** Retrieve the next character in the string, in UTF-8 encoding
-     *
-     * The returned character may be a replacement character, or the empty string if the end of the string has been reached
-     */
    public function nextChar(): string {
        // get the byte at the current position
        $b = @$this->string[$this->posByte];
--- a/lib/Encoding/UTF8.php
+++ b/lib/Encoding/UTF8.php
@ -12,12 +12,6 @@ class UTF8 implements StatelessEncoding {
    const NAME = "UTF-8";
    const LABELS = ["unicode-1-1-utf-8", "utf-8", "utf8"];

-    /** Decodes the next character from the string and returns its code point number
-     *
-     * If the end of the string has been reached, false is returned
-     *
-     * @return int|bool
-     */
    public function nextCode() {
        // this function effectively implements https://encoding.spec.whatwg.org/#utf-8-decoder
        // optimization for ASCII characters
@ -46,7 +40,7 @@ class UTF8 implements StatelessEncoding {
                    if ($b==0xE0) {
                        $lower = 0xA0;
                    } elseif ($b==0xED) {
-                        $upper = 0x9F;
+                        $upper = ($this->allowSurrogates) ? 0xBF : 0x9F;
                    }
                    $point = $b & 0xF;
                } elseif ($b >= 0xF0 && $b <= 0xF4) { // four-byte character
@ -58,9 +52,11 @@ class UTF8 implements StatelessEncoding {
                    }
                    $point = $b & 0x7;
                } else { // invalid byte
+                    $this->posErr = $this->posChar;
                    return self::err($this->errMode, [$this->posChar, $this->posByte]);
                }
            } elseif ($b < $lower || $b > $upper) {
+                $this->posErr = $this->posChar;
                return self::err($this->errMode, [$this->posChar, $this->posByte--]);
            } else {
                $lower = 0x80;
@ -72,12 +68,6 @@ class UTF8 implements StatelessEncoding {
        return $point;
    }

-    /** Returns the encoding of $codePoint as a byte string
-     *
-     * If $codePoint is less than 0 or greater than 1114111, an exception is thrown
-     *
-     * If $fatal is true, an exception will be thrown if the code point cannot be encoded into a character; otherwise HTML character references will be substituted. When encoding to UTF-8, all Unicode characters can be encoded, so the argument is ignored
-     */
    public static function encode(int $codePoint, bool $fatal = true): string {
        // this function implements https://encoding.spec.whatwg.org/#utf-8-encoder
        if ($codePoint < 0 || $codePoint > 0x10FFFF) {
--- a/tests/cases/Encoding/TestBig5.php
+++ b/tests/cases/Encoding/TestBig5.php
@ -128,6 +128,14 @@ class TestBig5 extends \MensBeam\Intl\Test\CoderDecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @coversNothing
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideCodePoints() {
        return [
            'U+0064 (HTML)'    => [false, 0x64, "64"],
--- a/tests/cases/Encoding/TestEUCKR.php
+++ b/tests/cases/Encoding/TestEUCKR.php
@ -128,6 +128,14 @@ class TestEUCKR extends \MensBeam\Intl\Test\CoderDecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @coversNothing
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideCodePoints() {
        return [
            'U+0064 (HTML)'    => [false, 0x64, "64"],
--- a/tests/cases/Encoding/TestGB18030.php
+++ b/tests/cases/Encoding/TestGB18030.php
@ -136,6 +136,14 @@ class TestGB18030 extends \MensBeam\Intl\Test\CoderDecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @coversNothing
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideCodePoints() {
        // bytes confirmed using Firefox
        $series_gb18030 = [
--- a/tests/cases/Encoding/TestSingleByte.php
+++ b/tests/cases/Encoding/TestSingleByte.php
@ -205,6 +205,15 @@ class TestSingleByte extends \MensBeam\Intl\Test\CoderDecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @coversNothing
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $exp, $class = null) {
+        $this->testedClass = $class;
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $exp, $exp);
+    }
+
    public function provideClasses() {
        foreach (self::$classes as $name => $class) {
            yield $name => [$class];
--- a/tests/cases/Encoding/TestUTF16BE.php
+++ b/tests/cases/Encoding/TestUTF16BE.php
@ -6,7 +6,6 @@
 declare(strict_types=1);
 namespace MensBeam\Intl\TestCase\Encoding;

-use MensBeam\Intl\Encoding\UTF16LE;
 use MensBeam\Intl\Encoding\UTF16BE;

 class TestUTF16BE extends TestUTF16LE {
@ -30,7 +29,10 @@ class TestUTF16BE extends TestUTF16LE {

    public function provideStrings() {
        foreach (parent::provideStrings() as $name => $test) {
-            list($string, $codes) = $test;
+            if (sizeof($test) == 2) {
+                $test[] = null;
+            }
+            list($string, $codes, $altCodes) = $test;
            $words = explode(" ", $string);
            foreach ($words as $a => $word) {
                if (strlen($word) == 4) {
@ -38,7 +40,7 @@ class TestUTF16BE extends TestUTF16LE {
                }
            }
            $string = implode(" ", $words);
-            yield $name => [$string, $codes];
+            yield $name => [$string, $codes, $altCodes];
        }
    }
 }
--- a/tests/cases/Encoding/TestUTF16LE.php
+++ b/tests/cases/Encoding/TestUTF16LE.php
@ -7,7 +7,6 @@ declare(strict_types=1);
 namespace MensBeam\Intl\TestCase\Encoding;

 use MensBeam\Intl\Encoding\UTF16LE;
-use MensBeam\Intl\Encoding\UTF16BE;

 class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
    protected $testedClass = UTF16LE::class;
@ -119,6 +118,14 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF16::nextCode
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideStrings() {
        return [
            // control samples
@ -130,10 +137,10 @@ class TestUTF16LE extends \MensBeam\Intl\Test\DecoderTest {
            'EOF after lead surrogate' => ["0000 34D8", [0, 65533]],
            'EOF in trail surrogate' => ["0000 34D8 1E", [0, 65533]],
            // invalid UTF-16 surrogates
-            'lead surrogate without trail' => ["34D8 0000", [65533, 0]],
-            'trail surrogate without lead' => ["1EDD 0000", [65533, 0]],
-            'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070]],
-            'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533]],
+            'lead surrogate without trail' => ["34D8 0000", [65533, 0], [0xD834, 0]],
+            'trail surrogate without lead' => ["1EDD 0000", [65533, 0], [0xDD1E, 0]],
+            'double lead surrogate' => ["34D8 34D8 1EDD", [65533, 119070], [0xD834, 119070]],
+            'double trail surrogate' => ["34D8 1EDD 1EDD", [119070, 65533], [119070, 0xDD1E]],
        ];
    }
 }
--- a/tests/cases/Encoding/TestUTF8.php
+++ b/tests/cases/Encoding/TestUTF8.php
@ -128,6 +128,14 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @covers MensBeam\Intl\Encoding\UTF8::nextCode
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideCodePoints() {
        return [
            'U+007A (HTML)'    => [false, 0x7A, "7A"],
@ -190,9 +198,10 @@ class TestUTF8 extends \MensBeam\Intl\Test\CoderDecoderTest {
            'overlong U+10FFFF - 5 bytes' => ["F8 84 8F BF BF", [65533, 65533, 65533, 65533, 65533]],
            'overlong U+10FFFF - 6 bytes' => ["FC 80 84 8F BF BF", [65533, 65533, 65533, 65533, 65533, 65533]],
            // UTF-16 surrogates
-            'lead surrogate' => ["ED A0 80", [65533, 65533, 65533]],
-            'trail surrogate' => ["ED B0 80", [65533, 65533, 65533]],
-            'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533]],
+            // each surrogate case carries a third element: the alternate expected output used when surrogates are allowed
+            'lead surrogate' => ["ED A0 80", [65533, 65533, 65533], [0xD800]],
+            'trail surrogate' => ["ED B0 80", [65533, 65533, 65533], [0xDC00]],
+            'surrogate pair' => ["ED A0 80 ED B0 80", [65533, 65533, 65533, 65533, 65533, 65533], [0xD800, 0xDC00]],
            // self-sync edge cases
            'trailing continuation' => ["0A 80 80", [10, 65533, 65533]],
            'trailing continuation 2' => ["E5 8F A4 80", [21476, 65533]],
--- a/tests/cases/Encoding/TestXUserDefined.php
+++ b/tests/cases/Encoding/TestXUserDefined.php
@ -109,6 +109,14 @@ class TestXUserDefined extends \MensBeam\Intl\Test\DecoderTest {
        return parent::testIterateThroughAString($input, $exp);
    }

+    /**
+     * @dataProvider provideStrings
+     * @coversNothing
+    */
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        return parent::testIterateThroughAStringAllowingSurrogates($input, $strictExp, $relaxedExp);
+    }
+
    public function provideStrings() {
        $a_bytes = [];
        $a_codes = [];
--- a/tests/lib/DecoderTest.php
+++ b/tests/lib/DecoderTest.php
@ -281,10 +281,18 @@ abstract class DecoderTest extends \PHPUnit\Framework\TestCase {
    }

    public function testIterateThroughAString(string $input, array $exp) {
+        $this->iterateThroughAString($input, $exp, false);
+    }
+
+    public function testIterateThroughAStringAllowingSurrogates(string $input, array $strictExp, array $relaxedExp = null) {
+        $exp = $relaxedExp ?? $strictExp;
+        $this->iterateThroughAString($input, $exp, true);
+    }
+
+    protected function iterateThroughAString(string $input, array $exp, bool $allowSurrogates) {
        $class = $this->testedClass;
        $input = $this->prepString($input);
-        $s = new $class($input);
-        $out = [];
+        $s = new $class($input, false, $allowSurrogates);
        $a = 0;
        $this->assertTrue(true); // prevent risky test of empty string
        foreach ($s->codes() as $index => $p) {