Browse Source

More robust self-synchronization

labels
J. King 6 years ago
parent
commit
a99702d4ab
  1. 4
      lib/UTF8String.php
  2. 8
      tests/cases/TestCodec.php

4
lib/UTF8String.php

@@ -141,7 +141,7 @@ class UTF8String {
$this->posByte = $pos;
} else {
$s = $pos;
-while ($b >= 0x80 && $b <= 0xBF && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
+while ($b >= 0x80 && $b <= 0xBF && $pos > 0 && ($s - $pos) < 3) { // go back at most three bytes, no further than the start of the string, and only as long as the byte remains a continuation byte
$b = ord(@$this->string[--$pos]);
}
$this->posByte = $pos;
@@ -150,7 +150,7 @@ class UTF8String {
if (is_null($this->nextOrd())) {
$this->posByte = $s;
} else {
-$this->posByte = $pos;
+$this->posByte = ($this->posByte > $s ) ? $pos : $s;
}
}
}

8
tests/cases/TestCodec.php

@@ -58,8 +58,11 @@ class TestConf extends \PHPUnit\Framework\TestCase {
public function provideStrings() {
return [
// control samples
'sanity check' => ["\x61\x62\x63\x31\x32\x33", [97, 98, 99, 49, 50, 51]],
'multibyte control' => ["\xE5\x8F\xA4\xE6\xB1\xA0\xE3\x82\x84\xE8\x9B\x99\xE9\xA3\x9B\xE3\x81\xB3\xE8\xBE\xBC\xE3\x82\x80\xE6\xB0\xB4\xE3\x81\xAE\xE9\x9F\xB3", [21476, 27744, 12420, 34521, 39131, 12403, 36796, 12416, 27700, 12398, 38899]],
'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
// various invalid sequences
'invalid code' => ["\xFF", [65533]],
'ends early' => ["\xC0", [65533]],
'ends early 2' => ["\xE0", [65533]],
@@ -90,10 +93,13 @@ class TestConf extends \PHPUnit\Framework\TestCase {
'overlong U+FFFF - 6 bytes' => ["\xFC\x80\x80\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 5 bytes' => ["\xF8\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533]],
'overlong U+10FFFF - 6 bytes' => ["\xFC\x80\x84\x8F\xBF\xBF", [65533, 65533, 65533, 65533, 65533, 65533]],
// UTF-16 surrogates
'lead surrogate' => ["\xED\xA0\x80", [65533, 65533, 65533]],
'trail surrogate' => ["\xED\xB0\x80", [65533, 65533, 65533]],
'surrogate pair' => ["\xED\xA0\x80\xED\xB0\x80", [65533, 65533, 65533, 65533, 65533, 65533]],
'mixed sample' => ["\x7A\xC2\xA2\xE6\xB0\xB4\xF0\x9D\x84\x9E\xEF\xA3\xBF\xF4\x8F\xBF\xBD\xEF\xBF\xBE", [122, 162, 27700, 119070, 63743, 1114109, 65534]],
// self-sync edge cases
'trailing continuation' => ["\x0A\x80\x80", [10, 65533, 65533]],
'trailing continuation 2' => ["\xE5\x8F\xA4\x80", [21476, 65533]],
];
}
}

Loading…
Cancel
Save