From 1b172f4d611a6f819f545395ac8ae678b8ade3dd Mon Sep 17 00:00:00 2001 From: Dustin Wilson Date: Thu, 23 Aug 2018 23:17:27 -0500 Subject: [PATCH] Rewrote DataStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Changed DataStream from using an internal array storage to using /mensbeam/intl, much more efficient and in line with the specification. • DataStream now calculates the line and column as it goes instead of relying upon a weird array of positions. • Removed dependency on ext-intl. --- composer.json | 4 +- composer.lock | 52 +++++++++++++-- lib/DataStream.php | 154 +++++++++++++++++++++++++-------------------- 3 files changed, 135 insertions(+), 75 deletions(-) diff --git a/composer.json b/composer.json index 9b6a3da..f50bab3 100644 --- a/composer.json +++ b/composer.json @@ -4,10 +4,10 @@ "type": "library", "require": { "php": "^7.0", - "ext-intl": "*", "ext-mbstring": "*", "ext-ctype": "*", - "ext-hash": "*" + "ext-hash": "*", + "mensbeam/intl": "*" }, "license": "MIT", "authors": [ diff --git a/composer.lock b/composer.lock index 5aa7e35..ec5d365 100644 --- a/composer.lock +++ b/composer.lock @@ -1,11 +1,56 @@ { "_readme": [ "This file locks the dependencies of your project to a known state", - "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "0dd2aa7522ed74e55953bac6d8f875dc", - "packages": [], + "content-hash": "081b624245e1c42efc06f310a4bc46bd", + "packages": [ + { + "name": "mensbeam/intl", + "version": "0.2.0", + "source": { + "type": "git", + "url": "https://code.mensbeam.com/MensBeam/intl", + "reference": "58444b9545665a19a0cb6d997ab3596025e3be71" + }, + "require": { + "php": "^7.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "*", + "ext-intl": "*" + }, + "type": "library", + "autoload": { + "psr-4": { + "MensBeam\\Intl\\": "lib/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "J. King", + "email": "jking@jkingweb.ca", + "homepage": "https://jkingweb.ca/" + } + ], + "description": "A set of dependency-free basic internationalization tools", + "keywords": [ + "charset", + "encoding", + "internationalization", + "intl", + "unicode", + "utf-8", + "utf8" + ], + "time": "2018-08-12T02:04:45+00:00" + } + ], "packages-dev": [], "aliases": [], "minimum-stability": "stable", @@ -14,7 +59,6 @@ "prefer-lowest": false, "platform": { "php": "^7.0", - "ext-intl": "*", "ext-mbstring": "*", "ext-ctype": "*", "ext-hash": "*" diff --git a/lib/DataStream.php b/lib/DataStream.php index 265025a..b47c9cd 100644 --- a/lib/DataStream.php +++ b/lib/DataStream.php @@ -4,17 +4,18 @@ namespace dW\HTML5; class DataStream { - // The length of the data used for checking for end-of-file. - public $length = 0; // Used to get the file path for error reporting. public $filePath; - // Internal storage for the byte stream data array. - protected $byteStream; - // The internal position pointer. - private $pointer = 0; - // Line in the document the pointer is at. Used for error reporting. - private $position = []; + // Internal storage for the Intl data object. + protected $data; + // Used for error reporting to display line number. + protected $_line = 1; + // Used for error reporting to display column number. + protected $_column = 0; + // Used for error reporting when unconsuming to calculate column number from + // last newline. + protected $newlines = []; const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const DIGIT = '0123456789'; @@ -31,9 +32,7 @@ class DataStream // DEVIATION: The spec has steps for parsing and determining the character // encoding. At this moment this implementation won't determine a character - // encoding and will convert all content fed to it to UTF-8. - $data = mb_convert_encoding($data, 'UTF-8', mb_detect_encoding($data)); - mb_internal_encoding('UTF-8'); + // encoding and will just assume UTF-8. # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present # in the input stream. @@ -54,53 +53,77 @@ class DataStream return ''; }, $data); + // Normalize line breaks. Convert CRLF and CR to LF. - // Break the document into a unicode friendly array of single characters for - // tokenization. - $this->byteStream = preg_split('/(?byteStream); $i++) { - if ($this->byteStream[$i] === "\n") { - $line++; - $column = 0; - } + // Break the string up into a traversable object. + $this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data)); + } - $this->position[$i] = [$line, ++$column]; + public function consume(int $length = 1): string { + if ($length <= 0) { + throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); } - // Set EOF to the string length of the document. - $this->length = count($this->byteStream); - } + for ($i = 0, $string = ''; $i < $length; $i++) { + $char = $this->data->nextChar(); - public function consume(int $length = 1): string { - return $this->look('', $length, true); + if ($char === "\n") { + $this->newlines[] = $this->data->posChar(); + $this->_column = 1; + $this->_line++; + } else { + $this->_column++; + } + + $string .= $char; + } + + return $string; } public function unconsume(int $length = 1) { - if ($this->pointer < $this->length) { - $this->pointer -= $length; + if ($length <= 0) { + throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); + } + + $this->data->seek(0 - $length); + + $string = $this->data->peekChar($length); + $numOfNewlines = substr_count($string, "\n"); + + if ($numOfNewlines > 0) { + $this->_line -= $numOfNewlines; + + $count = $this->newlines; + $index = count($this->newlines) - ($numOfNewlines - 1); + $this->_column = 1 + (($count > 0 && isset($this->newlines[$index])) ? $this->data->posChar() - $this->newlines[$index] : $this->data->posChar()); + } else { + $this->_column -= $length; } } public function consumeWhile(string $match, int $limit = 0): string { - return $this->look($match, $limit, true, true); + return $this->span($match, true, true, $limit); } public function consumeUntil(string $match, int $limit = 0): string { - return $this->look($match, $limit, true, false); + return $this->span($match, false, true, $limit); } public function peek(int $length = 1): string { - return $this->look('', $length, false); + if ($length <= 0) { + throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); + } + + return $this->data->peekChar($length); } public function peekWhile(string $match, int $limit = 0): string { - return $this->look($match, $limit, false, true); + return $this->span($match, true, false, $limit); } public function peekUntil(string $match, int $limit = 0): string { - return $this->look($match, $limit, false, false); + return $this->span($match, false, false, $limit); } @@ -373,40 +396,19 @@ class DataStream return '&'; } - private function look(string $match, int $length = 1, bool $movePointer = false, bool $while = true): string { - if ($length <= 0 && $match === '') { - throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); - } - - if ($match !== '') { - $length = $this->stringSpan($match, $while, $this->pointer, $length); - } elseif ($this->pointer + 1 > $this->length) { - return ''; - } - - for ($output = '', $end = $this->pointer + $length, $i = $this->pointer; $i < $end; $i++) { - $output .= $this->byteStream[$i]; - } - - if ($movePointer) { - $this->pointer = $end; - } - - return $output; - } - + protected function span(string $match, bool $while = true, bool $advancePointer = true, int $limit = 0): string { + // Break the matching characters into an array of characters. Unicode friendly. + $match = preg_split('/(?data->nextChar(); - // Break the matching characters into an array of characters. Unicode friendly. - $match = preg_split('/(?byteStream[$start])) { + if ($char === '') { break; } - $char = $this->byteStream[$start]; $inArray = in_array($char, $match); // strspn @@ -418,20 +420,34 @@ class DataStream break; } - $output++; - $start++; + if ($advancePointer && $char === "\n") { + $this->newlines[] = $this->data->posChar(); + $this->_column = 1; + $this->_line++; + } else { + $this->_column++; + } + + $string .= $char; + $count++; + if ($count === $limit) { + break; + } + } - if($output === $length) break; + if ($count === 0) { + return ''; } - return $output; + $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2); + return $string; } public function __get($property) { switch ($property) { - case 'line': return $this->position[($this->pointer < $this->length) ? $this->pointer : $this->length - 1][0]; + case 'column': return $this->_column; break; - case 'column': return $this->position[($this->pointer < $this->length) ? $this->pointer : $this->length - 1][1] + 1; + case 'line': return $this->_line; break; default: return null; }