Browse Source

Rewrote DataStream

• Changed DataStream from using an internal array storage to using /mensbeam/intl, much more efficient and in line with the specification.
• DataStream now calculates the line and column as it goes instead of relying upon a weird array of positions.
• Removed dependency on ext-intl.
ns
Dustin Wilson 6 years ago
parent
commit
1b172f4d61
  1. 4
      composer.json
  2. 52
      composer.lock
  3. 154
      lib/DataStream.php

4
composer.json

@ -4,10 +4,10 @@
"type": "library",
"require": {
"php": "^7.0",
"ext-intl": "*",
"ext-mbstring": "*",
"ext-ctype": "*",
"ext-hash": "*"
"ext-hash": "*",
"mensbeam/intl": "*"
},
"license": "MIT",
"authors": [

52
composer.lock

@ -1,11 +1,56 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "0dd2aa7522ed74e55953bac6d8f875dc",
"packages": [],
"content-hash": "081b624245e1c42efc06f310a4bc46bd",
"packages": [
{
"name": "mensbeam/intl",
"version": "0.2.0",
"source": {
"type": "git",
"url": "https://code.mensbeam.com/MensBeam/intl",
"reference": "58444b9545665a19a0cb6d997ab3596025e3be71"
},
"require": {
"php": "^7.0"
},
"require-dev": {
"bamarni/composer-bin-plugin": "*",
"ext-intl": "*"
},
"type": "library",
"autoload": {
"psr-4": {
"MensBeam\\Intl\\": "lib/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "J. King",
"email": "jking@jkingweb.ca",
"homepage": "https://jkingweb.ca/"
}
],
"description": "A set of dependency-free basic internationalization tools",
"keywords": [
"charset",
"encoding",
"internationalization",
"intl",
"unicode",
"utf-8",
"utf8"
],
"time": "2018-08-12T02:04:45+00:00"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
@ -14,7 +59,6 @@
"prefer-lowest": false,
"platform": {
"php": "^7.0",
"ext-intl": "*",
"ext-mbstring": "*",
"ext-ctype": "*",
"ext-hash": "*"

154
lib/DataStream.php

@ -4,17 +4,18 @@ namespace dW\HTML5;
class DataStream
{
// The length of the data used for checking for end-of-file.
public $length = 0;
// Used to get the file path for error reporting.
public $filePath;
// Internal storage for the byte stream data array.
protected $byteStream;
// The internal position pointer.
private $pointer = 0;
// Line in the document the pointer is at. Used for error reporting.
private $position = [];
// Internal storage for the Intl data object.
protected $data;
// Used for error reporting to display line number.
protected $_line = 1;
// Used for error reporting to display column number.
protected $_column = 0;
// Used for error reporting when unconsuming to calculate column number from
// last newline.
protected $newlines = [];
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const DIGIT = '0123456789';
@ -31,9 +32,7 @@ class DataStream
// DEVIATION: The spec has steps for parsing and determining the character
// encoding. At this moment this implementation won't determine a character
// encoding and will convert all content fed to it to UTF-8.
$data = mb_convert_encoding($data, 'UTF-8', mb_detect_encoding($data));
mb_internal_encoding('UTF-8');
// encoding and will just assume UTF-8.
# One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present
# in the input stream.
@ -54,53 +53,77 @@ class DataStream
return '';
}, $data);
// Normalize line breaks. Convert CRLF and CR to LF.
// Break the document into a unicode friendly array of single characters for
// tokenization.
$this->byteStream = preg_split('/(?<!^)(?!$)/u', str_replace(["\r\n","\r"], "\n", $data));
// Create a positioning array for getting the line and column for errors.
for ($line = 1, $column = 0, $i = 0; $i < count($this->byteStream); $i++) {
if ($this->byteStream[$i] === "\n") {
$line++;
$column = 0;
}
// Break the string up into a traversable object.
$this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data));
}
$this->position[$i] = [$line, ++$column];
public function consume(int $length = 1): string {
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
// Set EOF to the string length of the document.
$this->length = count($this->byteStream);
}
for ($i = 0, $string = ''; $i < $length; $i++) {
$char = $this->data->nextChar();
public function consume(int $length = 1): string {
return $this->look('', $length, true);
if ($char === "\n") {
$this->newlines[] = $this->data->posChar();
$this->_column = 1;
$this->_line++;
} else {
$this->_column++;
}
$string .= $char;
}
return $string;
}
public function unconsume(int $length = 1) {
if ($this->pointer < $this->length) {
$this->pointer -= $length;
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
$this->data->seek(0 - $length);
$string = $this->data->peekChar($length);
$numOfNewlines = substr_count($string, "\n");
if ($numOfNewlines > 0) {
$this->_line -= $numOfNewlines;
$count = $this->newlines;
$index = count($this->newlines) - ($numOfNewlines - 1);
$this->_column = 1 + (($count > 0 && isset($this->newlines[$index])) ? $this->data->posChar() - $this->newlines[$index] : $this->data->posChar());
} else {
$this->_column -= $length;
}
}
public function consumeWhile(string $match, int $limit = 0): string {
return $this->look($match, $limit, true, true);
return $this->span($match, true, true, $limit);
}
public function consumeUntil(string $match, int $limit = 0): string {
return $this->look($match, $limit, true, false);
return $this->span($match, false, true, $limit);
}
public function peek(int $length = 1): string {
return $this->look('', $length, false);
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
return $this->data->peekChar($length);
}
public function peekWhile(string $match, int $limit = 0): string {
return $this->look($match, $limit, false, true);
return $this->span($match, true, false, $limit);
}
public function peekUntil(string $match, int $limit = 0): string {
return $this->look($match, $limit, false, false);
return $this->span($match, false, false, $limit);
}
@ -373,40 +396,19 @@ class DataStream
return '&';
}
private function look(string $match, int $length = 1, bool $movePointer = false, bool $while = true): string {
if ($length <= 0 && $match === '') {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
if ($match !== '') {
$length = $this->stringSpan($match, $while, $this->pointer, $length);
} elseif ($this->pointer + 1 > $this->length) {
return '';
}
for ($output = '', $end = $this->pointer + $length, $i = $this->pointer; $i < $end; $i++) {
$output .= $this->byteStream[$i];
}
if ($movePointer) {
$this->pointer = $end;
}
return $output;
}
protected function span(string $match, bool $while = true, bool $advancePointer = true, int $limit = 0): string {
// Break the matching characters into an array of characters. Unicode friendly.
$match = preg_split('/(?<!^)(?!$)/Su', $match);
private function stringSpan(string $match, bool $while = true, int $start = 0, int $length = 0): int {
$output = 0;
$count = 0;
$string = '';
while (true) {
$char = $this->data->nextChar();
// Break the matching characters into an array of characters. Unicode friendly.
$match = preg_split('/(?<!^)(?!$)/u', $match);
while(true) {
if (!isset($this->byteStream[$start])) {
if ($char === '') {
break;
}
$char = $this->byteStream[$start];
$inArray = in_array($char, $match);
// strspn
@ -418,20 +420,34 @@ class DataStream
break;
}
$output++;
$start++;
if ($advancePointer && $char === "\n") {
$this->newlines[] = $this->data->posChar();
$this->_column = 1;
$this->_line++;
} else {
$this->_column++;
}
$string .= $char;
$count++;
if ($count === $limit) {
break;
}
}
if($output === $length) break;
if ($count === 0) {
return '';
}
return $output;
$this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
return $string;
}
public function __get($property) {
switch ($property) {
case 'line': return $this->position[($this->pointer < $this->length) ? $this->pointer : $this->length - 1][0];
case 'column': return $this->_column;
break;
case 'column': return $this->position[($this->pointer < $this->length) ? $this->pointer : $this->length - 1][1] + 1;
case 'line': return $this->_line;
break;
default: return null;
}

Loading…
Cancel
Save