HTML-Parser/lib/Data.php
J. King 58a1177888 Address errors and omissions in error emission
One test still fails, though it is arguably immaterial. This does not
account for line and column number, which are known to be mostly
off by one.
2019-12-19 15:13:20 -05:00

249 lines
9 KiB
PHP

<?php
declare(strict_types=1);
namespace dW\HTML5;
/**
 * Wraps a UTF-8 decoder over the input and hands characters out one at a
 * time (or in spans), normalizing CR and CR+LF to LF, tracking line and
 * column numbers, and emitting parse errors for problematic characters.
 *
 * Read-only virtual properties are exposed through __get():
 *  - line:    the current line number (starts at 1)
 *  - column:  the current column number (starts at 1)
 *  - pointer: the decoder's current character offset
 */
class Data {
    use ParseErrorEmitter;

    // Used to get the file path for error reporting.
    public $filePath;
    // Internal storage for the Intl data object.
    protected $data;
    // Used for error reporting to display line number.
    protected $_line = 1;
    // Used for error reporting to display column number.
    protected $_column = 1;
    // array of normalized CR+LF pairs, denoted by the character offset of the LF
    protected $normalized = [];
    // Holds the character position and column number of each newline
    protected $newlines = [];
    // The forward-most input stream error emitted
    protected $lastError = 0;
    // Whether the EOF imaginary character has been consumed
    protected $eof = false;

    const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    const DIGIT = '0123456789';
    const HEX = '0123456789ABCDEFabcdef';
    const WHITESPACE = "\t\n\x0c\x0d ";

    /**
     * Sets up the error handler and the decoder for the input.
     *
     * @param string $data The input string; it is replaced by the file's contents whenever $filePath is anything other than 'STDIN'
     * @param string $filePath Path of a file to read, or 'STDIN' to use $data as supplied
     * @param ParseError|null $errorHandler The error handler to use; a new ParseError is created when null
     */
    public function __construct(string $data, string $filePath = 'STDIN', ?ParseError $errorHandler = null) {
        $this->errorHandler = $errorHandler ?? new ParseError;

        if ($filePath !== 'STDIN') {
            // NOTE(review): neither realpath() nor file_get_contents() is checked
            // for a false return here; a missing or unreadable file is not
            // reported in any dedicated way.
            $this->filePath = realpath($filePath);
            $data = file_get_contents($this->filePath);
        } else {
            $this->filePath = $filePath;
        }

        // DEVIATION: The spec has steps for parsing and determining the character
        // encoding. At this moment this implementation won't determine a character
        // encoding and will just assume UTF-8.
        // NOTE(review): the two boolean arguments are presumably "fatal" (off) and
        // "allow surrogates" (on) -- confirm against the MensBeam\Intl decoder.
        $this->data = new \MensBeam\Intl\Encoding\UTF8($data, false, true);
    }

    /**
     * Reads up to $length characters from the input, normalizing newlines.
     *
     * The decoder's position always moves forward. $advancePointer only decides
     * whether line/column/EOF bookkeeping (and error emission) is performed via
     * checkChar(); when it is, reading stops early once end-of-input is reached.
     *
     * @param int $length The maximum number of characters to read; must be positive
     * @param bool $advancePointer Whether to track position and EOF for the characters read
     * @return string The characters read, with CR and CR+LF replaced by LF
     */
    public function consume(int $length = 1, $advancePointer = true): string {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        $string = '';
        for ($i = 0; $i < $length; $i++) {
            $char = $this->data->nextChar();

            # Before the tokenization stage, the input stream must be
            # preprocessed by normalizing newlines.
            # Thus, newlines in HTML DOMs are represented by U+000A LF characters,
            # and there are never any U+000D CR characters in the input to the tokenization stage.
            if ($char === "\r") {
                if ($this->data->peekChar() === "\n") {
                    // this is a CR+LF pair: skip the CR and note the normalization,
                    // keyed by the position just after the LF
                    $char = $this->data->nextChar();
                    $this->normalized[$this->data->posChar()] = true;
                } else {
                    // otherwise just silently change the character to LF;
                    // the bare CR will be trivial to process when seeking backwards
                    $char = "\n";
                }
            }

            // append the character to the output string
            $string .= $char;

            // unless we're peeking, track line and column position, and stop at EOF
            if ($advancePointer && !$this->checkChar($char)) {
                break;
            }
        }

        return $string;
    }

    /**
     * Updates line, column, and EOF state for a character that has just been
     * read, and emits a parse error if the character is one that should be
     * reported.
     *
     * Errors are only emitted for positions beyond the last one reported, so
     * that a character which is read again after unconsume() is not reported
     * a second time.
     *
     * @param string $char The character just read; the empty string denotes end-of-input
     * @return bool False if end-of-input was reached, true otherwise
     */
    protected function checkChar(string $char): bool {
        if ($char === "\n") {
            // remember the column we were at so that unconsume() can restore it
            $this->newlines[$this->data->posChar()] = $this->_column;
            $this->_column = 1;
            $this->_line++;
            return true;
        }

        if ($char === '') {
            $this->eof = true;
            $this->_column++;
            return false;
        }

        $this->_column++;
        $here = $this->data->posChar();
        if ($this->lastError >= $here) {
            // this position has already been examined
            return true;
        }

        // look for erroneous characters, classified by the length of their UTF-8 encoding
        $len = strlen($char);
        if ($len === 1) {
            // C0 controls other than NUL, TAB, LF, and FF, plus DEL
            $ord = ord($char);
            if (($ord < 0x20 && !in_array($ord, [0x0, 0x9, 0xA, 0xC], true)) || $ord === 0x7F) {
                $this->error(ParseError::CONTROL_CHARACTER_IN_INPUT_STREAM);
                $this->lastError = $here;
            }
        } elseif ($len === 2) {
            // bytes C2 80 through C2 9F, i.e. U+0080 through U+009F
            if (ord($char[0]) == 0xC2) {
                $ord = ord($char[1]);
                if ($ord >= 0x80 && $ord <= 0x9F) {
                    $this->error(ParseError::CONTROL_CHARACTER_IN_INPUT_STREAM);
                    $this->lastError = $here;
                }
            }
        } elseif ($len === 3) {
            $head = ord($char[0]);
            if ($head === 0xED) {
                // bytes ED A0 80 through ED BF BF, i.e. U+D800 through U+DFFF
                $tail = (ord($char[1]) << 8) + ord($char[2]);
                if ($tail >= 0xA080 && $tail <= 0xBFBF) {
                    $this->error(ParseError::SURROGATE_IN_INPUT_STREAM);
                    $this->lastError = $here;
                }
            } elseif ($head === 0xEF) {
                $tail = (ord($char[1]) << 8) + ord($char[2]);
                if (($tail >= 0xB790 && $tail <= 0xB7AF) || $tail >= 0xBFBE) {
                    // U+FDD0 through U+FDEF, U+FFFE, and U+FFFF
                    $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
                    $this->lastError = $here;
                } elseif ($tail === 0xBFBD && $this->data->posErr === $here) {
                    // U+FFFD located at the decoder's reported error position
                    // NOTE(review): this is reported with the noncharacter error
                    // code; confirm that is the intended code for this case.
                    $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM, $this->data->posByte);
                    $this->lastError = $here;
                }
            }
        } elseif ($len === 4) {
            // sequences ending in BF BE or BF BF, i.e. U+xFFFE and U+xFFFF beyond the BMP
            $tail = (ord($char[2]) << 8) + ord($char[3]);
            if ($tail >= 0xBFBE) {
                $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
                $this->lastError = $here;
            }
        }

        return true;
    }

    /**
     * Moves the decoder backwards by $length characters, treating a normalized
     * CR+LF pair as a single character.
     *
     * If end-of-input had been consumed, the first step back merely clears the
     * EOF flag without moving the decoder.
     *
     * @param int $length The number of characters to step back; must be positive
     * @param bool $retreatPointer Whether to also rewind the line and column counters
     */
    public function unconsume(int $length = 1, bool $retreatPointer = true): void {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        if ($this->eof) {
            // NOTE(review): checkChar() increments the column when EOF is consumed,
            // but the column is not decremented here when EOF is un-consumed.
            $length--;
            $this->eof = false;
        }

        while ($length-- > 0) {
            $here = $this->data->posChar();

            // if the previous character was a normalized CR+LF pair, we need to go back two
            if (isset($this->normalized[$here])) {
                $this->data->seek(-1);
            }

            // recalculate line and column positions, if requested
            if ($retreatPointer) {
                $col = $this->newlines[$here] ?? 0;
                if ($col) {
                    // stepping back over a newline: restore the column recorded for it
                    $this->_column = $col;
                    $this->_line--;
                } else {
                    $this->_column--;
                }
            }

            $this->data->seek(-1);
        }
    }

    /**
     * Consumes characters for as long as they appear in $match.
     *
     * @param string $match The set of characters to accept
     * @param int $limit The maximum number of characters to consume; values below 1 impose no limit
     * @return string The characters consumed
     */
    public function consumeWhile(string $match, int $limit = 0): string {
        return $this->span($match, true, true, $limit);
    }

    /**
     * Consumes characters until one which appears in $match is encountered.
     *
     * @param string $match The set of characters to stop at
     * @param int $limit The maximum number of characters to consume; values below 1 impose no limit
     * @return string The characters consumed
     */
    public function consumeUntil(string $match, int $limit = 0): string {
        return $this->span($match, false, true, $limit);
    }

    /**
     * Returns the next $length characters as reported by the decoder's
     * peekChar(), without going through consume().
     *
     * @param int $length The number of characters to look ahead; must be positive
     * @return string The characters seen
     */
    public function peek(int $length = 1): string {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        return $this->data->peekChar($length);
    }

    /**
     * Like consumeWhile(), but leaves the input position where it was.
     *
     * @param string $match The set of characters to accept
     * @param int $limit The maximum number of characters to examine; values below 1 impose no limit
     * @return string The characters seen
     */
    public function peekWhile(string $match, int $limit = 0): string {
        return $this->span($match, true, false, $limit);
    }

    /**
     * Like consumeUntil(), but leaves the input position where it was.
     *
     * @param string $match The set of characters to stop at
     * @param int $limit The maximum number of characters to examine; values below 1 impose no limit
     * @return string The characters seen
     */
    public function peekUntil(string $match, int $limit = 0): string {
        return $this->span($match, false, false, $limit);
    }

    /**
     * Shared implementation of the consume/peek While/Until methods.
     *
     * @param string $match The set of characters to test against
     * @param bool $while True to collect characters found in $match (like strspn); false to collect characters not found in it (like strcspn)
     * @param bool $advancePointer True to consume the collected characters; false to restore the input position afterwards
     * @param int $limit The maximum number of characters to collect; values below 1 impose no limit
     * @return string The characters collected
     */
    protected function span(string $match, bool $while = true, bool $advancePointer = true, int $limit = -1): string {
        // Break the matching characters into an array of characters. Unicode friendly.
        $match = preg_split('/(?<!^)(?!$)/Su', $match);
        $start = $this->data->posChar();
        $count = 0;
        $string = '';

        while (true) {
            // read without bookkeeping; position tracking happens below only for accepted characters
            $char = $this->consume(1, false);
            if ($char === '') {
                break;
            }

            // In "while" mode stop at the first character outside the set;
            // in "until" mode stop at the first character inside it.
            $inArray = in_array($char, $match, true);
            if ($while !== $inArray) {
                $this->unconsume(1, false);
                break;
            }

            if ($advancePointer) {
                $this->checkChar($char);
            }
            $count++;
            $string .= $char;

            // $count is at least 1 here, so a limit below 1 never matches
            if ($count === $limit) {
                break;
            }
        }

        // when peeking, return to where we started
        if (!$advancePointer && $count) {
            $this->data->seek($start - $this->data->posChar());
        }

        return $string;
    }

    /**
     * Provides read-only access to the "column", "line", and "pointer" virtual properties.
     *
     * @param string $property The name of the property requested
     * @return int|null The property's value, or null for any other name
     */
    public function __get($property) {
        switch ($property) {
            case 'column':
                return $this->_column;
            case 'line':
                return $this->_line;
            case 'pointer':
                return $this->data->posChar();
            default:
                return null;
        }
    }
}