Browse Source

Fix up most error reporting positions

ns
J. King 4 years ago
parent
commit
00bf9974c5
  1. 50
      lib/Data.php
  2. 52
      lib/ParseError.php
  3. 3
      lib/ParseErrorEmitter.php
  4. 42
      lib/Tokenizer.php
  5. 4
      tests/cases/TestTokenizer.php

50
lib/Data.php

@ -18,7 +18,9 @@ class Data {
protected $normalized = [];
// Holds the character position and column number of each newline
protected $newlines = [];
// The forward-most input stream error emitted
// Holds the character position of each supplementary plane character, which count as two columns when reporting errors
protected $astrals = [];
// The character position of the forward-most input stream error emitted
protected $lastError = 0;
// Whether the EOF imaginary character has been consumed
protected $eof = false;
@ -87,14 +89,12 @@ class Data {
$this->_line++;
} elseif ($char === '') {
$this->eof = true;
$this->_column++;
return false;
} else {
$this->_column++;
$len = strlen($char);
$here = $this->data->posChar();
if ($this->lastError < $here) {
// look for erroneous characters
$len = strlen($char);
if ($len === 1) {
$ord = ord($char);
if (($ord < 0x20 && !in_array($ord, [0x0, 0x9, 0xA, 0xC])) || $ord === 0x7F) {
@ -135,6 +135,13 @@ class Data {
}
}
}
$this->_column++;
if ($len === 4) {
// If the character is on a supplementary Unicode plane,
// it counts as two columns for the purposes of error reporting
$this->astrals[$here] = true;
$this->_column++;
}
}
return true;
}
@ -160,6 +167,9 @@ class Data {
$this->_line--;
} else {
$this->_column--;
if ($this->astrals[$here] ?? false) {
$this->_column--;
}
}
}
$this->data->seek(-1);
@ -235,6 +245,38 @@ class Data {
return $string;
}
/**
 * Returns an indexed array with the line and column positions of the requested offset from the current position
 *
 * An offset of zero yields the current line and column. A negative offset
 * walks backwards one character at a time from the current position,
 * using the recorded newline and supplementary-plane positions to
 * reconstruct the line and column which applied at that point. This
 * method only computes a position: it does not move the stream nor
 * modify the current line and column counters.
 *
 * Positive offsets are not handled: no branch returns a value for them.
 *
 * @param int $relativePos The offset in characters relative to the current position; zero or negative
 * @return array The line number followed by the column number
 */
public function whereIs(int $relativePos): array {
    if ($relativePos === 0) {
        return [$this->_line, $this->_column];
    } elseif ($relativePos < 0) {
        $pos = $this->data->posChar();
        $line = $this->_line;
        $col = $this->_column;
        do {
            // If the current position is the start of a line,
            // get the column position of the end of the previous line
            if (isset($this->newlines[$pos])) {
                $line--;
                $col = $this->newlines[$pos];
                // If the newline was a normalized CR+LF pair,
                // go back one extra character
                if (isset($this->normalized[$pos])) {
                    $pos--;
                }
            } else {
                $col--;
                // supplementary plane characters count as two columns;
                // adjust the local column only, never the live counter
                if ($this->astrals[$pos] ?? false) {
                    $col--;
                }
            }
            $pos--;
        } while (++$relativePos < 0);
        return [$line, $col];
    }
}
public function __get($property) {
switch ($property) {
case 'column': return $this->_column;

52
lib/ParseError.php

@ -107,6 +107,58 @@ class ParseError {
self::CONTROL_CHARACTER_IN_INPUT_STREAM => 'Control character in input stream',
];
// Maps each parse error code to the character offset, relative to the
// current position of the input stream, at which the error is reported:
// 0 reports the current position, -1 the position one character earlier.
// The offset is handed to Data::whereIs() when an error is emitted.
const REPORT_OFFSETS = [
self::ENCODING_ERROR => 0,
self::UNEXPECTED_NULL_CHARACTER => -1,
self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 0,
self::EOF_BEFORE_TAG_NAME => 0,
self::INVALID_FIRST_CHARACTER_OF_TAG_NAME => 0,
self::MISSING_END_TAG_NAME => -1,
self::EOF_IN_TAG => 0,
self::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT => 0,
self::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME => -1,
self::DUPLICATE_ATTRIBUTE => -1,
self::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME => -1,
self::MISSING_ATTRIBUTE_VALUE => -1,
self::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE => -1,
self::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES => 0,
self::UNEXPECTED_SOLIDUS_IN_TAG => 0,
self::CDATA_IN_HTML_CONTENT => -1,
self::INCORRECTLY_OPENED_COMMENT => 0,
self::ABRUPT_CLOSING_OF_EMPTY_COMMENT => -1,
self::EOF_IN_COMMENT => 0,
self::NESTED_COMMENT => 0,
self::INCORRECTLY_CLOSED_COMMENT => -1,
self::EOF_IN_DOCTYPE => 0,
self::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME => 0,
self::MISSING_DOCTYPE_NAME => -1,
self::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME => 0,
self::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD => -1,
self::MISSING_DOCTYPE_PUBLIC_IDENTIFIER => -1,
self::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER => 0,
self::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER => -1,
self::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS => -1,
self::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD => -1,
self::MISSING_DOCTYPE_SYSTEM_IDENTIFIER => -1,
self::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER => 0,
self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => -1,
self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 0,
self::EOF_IN_CDATA => 0,
self::END_TAG_WITH_ATTRIBUTES => -1,
self::END_TAG_WITH_TRAILING_SOLIDUS => -1,
self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 0,
self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 0,
self::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE => 0,
self::NULL_CHARACTER_REFERENCE => 0,
self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 0,
self::SURROGATE_CHARACTER_REFERENCE => 0,
self::NONCHARACTER_CHARACTER_REFERENCE => 0,
self::CONTROL_CHARACTER_REFERENCE => 0,
self::SURROGATE_IN_INPUT_STREAM => 0,
self::NONCHARACTER_IN_INPUT_STREAM => 0,
self::CONTROL_CHARACTER_IN_INPUT_STREAM => 0,
];
public function setHandler() {
// Set the error handler and honor already-set error reporting rules.
set_error_handler([$this, 'errorHandler'], error_reporting());

3
lib/ParseErrorEmitter.php

@ -10,6 +10,7 @@ trait ParseErrorEmitter {
$data = ($this instanceof Data) ? $this : ($this->data ?? null);
assert($data instanceof Data);
assert($this->errorHandler instanceof ParseError);
return $this->errorHandler->emit($data->filePath, $data->line, $data->column, $code, ...$arg);
list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code]);
return $this->errorHandler->emit($data->filePath, $line, $column, $code, ...$arg);
}
}

42
lib/Tokenizer.php

@ -470,9 +470,9 @@ class Tokenizer {
# This is an unexpected-question-mark-instead-of-tag-name parse error.
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->data->unconsume();
$this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
// OPTIMIZATION: Not necessary to reconsume
$token = new CommentToken('?');
$token = new CommentToken('');
$this->state = self::BOGUS_COMMENT_STATE;
}
# EOF
@ -491,10 +491,10 @@ class Tokenizer {
# This is an invalid-first-character-of-tag-name parse error.
# Emit a U+003C LESS-THAN SIGN character token.
# Reconsume in the data state.
$this->data->unconsume();
$this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char);
// DEVIATION: unconsume and change state before emitting
$this->state = self::DATA_STATE;
$this->data->unconsume();
return new CharacterToken('<');
}
}
@ -539,9 +539,9 @@ class Tokenizer {
# This is an invalid-first-character-of-tag-name parse error.
# Create a comment token whose data is the empty string.
# Reconsume in the bogus comment state.
$this->data->unconsume();
$this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char);
$token = new CommentToken();
$this->data->unconsume();
$this->state = self::BOGUS_COMMENT_STATE;
}
}
@ -1923,9 +1923,9 @@ class Tokenizer {
else {
# This is a missing-whitespace-between-attributes parse error.
# Reconsume in the before attribute name state.
$this->data->unconsume();
$this->error(ParseError::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume();
}
}
@ -1955,9 +1955,9 @@ class Tokenizer {
else {
# This is an unexpected-solidus-in-tag parse error.
# Reconsume in the before attribute name state.
$this->data->unconsume();
$this->error(ParseError::UNEXPECTED_SOLIDUS_IN_TAG);
$this->state = self::BEFORE_ATTRIBUTE_NAME_STATE;
$this->data->unconsume();
}
}
@ -2250,9 +2250,9 @@ class Tokenizer {
else {
# This is a nested-comment parse error.
# Reconsume in the comment end state.
$this->data->unconsume();
$this->error(ParseError::NESTED_COMMENT);
$this->state = self::COMMENT_END_STATE;
$this->data->unconsume();
}
}
@ -2426,9 +2426,9 @@ class Tokenizer {
else {
# This is a missing-whitespace-before-doctype-name parse error.
# Reconsume in the before DOCTYPE name state.
$this->data->unconsume();
$this->error(ParseError::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
$this->state = self::BEFORE_DOCTYPE_NAME_STATE;
$this->data->unconsume();
}
}
@ -2616,10 +2616,10 @@ class Tokenizer {
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
else {
$this->data->unconsume();
$this->error(ParseError::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
}
@ -2686,10 +2686,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -2750,10 +2750,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-public-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -2927,10 +2927,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -2989,10 +2989,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -3058,10 +3058,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -3124,10 +3124,10 @@ class Tokenizer {
# This is a missing-quote-before-doctype-system-identifier parse error.
# Set the DOCTYPE token's force-quirks flag to on.
# Reconsume in the bogus DOCTYPE state.
$this->data->unconsume();
$this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
$token->forceQuirks = true;
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -3280,9 +3280,9 @@ class Tokenizer {
# This is an unexpected-character-after-doctype-system-identifier parse error.
# Reconsume in the bogus DOCTYPE state.
# (This does not set the DOCTYPE token's force-quirks flag to on.)
$this->data->unconsume();
$this->error(ParseError::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, $char);
$this->state = self::BOGUS_DOCTYPE_STATE;
$this->data->unconsume();
}
}
@ -3544,9 +3544,9 @@ class Tokenizer {
elseif ($char === ';') {
# This is an unknown-named-character-reference parse error.
# Reconsume in the return state.
$this->data->unconsume();
$this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $temporaryBuffer.';');
$this->state = $returnState;
$this->data->unconsume();
return $temporaryBuffer;
}
# Anything else
@ -3600,9 +3600,9 @@ class Tokenizer {
# This is an absence-of-digits-in-numeric-character-reference parse error.
# Flush code points consumed as a character reference.
# Reconsume in the return state.
$this->data->unconsume();
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE);
$this->state = $returnState;
$this->data->unconsume();
return $temporaryBuffer;
}
}
@ -3626,9 +3626,9 @@ class Tokenizer {
# This is an absence-of-digits-in-numeric-character-reference parse error.
# Flush code points consumed as a character reference.
# Reconsume in the return state.
$this->data->unconsume();
$this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE);
$this->state = $returnState;
$this->data->unconsume();
return $temporaryBuffer;
}
}
@ -3659,9 +3659,9 @@ class Tokenizer {
else {
# This is a missing-semicolon-after-character-reference parse error.
# Reconsume in the numeric character reference end state.
$this->data->unconsume();
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
$this->data->unconsume();
}
}
@ -3689,9 +3689,9 @@ class Tokenizer {
else {
# This is a missing-semicolon-after-character-reference parse error.
# Reconsume in the numeric character reference end state.
$this->data->unconsume();
$this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE);
$this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE;
$this->data->unconsume();
}
}

4
tests/cases/TestTokenizer.php

@ -61,8 +61,8 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest {
}
} while (!($t instanceof EOFToken));
} finally {
$expErrors = $expErrors ? array_column($expErrors, "code") : [];
$errors = $errors ? array_column($errors, "code") : [];
//$expErrors = $expErrors ? array_column($expErrors, "code") : [];
//$errors = $errors ? array_column($errors, "code") : [];
$actual = $this->combineCharacterTokens($actual);
$this->assertEquals($expected, $actual, $tokenizer->debugLog);
$this->assertEquals($expErrors, $errors, $tokenizer->debugLog);

Loading…
Cancel
Save