diff --git a/lib/Data.php b/lib/Data.php index 0330f7e..0ca5287 100644 --- a/lib/Data.php +++ b/lib/Data.php @@ -18,7 +18,9 @@ class Data { protected $normalized = []; // Holds the character position and column number of each newline protected $newlines = []; - // The forward-most input stream error emitted + // Holds the character position of each supplementary plane character, which count as two columns when reporting errors + protected $astrals = []; + // The character position of the forward-most input stream error emitted protected $lastError = 0; // Whether the EOF imaginary character has been consumed protected $eof = false; @@ -87,14 +89,12 @@ class Data { $this->_line++; } elseif ($char === '') { $this->eof = true; - $this->_column++; return false; } else { - $this->_column++; + $len = strlen($char); $here = $this->data->posChar(); if ($this->lastError < $here) { // look for erroneous characters - $len = strlen($char); if ($len === 1) { $ord = ord($char); if (($ord < 0x20 && !in_array($ord, [0x0, 0x9, 0xA, 0xC])) || $ord === 0x7F) { @@ -135,6 +135,13 @@ class Data { } } } + $this->_column++; + if ($len === 4) { + // If the character is on a supplementary Unicode plane, + // it counts as two columns for the purposes of error reporting + $this->astrals[$here] = true; + $this->_column++; + } } return true; } @@ -160,6 +167,9 @@ class Data { $this->_line--; } else { $this->_column--; + if ($this->astrals[$here] ?? 
false) {
+                    $this->_column--;
+                }
             }
         }
         $this->data->seek(-1);
@@ -235,6 +245,48 @@ class Data {
         return $string;
     }
 
+    /**
+     * Returns the line and column of a position relative to the current position
+     *
+     * Only the current position and positions behind it can be resolved.
+     *
+     * @param int $relativePos Character offset from the current position; zero or negative
+     * @return int[] Indexed array holding the line number, then the column number
+     * @throws \InvalidArgumentException If a position ahead of the current one is requested
+     */
+    public function whereIs(int $relativePos): array {
+        if ($relativePos > 0) {
+            // Without this guard the method would return nothing and violate its return type
+            throw new \InvalidArgumentException('Cannot look ahead of the current position');
+        }
+        $pos = $this->data->posChar();
+        $line = $this->_line;
+        $col = $this->_column;
+        // Step back one character at a time, adjusting local copies only:
+        // the stream's own line and column counters must be left untouched
+        while ($relativePos++ < 0) {
+            if (isset($this->newlines[$pos])) {
+                // If the current position is the start of a line,
+                // get the column position of the end of the previous line
+                $line--;
+                $col = $this->newlines[$pos];
+                // If the newline was a normalized CR+LF pair,
+                // go back one extra character
+                if (isset($this->normalized[$pos])) {
+                    $pos--;
+                }
+            } else {
+                $col--;
+                // Supplementary plane characters count as two columns
+                if ($this->astrals[$pos] ?? false) {
+                    $col--;
+                }
+            }
+            $pos--;
+        }
+        return [$line, $col];
+    }
+
     public function __get($property) {
         switch ($property) {
             case 'column': return $this->_column;
diff --git a/lib/ParseError.php b/lib/ParseError.php
index 6674187..da8b18b 100644
--- a/lib/ParseError.php
+++ b/lib/ParseError.php
@@ -107,6 +107,60 @@ class ParseError {
         self::CONTROL_CHARACTER_IN_INPUT_STREAM => 'Control character in input stream',
     ];
 
+    // Where each error is reported, as a character offset from the position
+    // at which it is emitted: 0 is that position, -1 is the character before it
+    const REPORT_OFFSETS = [
+        self::ENCODING_ERROR => 0,
+        self::UNEXPECTED_NULL_CHARACTER => -1,
+        self::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME => 0,
+        self::EOF_BEFORE_TAG_NAME => 0,
+        self::INVALID_FIRST_CHARACTER_OF_TAG_NAME => 0,
+        self::MISSING_END_TAG_NAME => -1,
+        self::EOF_IN_TAG => 0,
+        self::EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT => 0,
+        self::UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME => -1,
+        self::DUPLICATE_ATTRIBUTE => -1,
+        self::UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME => -1,
+        self::MISSING_ATTRIBUTE_VALUE => -1,
+        self::UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE => -1,
+        self::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES => 0,
+        self::UNEXPECTED_SOLIDUS_IN_TAG => 0,
+        self::CDATA_IN_HTML_CONTENT => -1,
+        self::INCORRECTLY_OPENED_COMMENT => 0,
+        self::ABRUPT_CLOSING_OF_EMPTY_COMMENT => -1,
+        self::EOF_IN_COMMENT => 0,
+        self::NESTED_COMMENT => 0,
+        self::INCORRECTLY_CLOSED_COMMENT => -1,
+        self::EOF_IN_DOCTYPE => 0,
+        self::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME => 0,
+        self::MISSING_DOCTYPE_NAME => -1,
+        self::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME => 0,
+        self::MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD => -1,
+        self::MISSING_DOCTYPE_PUBLIC_IDENTIFIER => -1,
+        self::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER => 0,
+        self::ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER => -1,
+        self::MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS => -1,
+        self::MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD => -1,
+        self::MISSING_DOCTYPE_SYSTEM_IDENTIFIER => -1,
+        self::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER => 0,
+        self::ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER => -1,
+        self::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER => 0,
+        self::EOF_IN_CDATA => 0,
+        self::END_TAG_WITH_ATTRIBUTES => -1,
+        self::END_TAG_WITH_TRAILING_SOLIDUS => -1,
+        self::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE => 0,
+        self::UNKNOWN_NAMED_CHARACTER_REFERENCE => 0,
+        self::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE => 0,
+        self::NULL_CHARACTER_REFERENCE => 0,
+        self::CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE => 0,
+        self::SURROGATE_CHARACTER_REFERENCE => 0,
+        self::NONCHARACTER_CHARACTER_REFERENCE => 0,
+        self::CONTROL_CHARACTER_REFERENCE => 0,
+        self::SURROGATE_IN_INPUT_STREAM => 0,
+        self::NONCHARACTER_IN_INPUT_STREAM => 0,
+        self::CONTROL_CHARACTER_IN_INPUT_STREAM => 0,
+    ];
+
     public function setHandler() {
         // Set the errror handler and honor already-set error reporting rules.
set_error_handler([$this, 'errorHandler'], error_reporting()); diff --git a/lib/ParseErrorEmitter.php b/lib/ParseErrorEmitter.php index b61dd03..42ace03 100644 --- a/lib/ParseErrorEmitter.php +++ b/lib/ParseErrorEmitter.php @@ -10,6 +10,7 @@ trait ParseErrorEmitter { $data = ($this instanceof Data) ? $this : ($this->data ?? null); assert($data instanceof Data); assert($this->errorHandler instanceof ParseError); - return $this->errorHandler->emit($data->filePath, $data->line, $data->column, $code, ...$arg); + list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code]); + return $this->errorHandler->emit($data->filePath, $line, $column, $code, ...$arg); } } diff --git a/lib/Tokenizer.php b/lib/Tokenizer.php index e72461e..cd1c57f 100644 --- a/lib/Tokenizer.php +++ b/lib/Tokenizer.php @@ -470,9 +470,9 @@ class Tokenizer { # This is an unexpected-question-mark-instead-of-tag-name parse error. # Create a comment token whose data is the empty string. # Reconsume in the bogus comment state. + $this->data->unconsume(); $this->error(ParseError::UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME); - // OPTIMIZATION: Not necessary to reconsume - $token = new CommentToken('?'); + $token = new CommentToken(''); $this->state = self::BOGUS_COMMENT_STATE; } # EOF @@ -491,10 +491,10 @@ class Tokenizer { # This is an invalid-first-character-of-tag-name parse error. # Emit a U+003C LESS-THAN SIGN character token. # Reconsume in the data state. + $this->data->unconsume(); $this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char); // DEVIATION: unconsume and change state before emitting $this->state = self::DATA_STATE; - $this->data->unconsume(); return new CharacterToken('<'); } } @@ -539,9 +539,9 @@ class Tokenizer { # This is an invalid-first-character-of-tag-name parse error. # Create a comment token whose data is the empty string. # Reconsume in the bogus comment state. 
+ $this->data->unconsume(); $this->error(ParseError::INVALID_FIRST_CHARACTER_OF_TAG_NAME, $char); $token = new CommentToken(); - $this->data->unconsume(); $this->state = self::BOGUS_COMMENT_STATE; } } @@ -1923,9 +1923,9 @@ class Tokenizer { else { # This is a missing-whitespace-between-attributes parse error. # Reconsume in the before attribute name state. + $this->data->unconsume(); $this->error(ParseError::MISSING_WHITESPACE_BETWEEN_ATTRIBUTES); $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE; - $this->data->unconsume(); } } @@ -1955,9 +1955,9 @@ class Tokenizer { else { # This is an unexpected-solidus-in-tag parse error. # Reconsume in the before attribute name state. + $this->data->unconsume(); $this->error(ParseError::UNEXPECTED_SOLIDUS_IN_TAG); $this->state = self::BEFORE_ATTRIBUTE_NAME_STATE; - $this->data->unconsume(); } } @@ -2250,9 +2250,9 @@ class Tokenizer { else { # This is a nested-comment parse error. # Reconsume in the comment end state. + $this->data->unconsume(); $this->error(ParseError::NESTED_COMMENT); $this->state = self::COMMENT_END_STATE; - $this->data->unconsume(); } } @@ -2426,9 +2426,9 @@ class Tokenizer { else { # This is a missing-whitespace-before-doctype-name parse error. # Reconsume in the before DOCTYPE name state. + $this->data->unconsume(); $this->error(ParseError::MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME); $this->state = self::BEFORE_DOCTYPE_NAME_STATE; - $this->data->unconsume(); } } @@ -2616,10 +2616,10 @@ class Tokenizer { # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. else { + $this->data->unconsume(); $this->error(ParseError::INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } } @@ -2686,10 +2686,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-public-identifier parse error. # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. 
+ $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -2750,10 +2750,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-public-identifier parse error. # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. + $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -2927,10 +2927,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-system-identifier parse error. # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. + $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -2989,10 +2989,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-system-identifier parse error. # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. + $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -3058,10 +3058,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-system-identifier parse error. # Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. + $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -3124,10 +3124,10 @@ class Tokenizer { # This is a missing-quote-before-doctype-system-identifier parse error. 
# Set the DOCTYPE token's force-quirks flag to on. # Reconsume in the bogus DOCTYPE state. + $this->data->unconsume(); $this->error(ParseError::MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER); $token->forceQuirks = true; $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -3280,9 +3280,9 @@ class Tokenizer { # This is an unexpected-character-after-doctype-system-identifier parse error. # Reconsume in the bogus DOCTYPE state. # (This does not set the DOCTYPE token's force-quirks flag to on.) + $this->data->unconsume(); $this->error(ParseError::UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER, $char); $this->state = self::BOGUS_DOCTYPE_STATE; - $this->data->unconsume(); } } @@ -3544,9 +3544,9 @@ class Tokenizer { elseif ($char === ';') { # This is an unknown-named-character-reference parse error. # Reconsume in the return state. + $this->data->unconsume(); $this->error(ParseError::UNKNOWN_NAMED_CHARACTER_REFERENCE, $temporaryBuffer.';'); $this->state = $returnState; - $this->data->unconsume(); return $temporaryBuffer; } # Anything else @@ -3600,9 +3600,9 @@ class Tokenizer { # This is an absence-of-digits-in-numeric-character-reference parse error. # Flush code points consumed as a character reference. # Reconsume in the return state. + $this->data->unconsume(); $this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE); $this->state = $returnState; - $this->data->unconsume(); return $temporaryBuffer; } } @@ -3626,9 +3626,9 @@ class Tokenizer { # This is an absence-of-digits-in-numeric-character-reference parse error. # Flush code points consumed as a character reference. # Reconsume in the return state. + $this->data->unconsume(); $this->error(ParseError::ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE); $this->state = $returnState; - $this->data->unconsume(); return $temporaryBuffer; } } @@ -3659,9 +3659,9 @@ class Tokenizer { else { # This is a missing-semicolon-after-character-reference parse error. 
# Reconsume in the numeric character reference end state. + $this->data->unconsume(); $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE); $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE; - $this->data->unconsume(); } } @@ -3689,9 +3689,9 @@ class Tokenizer { else { # This is a missing-semicolon-after-character-reference parse error. # Reconsume in the numeric character reference end state. + $this->data->unconsume(); $this->error(ParseError::MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE); $this->state = self::NUMERIC_CHARACTER_REFERENCE_END_STATE; - $this->data->unconsume(); } } diff --git a/tests/cases/TestTokenizer.php b/tests/cases/TestTokenizer.php index cc79ca4..9a055b7 100644 --- a/tests/cases/TestTokenizer.php +++ b/tests/cases/TestTokenizer.php @@ -61,8 +61,8 @@ class TestTokenizer extends \dW\HTML5\Test\StandardTest { } } while (!($t instanceof EOFToken)); } finally { - $expErrors = $expErrors ? array_column($expErrors, "code") : []; - $errors = $errors ? array_column($errors, "code") : []; + //$expErrors = $expErrors ? array_column($expErrors, "code") : []; + //$errors = $errors ? array_column($errors, "code") : []; $actual = $this->combineCharacterTokens($actual); $this->assertEquals($expected, $actual, $tokenizer->debugLog); $this->assertEquals($expErrors, $errors, $tokenizer->debugLog);