diff --git a/lib/Parser/ParseErrorEmitter.php b/lib/Parser/ParseErrorEmitter.php index a5a6573..a82763b 100644 --- a/lib/Parser/ParseErrorEmitter.php +++ b/lib/Parser/ParseErrorEmitter.php @@ -15,9 +15,34 @@ trait ParseErrorEmitter { $data = ($this instanceof Data) ? $this : ($this->data ?? null); assert($data instanceof Data); assert($this->errorHandler instanceof ParseError); - list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code] ?? 0); - $message = $this->errorMessage($code, ...$arg); - $this->errorHandler->errors[] = [$line, $column, $code, $arg, $message]; + if (in_array($code, [ParseError::UNEXPECTED_CHAR, ParseError::FOSTERED_CHAR])) { + // character-related errors must have an error generated for each character + assert( + (sizeof($arg) === 1 && is_string($arg[0])) + || (sizeof($arg) === 2 && is_array($arg[0]) && is_int($arg[1])) + ); + if (sizeof($arg) === 2) { + // pended characters come as a sequence of character tokens with an offset back into the data stream + $offset = $data->pointer - $arg[1]; + $chars = ""; + foreach ($arg[0] as $t) { + $chars .= $t->data; + } + $chars = sizeof(preg_split("//u", $chars)) - 3; + } else { + $offset = 0; + $chars = sizeof(preg_split("//u", $arg[0])) - 3; + } + while ($chars >= 0) { + list($line, $column) = $data->whereIs(-(($chars--) + $offset)); + $message = $this->errorMessage($code); + $this->errorHandler->errors[] = [$line, $column, $code, [], $message]; + } + } else { + list($line, $column) = $data->whereIs(ParseError::REPORT_OFFSETS[$code] ?? 0); + $message = $this->errorMessage($code, ...$arg); + $this->errorHandler->errors[] = [$line, $column, $code, $arg, $message]; + } } } diff --git a/lib/Parser/TreeBuilder.php b/lib/Parser/TreeBuilder.php index f1d5aee..05487c3 100644 --- a/lib/Parser/TreeBuilder.php +++ b/lib/Parser/TreeBuilder.php @@ -41,6 +41,8 @@ class TreeBuilder { protected $templateInsertionModes; /** @var array An array holding character tokens which may need to be foster-parented during table parsing */ protected $pendingTableCharacterTokens = []; + /** @var int The character position of the last pended table character token */ + protected $pendingTableCharacterTokenPosition = 0; /** @var bool Flag used to track whether name mangling has been performed for elements; this is a minor optimization */ protected $mangledElements = false; /** @var bool Flag used to track whether name mangling has been performed for attributes; this is a minor optimization */ @@ -1848,7 +1850,7 @@ class TreeBuilder { if ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); } elseif ($token instanceof EOFToken) { $this->error(ParseError::UNEXPECTED_EOF); } @@ -2239,7 +2241,7 @@ class TreeBuilder { # using the rules for the "in body" insertion mode, and # then disable foster parenting. if ($token instanceof CharacterToken) { - $this->error(ParseError::FOSTERED_CHAR); + $this->error(ParseError::FOSTERED_CHAR, $token->data); } elseif ($token instanceof StartTagToken) { $this->error(ParseError::FOSTERED_START_TAG, $token->name); } elseif ($token instanceof EndTagToken) { @@ -2263,6 +2265,7 @@ class TreeBuilder { # Append the character token to the pending table character # tokens list. $this->pendingTableCharacterTokens[] = $token; + $this->pendingTableCharacterTokenPosition = $this->data->pointer; } # Anything else else { @@ -2282,7 +2285,7 @@ class TreeBuilder { // NOTE: This is efectively the same as reprocessing in the // "in body" mode if (!$ws) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::FOSTERED_CHAR, $this->pendingTableCharacterTokens, $this->pendingTableCharacterTokenPosition); $this->fosterParenting = true; foreach ($this->pendingTableCharacterTokens as $pending) { // The relevant parts of the "in body" mode are reproduced here @@ -2464,7 +2467,7 @@ class TreeBuilder { # is a parse error; ignore the token. if ($this->stack->currentNodeName !== "colgroup") { if ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); } elseif ($token instanceof StartTagToken) { $this->error(ParseError::UNEXPECTED_START_TAG, $token->name); } elseif ($token instanceof EndTagToken) { @@ -3167,7 +3170,7 @@ class TreeBuilder { } elseif ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); } # Switch the insertion mode to "in body" # and reprocess the token. @@ -3269,7 +3272,7 @@ class TreeBuilder { } elseif ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); // Extract any whitespace characters from the token and insert them $ws = preg_replace('/[^\x09\x0a\x0c\x0d ]+/', "", $token->data); if (strlen($ws)) { @@ -3330,7 +3333,7 @@ class TreeBuilder { } elseif ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); // Extract any whitespace characters from the token and insert them $ws = preg_replace('/[^\x09\x0a\x0c\x0d ]+/', "", $token->data); if (strlen($ws)) { @@ -3371,7 +3374,7 @@ class TreeBuilder { } elseif ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); } # Switch the insertion mode to "in body" and reprocess the token. $insertionMode = $this->insertionMode = self::IN_BODY_MODE; @@ -3417,7 +3420,7 @@ class TreeBuilder { } elseif ($token instanceof EndTagToken) { $this->error(ParseError::UNEXPECTED_END_TAG, $token->name); } elseif ($token instanceof CharacterToken) { - $this->error(ParseError::UNEXPECTED_CHAR); + $this->error(ParseError::UNEXPECTED_CHAR, $token->data); } } } @@ -3440,8 +3443,6 @@ class TreeBuilder { # When the user agent is to apply the rules for parsing tokens in foreign # content, the user agent must handle the token as follows: - - // NOTE: Foster parenting is turned off when evaluating this // mode as it may have been turned on in a previous evluation // of the "in table" mode @@ -3559,12 +3560,12 @@ class TreeBuilder { } } } - # An end tag whose tag name is "script", if the current node is a script element - # in the SVG namespace - // DEVIATION: This implementation does not support scripting, so script elements - // aren't processed differently. # An end tag... elseif ($token instanceof EndTagToken) { + # An end tag whose tag name is "script", if the current node is a script element + # in the SVG namespace + // DEVIATION: This implementation does not support scripting, so script elements + // aren't processed differently. # An end tag whose tag name is "br", "p" if ($token->name === "br" || $token->name === "p") { # Parse error. @@ -3581,7 +3582,7 @@ class TreeBuilder { # in HTML content. goto ProcessToken; } - # Any other end tag + # Any other end tag elseif ($token instanceof EndTagToken) { # Run these steps: #