Browse Source

Fixed Data bug

• Fixed a bug where Data::consumeWhile and Data::consumeUntil did not move the pointer back one position when there were no matches.
• Changed DataStream to Data.
• Gave each class its own static debug property so that each class can print its debug information separately.
split-manual
Dustin Wilson 6 years ago
parent
commit
33363ab2d3
  1. 58
      lib/Data.php
  2. 4
      lib/Exception.php
  3. 2
      lib/ParseError.php
  4. 7
      lib/Parser.php
  5. 7
      lib/Tokenizer.php
  6. 42
      lib/TreeBuilder.php

58
lib/DataStream.php → lib/Data.php

@ -2,7 +2,7 @@
declare(strict_types=1);
namespace dW\HTML5;
class DataStream
class Data
{
// Used to get the file path for error reporting.
public $filePath;
@ -17,9 +17,15 @@ class DataStream
// last newline.
protected $newlines = [];
// Used for debugging to print out information as data is consumed.
public static $debug = false;
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const DIGIT = '0123456789';
const HEX = '0123456789ABCDEFabcdef';
const WHITESPACE = "\t\n\x0c\x0d ";
public function __construct(string $data, string $filePath = 'STDIN') {
@ -61,7 +67,7 @@ class DataStream
public function consume(int $length = 1): string {
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
for ($i = 0, $string = ''; $i < $length; $i++) {
@ -78,12 +84,22 @@ class DataStream
$string .= $char;
}
if (self::$debug) {
echo "\nConsume\n==========\n";
echo "Length: $length\n";
echo "Data: ";
var_export($string);
echo "\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
return $string;
}
public function unconsume(int $length = 1) {
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
$this->data->seek(0 - $length);
@ -100,6 +116,12 @@ class DataStream
} else {
$this->_column -= $length;
}
if (self::$debug) {
echo "\nUnconsume\n==========\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
}
public function consumeWhile(string $match, int $limit = 0): string {
@ -112,10 +134,21 @@ class DataStream
public function peek(int $length = 1): string {
if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
return $this->data->peekChar($length);
$string = $this->data->peekChar($length);
if (self::$debug) {
echo "\nPeek\n==========\n";
echo "Data: ";
var_export($string);
echo "\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
return $string;
}
public function peekWhile(string $match, int $limit = 0): string {
@ -437,11 +470,18 @@ class DataStream
}
}
if ($count === 0) {
return '';
$this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
if (self::$debug) {
echo ($advancePointer) ? "\nconsume" : "\npeek";
echo ($while) ? 'While' : 'Until';
echo "\n==========\nPattern: ";
var_export(str_replace(["\t", "\n", "\x0c", "\x0d"], ['\t', '\n', '\x0c', '\x0d'], implode('', $match)));
echo "\nData: ";
var_export($string);
echo "\nPointer: {$this->data->posChar()}\n==========\n\n";
}
$this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
return $string;
}
@ -451,6 +491,8 @@ class DataStream
break;
case 'line': return $this->_line;
break;
case 'pointer': return $this->data->posChar();
break;
default: return null;
}
}

4
lib/Exception.php

@ -15,8 +15,8 @@ class Exception extends \Exception {
const STACK_DOMNODE_ONLY = 10202;
const STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10203;
const DATASTREAM_NODATA = 10301;
const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
const DATA_NODATA = 10301;
const DATA_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
const DOM_DOMDOCUMENT_EXPECTED = 10401;
const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10402;

2
lib/ParseError.php

@ -37,7 +37,7 @@ class ParseError {
'"%s" is an invalid name for an entity',
'"%s" is an invalid character codepoint'];
public function __construct(DataStream $data) {
public function __construct(Data $data) {
$this->data = $data;
// Set the error handler and honor already-set error reporting rules.

7
lib/Parser.php

@ -5,7 +5,7 @@ namespace dW\HTML5;
class Parser {
/* Non-static properties */
// Input data that's being parsed, uses DataStream
// Input data that's being parsed, uses Data
protected $data;
// The DOMDocument that is assembled by the tree builder
protected $DOM;
@ -34,9 +34,6 @@ class Parser {
/* Static properties */
// For debugging
public static $debug = false;
// Property used as an instance for the non-static properties
protected static $instance;
@ -72,7 +69,7 @@ class Parser {
}
// Process the input stream.
static::$instance->data = new DataStream(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN');
static::$instance->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN');
// Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
// work on basic latin characters. Used extensively when tokenizing.

7
lib/Tokenizer.php

@ -8,6 +8,8 @@ class Tokenizer {
protected $data;
protected $stack;
public static $debug = false;
const DATA_STATE = 0;
const RCDATA_STATE = 1;
const RAWTEXT_STATE = 2;
@ -78,7 +80,7 @@ class Tokenizer {
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
public function __construct(DataStream $data, OpenElementsStack $stack) {
public function __construct(Data $data, OpenElementsStack $stack) {
$this->state = self::DATA_STATE;
$this->data = $data;
$this->stack = $stack;
@ -86,7 +88,7 @@ class Tokenizer {
public function createToken(): Token {
while (true) {
if (Parser::$debug) {
if (self::$debug) {
switch ($this->state) {
case self::DATA_STATE: $state = "Data";
break;
@ -260,6 +262,7 @@ class Tokenizer {
// OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here
// every single time.
return new CharacterToken($char.$this->data->consumeUntil('&<'));
}
}

42
lib/TreeBuilder.php

@ -42,6 +42,8 @@ class TreeBuilder {
// Instance used with the static token insertion methods.
protected static $instance;
// Used for debugging to print out information as the tree is built.
protected static $debug = false;
// Constants used for insertion modes
@ -119,8 +121,11 @@ class TreeBuilder {
$adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName;
$adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace;
if (Parser::$debug) {
if (self::$debug) {
echo "Node: $adjustedCurrentNodeName\n";
echo "\nToken: \n";
var_export($token);
echo "\n\n";
}
# 8.2.5 Tree construction
@ -175,11 +180,6 @@ class TreeBuilder {
}
}
# TEMPORARY
echo "\n";
var_export($token);
echo "\n\n";
break;
}
}
@ -189,7 +189,7 @@ class TreeBuilder {
// Loop used when processing the token under different rules; always breaks.
while (true) {
if (Parser::$debug) {
if (self::$debug) {
switch ($insertionMode) {
case self::INITIAL_MODE: $mode = "Initial";
break;
@ -251,7 +251,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) {
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A comment token
@ -425,7 +425,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A start tag whose tag name is "html"
@ -468,7 +468,9 @@ class TreeBuilder {
case self::BEFORE_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token.
}
# A comment token
@ -524,7 +526,9 @@ class TreeBuilder {
case self::IN_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) {
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character.
$this->insertCharacterToken($token);
}
@ -813,7 +817,9 @@ class TreeBuilder {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
# A comment token
elseif (($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) ||
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif (($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) ||
$token instanceof CommentToken) {
# Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE;
@ -838,7 +844,9 @@ class TreeBuilder {
case self::AFTER_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) {
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character.
$this->insertCharacterToken($token);
}
@ -962,7 +970,9 @@ class TreeBuilder {
# Insert the token’s character.
$this->insertCharacterToken($token);
if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->framesetOk = false;
}
@ -1074,7 +1084,7 @@ class TreeBuilder {
}
protected function parseTokenInForeignContent(Token $token): bool {
if (Parser::$debug) {
if (self::$debug) {
echo "Foreign Content\n";
}
@ -1092,7 +1102,7 @@ class TreeBuilder {
# Any other character token
// OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) {
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok".
$this->$framesetOk = false;
}

Loading…
Cancel
Save