Browse Source

Fixed Data bug

• Fixed bug where Data::consumeWhile and Data::consumeUntil wouldn't move the pointer back one position if there were no matches.
• Renamed the DataStream class (and lib/DataStream.php) to Data.
• Gave each class its own static debug property, replacing the shared Parser::$debug flag, so that each class can print its debug information separately.
split-manual
Dustin Wilson 6 years ago
parent
commit
33363ab2d3
  1. 58
      lib/Data.php
  2. 4
      lib/Exception.php
  3. 2
      lib/ParseError.php
  4. 7
      lib/Parser.php
  5. 7
      lib/Tokenizer.php
  6. 42
      lib/TreeBuilder.php

58
lib/DataStream.php → lib/Data.php

@ -2,7 +2,7 @@
declare(strict_types=1); declare(strict_types=1);
namespace dW\HTML5; namespace dW\HTML5;
class DataStream class Data
{ {
// Used to get the file path for error reporting. // Used to get the file path for error reporting.
public $filePath; public $filePath;
@ -17,9 +17,15 @@ class DataStream
// last newline. // last newline.
protected $newlines = []; protected $newlines = [];
// Used for debugging to print out information as data is consumed.
public static $debug = false;
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const DIGIT = '0123456789'; const DIGIT = '0123456789';
const HEX = '0123456789ABCDEFabcdef'; const HEX = '0123456789ABCDEFabcdef';
const WHITESPACE = "\t\n\x0c\x0d ";
public function __construct(string $data, string $filePath = 'STDIN') { public function __construct(string $data, string $filePath = 'STDIN') {
@ -61,7 +67,7 @@ class DataStream
public function consume(int $length = 1): string { public function consume(int $length = 1): string {
if ($length <= 0) { if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
} }
for ($i = 0, $string = ''; $i < $length; $i++) { for ($i = 0, $string = ''; $i < $length; $i++) {
@ -78,12 +84,22 @@ class DataStream
$string .= $char; $string .= $char;
} }
if (self::$debug) {
echo "\nConsume\n==========\n";
echo "Length: $length\n";
echo "Data: ";
var_export($string);
echo "\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
return $string; return $string;
} }
public function unconsume(int $length = 1) { public function unconsume(int $length = 1) {
if ($length <= 0) { if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
} }
$this->data->seek(0 - $length); $this->data->seek(0 - $length);
@ -100,6 +116,12 @@ class DataStream
} else { } else {
$this->_column -= $length; $this->_column -= $length;
} }
if (self::$debug) {
echo "\nUnconsume\n==========\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
} }
public function consumeWhile(string $match, int $limit = 0): string { public function consumeWhile(string $match, int $limit = 0): string {
@ -112,10 +134,21 @@ class DataStream
public function peek(int $length = 1): string { public function peek(int $length = 1): string {
if ($length <= 0) { if ($length <= 0) {
throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length); throw new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length);
} }
return $this->data->peekChar($length); $string = $this->data->peekChar($length);
if (self::$debug) {
echo "\nPeek\n==========\n";
echo "Data: ";
var_export($string);
echo "\n";
echo "Pointer: {$this->data->posChar()}\n";
echo "==========\n\n";
}
return $string;
} }
public function peekWhile(string $match, int $limit = 0): string { public function peekWhile(string $match, int $limit = 0): string {
@ -437,11 +470,18 @@ class DataStream
} }
} }
if ($count === 0) { $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
return '';
if (self::$debug) {
echo ($advancePointer) ? "\nconsume" : "\npeek";
echo ($while) ? 'While' : 'Until';
echo "\n==========\nPattern: ";
var_export(str_replace(["\t", "\n", "\x0c", "\x0d"], ['\t', '\n', '\x0c', '\x0d'], implode('', $match)));
echo "\nData: ";
var_export($string);
echo "\nPointer: {$this->data->posChar()}\n==========\n\n";
} }
$this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
return $string; return $string;
} }
@ -451,6 +491,8 @@ class DataStream
break; break;
case 'line': return $this->_line; case 'line': return $this->_line;
break; break;
case 'pointer': return $this->data->posChar();
break;
default: return null; default: return null;
} }
} }

4
lib/Exception.php

@ -15,8 +15,8 @@ class Exception extends \Exception {
const STACK_DOMNODE_ONLY = 10202; const STACK_DOMNODE_ONLY = 10202;
const STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10203; const STACK_FRAGMENT_CONTEXT_DOMELEMENT_DOMDOCUMENT_DOMDOCUMENTFRAG_EXPECTED = 10203;
const DATASTREAM_NODATA = 10301; const DATA_NODATA = 10301;
const DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH = 10302; const DATA_INVALID_DATA_CONSUMPTION_LENGTH = 10302;
const DOM_DOMDOCUMENT_EXPECTED = 10401; const DOM_DOMDOCUMENT_EXPECTED = 10401;
const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10402; const DOM_DOMELEMENT_STRING_OR_CLOSURE_EXPECTED = 10402;

2
lib/ParseError.php

@ -37,7 +37,7 @@ class ParseError {
'"%s" is an invalid name for an entity', '"%s" is an invalid name for an entity',
'"%s" is an invalid character codepoint']; '"%s" is an invalid character codepoint'];
public function __construct(DataStream $data) { public function __construct(Data $data) {
$this->data = $data; $this->data = $data;
// Set the error handler and honor already-set error reporting rules. // Set the error handler and honor already-set error reporting rules.

7
lib/Parser.php

@ -5,7 +5,7 @@ namespace dW\HTML5;
class Parser { class Parser {
/* Non-static properties */ /* Non-static properties */
// Input data that's being parsed, uses DataStream // Input data that's being parsed, uses Data
protected $data; protected $data;
// The DOMDocument that is assembled by the tree builder // The DOMDocument that is assembled by the tree builder
protected $DOM; protected $DOM;
@ -34,9 +34,6 @@ class Parser {
/* Static properties */ /* Static properties */
// For debugging
public static $debug = false;
// Property used as an instance for the non-static properties // Property used as an instance for the non-static properties
protected static $instance; protected static $instance;
@ -72,7 +69,7 @@ class Parser {
} }
// Process the input stream. // Process the input stream.
static::$instance->data = new DataStream(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN'); static::$instance->data = new Data(($file === true) ? '' : $data, ($file === true) ? $data : 'STDIN');
// Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only // Set the locale for CTYPE to en_US.UTF8 so ctype functions and strtolower only
// work on basic latin characters. Used extensively when tokenizing. // work on basic latin characters. Used extensively when tokenizing.

7
lib/Tokenizer.php

@ -8,6 +8,8 @@ class Tokenizer {
protected $data; protected $data;
protected $stack; protected $stack;
public static $debug = false;
const DATA_STATE = 0; const DATA_STATE = 0;
const RCDATA_STATE = 1; const RCDATA_STATE = 1;
const RAWTEXT_STATE = 2; const RAWTEXT_STATE = 2;
@ -78,7 +80,7 @@ class Tokenizer {
const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; const CTYPE_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; const CTYPE_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
public function __construct(DataStream $data, OpenElementsStack $stack) { public function __construct(Data $data, OpenElementsStack $stack) {
$this->state = self::DATA_STATE; $this->state = self::DATA_STATE;
$this->data = $data; $this->data = $data;
$this->stack = $stack; $this->stack = $stack;
@ -86,7 +88,7 @@ class Tokenizer {
public function createToken(): Token { public function createToken(): Token {
while (true) { while (true) {
if (Parser::$debug) { if (self::$debug) {
switch ($this->state) { switch ($this->state) {
case self::DATA_STATE: $state = "Data"; case self::DATA_STATE: $state = "Data";
break; break;
@ -260,6 +262,7 @@ class Tokenizer {
// OPTIMIZATION: Consume all characters that don't match what is above and emit // OPTIMIZATION: Consume all characters that don't match what is above and emit
// that as a character token instead to prevent having to loop back through here // that as a character token instead to prevent having to loop back through here
// every single time. // every single time.
return new CharacterToken($char.$this->data->consumeUntil('&<')); return new CharacterToken($char.$this->data->consumeUntil('&<'));
} }
} }

42
lib/TreeBuilder.php

@ -42,6 +42,8 @@ class TreeBuilder {
// Instance used with the static token insertion methods. // Instance used with the static token insertion methods.
protected static $instance; protected static $instance;
// Used for debugging to print out information as the tree is built.
protected static $debug = false;
// Constants used for insertion modes // Constants used for insertion modes
@ -119,8 +121,11 @@ class TreeBuilder {
$adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName; $adjustedCurrentNodeName = $this->stack->adjustedCurrentNodeName;
$adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace; $adjustedCurrentNodeNamespace = $this->stack->adjustedCurrentNodeNamespace;
if (Parser::$debug) { if (self::$debug) {
echo "Node: $adjustedCurrentNodeName\n"; echo "Node: $adjustedCurrentNodeName\n";
echo "\nToken: \n";
var_export($token);
echo "\n\n";
} }
# 8.2.5 Tree construction # 8.2.5 Tree construction
@ -175,11 +180,6 @@ class TreeBuilder {
} }
} }
# TEMPORARY
echo "\n";
var_export($token);
echo "\n\n";
break; break;
} }
} }
@ -189,7 +189,7 @@ class TreeBuilder {
// Loop used when processing the token under different rules; always breaks. // Loop used when processing the token under different rules; always breaks.
while (true) { while (true) {
if (Parser::$debug) { if (self::$debug) {
switch ($insertionMode) { switch ($insertionMode) {
case self::INITIAL_MODE: $mode = "Initial"; case self::INITIAL_MODE: $mode = "Initial";
break; break;
@ -251,7 +251,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character. // tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) { if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token. # Ignore the token.
} }
# A comment token # A comment token
@ -425,7 +425,7 @@ class TreeBuilder {
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
// OPTIMIZATION: Will check for multiple space characters at once as character // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character. // tokens can contain more than one character.
elseif ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { elseif ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token. # Ignore the token.
} }
# A start tag whose tag name is "html" # A start tag whose tag name is "html"
@ -468,7 +468,9 @@ class TreeBuilder {
case self::BEFORE_HEAD_MODE: case self::BEFORE_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Ignore the token. # Ignore the token.
} }
# A comment token # A comment token
@ -524,7 +526,9 @@ class TreeBuilder {
case self::IN_HEAD_MODE: case self::IN_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data))) { // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character. # Insert the character.
$this->insertCharacterToken($token); $this->insertCharacterToken($token);
} }
@ -813,7 +817,9 @@ class TreeBuilder {
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
# A comment token # A comment token
elseif (($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) || // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
elseif (($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) ||
$token instanceof CommentToken) { $token instanceof CommentToken) {
# Process the token using the rules for the "in head" insertion mode. # Process the token using the rules for the "in head" insertion mode.
$insertionMode = self::IN_HEAD_MODE; $insertionMode = self::IN_HEAD_MODE;
@ -838,7 +844,9 @@ class TreeBuilder {
case self::AFTER_HEAD_MODE: case self::AFTER_HEAD_MODE:
# A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED # A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED
# (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE # (LF), U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
if ($token instanceof CharacterToken && (strspn($token->data, "\t\n\x0c\x0d ") === strlen($token->data))) { // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if ($token instanceof CharacterToken && (strspn($token->data, Data::WHITESPACE) === strlen($token->data))) {
# Insert the character. # Insert the character.
$this->insertCharacterToken($token); $this->insertCharacterToken($token);
} }
@ -962,7 +970,9 @@ class TreeBuilder {
# Insert the token’s character. # Insert the token’s character.
$this->insertCharacterToken($token); $this->insertCharacterToken($token);
if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) { // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character.
if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok". # Set the frameset-ok flag to "not ok".
$this->framesetOk = false; $this->framesetOk = false;
} }
@ -1074,7 +1084,7 @@ class TreeBuilder {
} }
protected function parseTokenInForeignContent(Token $token): bool { protected function parseTokenInForeignContent(Token $token): bool {
if (Parser::$debug) { if (self::$debug) {
echo "Foreign Content\n"; echo "Foreign Content\n";
} }
@ -1092,7 +1102,7 @@ class TreeBuilder {
# Any other character token # Any other character token
// OPTIMIZATION: Will check for multiple space characters at once as character // OPTIMIZATION: Will check for multiple space characters at once as character
// tokens can contain more than one character. // tokens can contain more than one character.
if (strspn($token->data, "\t\n\x0c\x0d ") !== strlen($token->data)) { if (strspn($token->data, Data::WHITESPACE) !== strlen($token->data)) {
# Set the frameset-ok flag to "not ok". # Set the frameset-ok flag to "not ok".
$this->$framesetOk = false; $this->$framesetOk = false;
} }

Loading…
Cancel
Save