Modern DOM library written in PHP for HTML documents
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

455 lines
37 KiB

<?php
declare(strict_types=1);
namespace dW\HTML5;
class DataStream
{
// Used to get the file path for error reporting. The constructor stores the
// resolved real path of the input file here, or the literal 'STDIN' when no
// file path was supplied.
public $filePath;
// Internal storage for the Intl data object: the \MensBeam\Intl\Encoding\UTF8
// instance created by the constructor around the preprocessed input.
protected $data;
// Used for error reporting to display line number. Starts at 1 and is exposed
// read-only as the magic property "line" (see __get()).
protected $_line = 1;
// Used for error reporting to display column number. Starts at 0 and is exposed
// read-only as the magic property "column" (see __get()).
protected $_column = 0;
// Used for error reporting when unconsuming to calculate column number from
// last newline. Each entry is the posChar() value recorded immediately after a
// line feed was consumed.
protected $newlines = [];
// Character sets passed as the $match argument of consumeWhile()/peekWhile().
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
const DIGIT = '0123456789';
const HEX = '0123456789ABCDEFabcdef';
/**
 * Preprocesses the input and wraps it in a character-wise UTF-8 decoder.
 *
 * When $filePath is anything other than 'STDIN' the file is read from disk and
 * its contents REPLACE $data. The input is then stripped of a leading byte
 * order mark, NULL characters, control characters and noncharacters, and its
 * line breaks are normalised to LF.
 *
 * @param string $data     The markup to tokenize (ignored when a file path is given)
 * @param string $filePath Path of the file to read, or 'STDIN' to use $data
 *
 * @throws \RuntimeException When the file cannot be resolved or read
 */
public function __construct(string $data, string $filePath = 'STDIN') {
    if ($filePath !== 'STDIN') {
        // realpath() and file_get_contents() both signal failure with false;
        // fail loudly here rather than letting false leak into string functions.
        $resolved = realpath($filePath);
        $contents = ($resolved !== false) ? file_get_contents($resolved) : false;
        if ($contents === false) {
            throw new \RuntimeException("Unable to read input file: {$filePath}");
        }

        $this->filePath = $resolved;
        $data = $contents;
    } else {
        $this->filePath = $filePath;
    }

    // DEVIATION: The spec has steps for parsing and determining the character
    // encoding. At this moment this implementation won't determine a character
    // encoding and will just assume UTF-8.

    # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present
    # in the input stream.
    # Note: The handling of U+0000 NULL characters varies based on where the
    # characters are found. In general, they are ignored except where doing so could
    # plausibly introduce an attack vector. This handling is, by necessity, spread
    # across both the tokenization stage and the tree construction stage.
    // DEVIATION: Just going to remove NULL characters. There is no scripting involved
    // in this implementation and therefore no attack vector possible due to it.
    $data = preg_replace(['/^\xEF\xBB\xBF/', '/\x00/'], '', $data);

    // Won't provide line or column counts for this as it's done before that
    // information is available. It will be rare that this is triggered.
    $discard = function (array $matches): string {
        ParseError::trigger(ParseError::INVALID_CONTROL_OR_NONCHARACTERS, $this);
        return '';
    };

    // Pass 1: ASCII control characters. Matched byte-wise (no /u modifier), which
    // is safe because bytes below 0x80 never occur inside a multi-byte UTF-8
    // sequence, and which keeps working even when the input is malformed UTF-8.
    $data = preg_replace_callback('/[\x01-\x08\x0B\x0E-\x1F\x7F]/', $discard, $data);

    // Pass 2: C1 controls and noncharacters, matched by code point. With the /u
    // modifier "\xC2[\x80-\x9F]"-style byte escapes would be read as code points
    // and never match, so \x{...} code point escapes are used. Surrogates are not
    // listed: they cannot occur in a subject PCRE accepts as valid UTF-8.
    $filtered = preg_replace_callback(
        '/[\x{80}-\x{9F}\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}'
        . '\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}'
        . '\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}'
        . '\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}'
        . '\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
        $discard,
        $data
    );
    // A null result means PCRE rejected the subject (typically malformed UTF-8).
    // Keep the pass-1 output in that case instead of passing null onwards.
    // NOTE(review): presumably the decoder below copes with malformed sequences
    // on its own -- confirm against the MensBeam\Intl documentation.
    if ($filtered !== null) {
        $data = $filtered;
    }

    // Normalize line breaks. Convert CRLF and CR to LF.
    // Break the string up into a traversable object.
    $this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data));
}
/**
 * Reads $length characters from the stream, advancing the pointer and keeping
 * the line/column counters and the newline position list up to date.
 *
 * @param int $length Number of characters to read; must be positive
 *
 * @return string The characters read, concatenated in stream order
 *
 * @throws Exception When $length is zero or negative
 */
public function consume(int $length = 1): string {
    if ($length <= 0) {
        throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
    }

    $consumed = '';
    $remaining = $length;
    while ($remaining-- > 0) {
        $next = $this->data->nextChar();

        if ($next !== "\n") {
            $this->_column++;
        } else {
            // Remember where the new line begins so that unconsume() can work
            // out a column number after stepping back over this line feed.
            $this->newlines[] = $this->data->posChar();
            $this->_column = 1;
            $this->_line++;
        }

        $consumed .= $next;
    }

    return $consumed;
}
/**
 * Moves the pointer back by $length characters and rewinds the line/column
 * counters accordingly.
 *
 * @param int $length Number of characters to step back over; must be positive
 *
 * @throws Exception When $length is zero or negative
 */
public function unconsume(int $length = 1) {
    if ($length <= 0) {
        throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
    }

    $this->data->seek(0 - $length);
    // Look at what was just stepped over to find out how many lines were crossed.
    $string = $this->data->peekChar($length);
    $numOfNewlines = substr_count($string, "\n");

    if ($numOfNewlines === 0) {
        $this->_column -= $length;
        return;
    }

    $this->_line -= $numOfNewlines;

    // The line feeds stepped over are no longer consumed: drop their recorded
    // positions so the list always describes the text before the pointer and
    // does not collect duplicates when the same text is consumed again.
    array_splice($this->newlines, -$numOfNewlines);

    $position = $this->data->posChar();
    $remaining = count($this->newlines);
    if ($remaining > 0) {
        // Same convention as consume(): the column is 1 right after a line feed
        // and grows by one for every further character on that line.
        $this->_column = 1 + $position - $this->newlines[$remaining - 1];
    } else {
        // Back on the first line, where consume() counts up from 0, so the
        // column equals the number of characters before the pointer.
        $this->_column = $position;
    }
}
/**
 * Consumes consecutive characters for as long as each one appears in $match.
 *
 * @param string $match The set of accepted characters, written as a string
 * @param int    $limit Maximum number of characters to take; 0 means no limit
 *
 * @return string The characters taken, or '' when the first one is not in $match
 */
public function consumeWhile(string $match, int $limit = 0): string {
    $taken = $this->span($match, true, true, $limit);

    return $taken;
}
/**
 * Consumes consecutive characters up to, but not including, the first one that
 * appears in $match.
 *
 * @param string $match The set of stop characters, written as a string
 * @param int    $limit Maximum number of characters to take; 0 means no limit
 *
 * @return string The characters taken, or '' when the first one is in $match
 */
public function consumeUntil(string $match, int $limit = 0): string {
    $taken = $this->span($match, false, true, $limit);

    return $taken;
}
/**
 * Returns the next $length characters as reported by the decoder's peekChar(),
 * without calling any of the pointer-moving methods.
 *
 * @param int $length Number of characters to look at; must be positive
 *
 * @return string Whatever peekChar() yields for that length
 *
 * @throws Exception When $length is zero or negative
 */
public function peek(int $length = 1): string {
    if ($length > 0) {
        return $this->data->peekChar($length);
    }

    throw new Exception(Exception::DATASTREAM_INVALID_DATA_CONSUMPTION_LENGTH, $length);
}
/**
 * Look-ahead counterpart of consumeWhile(): runs the same scan through span()
 * with pointer advancement switched off.
 *
 * @param string $match The set of accepted characters, written as a string
 * @param int    $limit Maximum number of characters to report; 0 means no limit
 *
 * @return string The leading run of characters that all appear in $match
 */
public function peekWhile(string $match, int $limit = 0): string {
    $seen = $this->span($match, true, false, $limit);

    return $seen;
}
/**
 * Look-ahead counterpart of consumeUntil(): runs the same scan through span()
 * with pointer advancement switched off.
 *
 * @param string $match The set of stop characters, written as a string
 * @param int    $limit Maximum number of characters to report; 0 means no limit
 *
 * @return string The leading run of characters that do not appear in $match
 */
public function peekUntil(string $match, int $limit = 0): string {
    $seen = $this->span($match, false, false, $limit);

    return $seen;
}
// Named character references that, for historical reasons, are also recognised
// without their terminating semicolon (the 106 identifiers listed without ';'
// in the spec's named character references table).
const LEGACY_NAMED_REFERENCES = [
    'AElig', 'AMP', 'Aacute', 'Acirc', 'Agrave', 'Aring', 'Atilde', 'Auml',
    'COPY', 'Ccedil', 'ETH', 'Eacute', 'Ecirc', 'Egrave', 'Euml', 'GT',
    'Iacute', 'Icirc', 'Igrave', 'Iuml', 'LT', 'Ntilde', 'Oacute', 'Ocirc',
    'Ograve', 'Oslash', 'Otilde', 'Ouml', 'QUOT', 'REG', 'THORN', 'Uacute',
    'Ucirc', 'Ugrave', 'Uuml', 'Yacute', 'aacute', 'acirc', 'acute', 'aelig',
    'agrave', 'amp', 'aring', 'atilde', 'auml', 'brvbar', 'ccedil', 'cedil',
    'cent', 'copy', 'curren', 'deg', 'divide', 'eacute', 'ecirc', 'egrave',
    'eth', 'euml', 'frac12', 'frac14', 'frac34', 'gt', 'iacute', 'icirc',
    'iexcl', 'igrave', 'iquest', 'iuml', 'laquo', 'lt', 'macr', 'micro',
    'middot', 'nbsp', 'not', 'ntilde', 'oacute', 'ocirc', 'ograve', 'ordf',
    'ordm', 'oslash', 'otilde', 'ouml', 'para', 'plusmn', 'pound', 'quot',
    'raquo', 'reg', 'sect', 'shy', 'sup1', 'sup2', 'sup3', 'szlig', 'thorn',
    'times', 'uacute', 'ucirc', 'ugrave', 'uml', 'uuml', 'yacute', 'yen', 'yuml',
];

// Replacement characters for numeric references the spec remaps (NULL plus the
// Windows-1252 interpretations of the C1 range). Keyed by the referenced number.
const NUMERIC_REFERENCE_REPLACEMENTS = [
    0x00 => "\u{FFFD}", // REPLACEMENT CHARACTER
    0x80 => "\u{20AC}", // EURO SIGN
    0x82 => "\u{201A}", // SINGLE LOW-9 QUOTATION MARK
    0x83 => "\u{0192}", // LATIN SMALL LETTER F WITH HOOK
    0x84 => "\u{201E}", // DOUBLE LOW-9 QUOTATION MARK
    0x85 => "\u{2026}", // HORIZONTAL ELLIPSIS
    0x86 => "\u{2020}", // DAGGER
    0x87 => "\u{2021}", // DOUBLE DAGGER
    0x88 => "\u{02C6}", // MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89 => "\u{2030}", // PER MILLE SIGN
    0x8A => "\u{0160}", // LATIN CAPITAL LETTER S WITH CARON
    0x8B => "\u{2039}", // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C => "\u{0152}", // LATIN CAPITAL LIGATURE OE
    0x8E => "\u{017D}", // LATIN CAPITAL LETTER Z WITH CARON
    0x91 => "\u{2018}", // LEFT SINGLE QUOTATION MARK
    0x92 => "\u{2019}", // RIGHT SINGLE QUOTATION MARK
    0x93 => "\u{201C}", // LEFT DOUBLE QUOTATION MARK
    0x94 => "\u{201D}", // RIGHT DOUBLE QUOTATION MARK
    0x95 => "\u{2022}", // BULLET
    0x96 => "\u{2013}", // EN DASH
    0x97 => "\u{2014}", // EM DASH
    0x98 => "\u{02DC}", // SMALL TILDE
    0x99 => "\u{2122}", // TRADE MARK SIGN
    0x9A => "\u{0161}", // LATIN SMALL LETTER S WITH CARON
    0x9B => "\u{203A}", // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C => "\u{0153}", // LATIN SMALL LIGATURE OE
    0x9E => "\u{017E}", // LATIN SMALL LETTER Z WITH CARON
    0x9F => "\u{0178}", // LATIN CAPITAL LETTER Y WITH DIAERESIS
];

/**
 * Tokenizes a character reference. Expects the pointer to sit immediately after
 * the U+0026 AMPERSAND that introduced it.
 *
 * OPTIMIZATION: Where the spec says to return a character token this method
 * returns the character(s) themselves; the token is emitted by the caller.
 * Where the spec says to return nothing this method returns '&', because every
 * use of "tokenizing a character reference" emits a '&' token upon failure.
 *
 * @param string|null $allowedCharacter The "additional allowed character", if any
 * @param bool        $inAttribute      Whether the reference is part of an attribute value
 *
 * @return string The decoded character(s), or '&' when no reference was consumed
 */
public function consumeCharacterReference(string $allowedCharacter = null, bool $inAttribute = false): string {
    $char = $this->peek();

    # The behavior depends on the identity of the next character (the one immediately
    # after the U+0026 AMPERSAND character), as follows: U+0009 CHARACTER TABULATION
    # (tab), U+000A LINE FEED (LF), U+000C FORM FEED (FF), U+0020 SPACE, U+003C
    # LESS-THAN SIGN, U+0026 AMPERSAND, EOF, The additional allowed character, if
    # there is one. Not a character reference. No characters are consumed, and nothing
    # is returned. (This is not an error, either.)
    // peek() is declared to return a string, so running out of input can only
    // show up as an empty string here, never as false.
    if ($char === '' || $char === "\x09" || $char === "\x0A" || $char === "\x0C" || $char === ' ' || $char === '<' || $char === '&' || ($allowedCharacter !== null && $char === $allowedCharacter)) {
        return '&';
    }

    # U+0023 NUMBER SIGN (#)
    if ($char === '#') {
        return $this->consumeNumericCharacterReference();
    }

    # Anything else
    return $this->consumeNamedCharacterReference($inAttribute);
}

/**
 * Handles the "&#..." form. Expects the pointer to sit on the U+0023 NUMBER SIGN.
 *
 * @return string The referenced character, or '&' when no digits follow
 */
protected function consumeNumericCharacterReference(): string {
    # Consume the U+0023 NUMBER SIGN.
    $this->consume();

    # The behavior further depends on the character after the U+0023 NUMBER SIGN:
    # U+0078 LATIN SMALL LETTER X, U+0058 LATIN CAPITAL LETTER X: consume the X and
    # follow the steps with ASCII hex digits; anything else: ASCII digits.
    $char = $this->peek();
    $isHex = ($char === 'x' || $char === 'X');
    if ($isHex) {
        $this->consume();
    }

    # Consume as many characters as match the range of characters given above.
    $digits = $this->consumeWhile($isHex ? self::HEX : self::DIGIT);

    # If no characters match the range, then don't consume any characters (and
    # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
    # character). This is a parse error; nothing is returned.
    // Compared against '' on purpose: a lone "0" is a digit but is falsy in PHP.
    if ($digits === '') {
        ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), $isHex ? 'hexadecimal digit' : 'decimal digit');
        $this->unconsume($isHex ? 2 : 1);
        return '&';
    }

    # Otherwise, if the next character is a U+003B SEMICOLON, consume that too. If it
    # isn't, there is a parse error.
    $char = $this->peek();
    if ($char === ';') {
        $this->consume();
    } else {
        ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
    }

    # If one or more characters match the range, then take them all and interpret the
    # string of characters as a number (either hexadecimal or decimal as appropriate).
    // Seven significant digits cover U+10FFFF in either base; anything longer is
    // out of range, and treating it so up front avoids integer/float overflow.
    $significant = ltrim($digits, '0');
    if (strlen($significant) > 7) {
        $codePoint = 0x110000;
    } else {
        $codePoint = $isHex ? (int) hexdec($significant) : (int) $significant;
    }

    # If that number is one of the numbers in the first column of the following table,
    # then this is a parse error. Find the row with that number in the first column,
    # and return a character token for the Unicode character given in the second
    # column of that row.
    // NULL is handled here too: "&#0;" is a reference to NULL, not a literal NULL,
    // so it survives the NULL stripping done by the constructor.
    // NOTE(review): the constants below were referenced as Error::* before, unlike
    // every other trigger() call in this class; confirm they exist on ParseError.
    if (isset(self::NUMERIC_REFERENCE_REPLACEMENTS[$codePoint])) {
        ParseError::trigger(ParseError::INVALID_NUMERIC_ENTITY, $codePoint);
        return self::NUMERIC_REFERENCE_REPLACEMENTS[$codePoint];
    }

    # Otherwise, if the number is in the range 0xD800 to 0xDFFF or is greater than
    # 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
    # character token.
    if (($codePoint >= 0xD800 && $codePoint <= 0xDFFF) || $codePoint > 0x10FFFF) {
        ParseError::trigger(ParseError::INVALID_CODEPOINT, $codePoint);
        return "\u{FFFD}";
    }

    # Otherwise, return a character token for the Unicode character whose code point
    # is that number. Additionally, if the number is in the range 0x0001 to 0x0008,
    # 0x000D to 0x001F, 0x007F to 0x009F, 0xFDD0 to 0xFDEF, or is one of 0x000B,
    # 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, ... 0x10FFFE, or 0x10FFFF, then this is a
    # parse error.
    // The character is still returned in that case; only the error is reported.
    // ($codePoint & 0xFFFE) === 0xFFFE selects the last two code points of every
    // plane, which is exactly the U+nFFFE / U+nFFFF list above.
    if (($codePoint >= 0x0001 && $codePoint <= 0x0008)
        || ($codePoint >= 0x000D && $codePoint <= 0x001F)
        || ($codePoint >= 0x007F && $codePoint <= 0x009F)
        || ($codePoint >= 0xFDD0 && $codePoint <= 0xFDEF)
        || $codePoint === 0x000B
        || ($codePoint & 0xFFFE) === 0xFFFE
    ) {
        ParseError::trigger(ParseError::INVALID_CODEPOINT, $codePoint);
    }

    return mb_convert_encoding(pack('N', $codePoint), 'UTF-8', 'UCS-4BE');
}

/**
 * Handles the "&name;" form. Expects the pointer to sit on the first character
 * after the ampersand.
 *
 * @param bool $inAttribute Whether the reference is part of an attribute value
 *
 * @return string The decoded character(s), or '&' when no reference was consumed
 */
protected function consumeNamedCharacterReference(bool $inAttribute): string {
    # Consume the maximum number of characters possible, with the consumed characters
    # matching one of the identifiers in the first column of the named character
    # references table (in a case-sensitive manner).
    // Implemented by looking ahead (without consuming) at up to 33 characters that
    // match 0-9, A-Z, a-z and ';', plus one extra character so that whatever
    // follows the matched identifier is known as well. The allowed characters are
    // all ASCII, so byte offsets and character offsets agree inside $sequence.
    $ahead = $this->peek(34);
    $run = min(33, strspn($ahead, self::DIGIT . self::ALPHA . ';'));
    $sequence = (string) substr($ahead, 0, $run);

    $name = $this->matchNamedCharacterReference($sequence);
    if ($name !== '') {
        $terminated = (substr($name, -1) === ';');
        // The character immediately after the matched identifier.
        $next = mb_substr($ahead, strlen($name), 1, 'UTF-8');

        # If the character reference is being consumed as part of an attribute, and the
        # last character matched is not a U+003B SEMICOLON character (;), and the next
        # character is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
        # character, then, for historical reasons, all the characters that were matched
        # after the U+0026 AMPERSAND character (&) must be unconsumed, and nothing is
        # returned. However, if this next character is in fact a U+003D EQUALS SIGN
        # character (=), then this is a parse error, because some legacy user agents will
        # misinterpret the markup in those cases.
        // OPTIMIZATION: Nothing has been consumed yet, so there is nothing to unconsume.
        if ($inAttribute && !$terminated && ($next === '=' || preg_match('/^[A-Za-z0-9]$/', $next) === 1)) {
            if ($next === '=') {
                ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
            }
            return '&';
        }

        $this->consume(strlen($name));

        # Otherwise, a character reference is parsed. If the last character matched is
        # not a U+003B SEMICOLON character (;), there is a parse error.
        if (!$terminated) {
            ParseError::trigger(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
            // PHP's entity decoder only recognises semicolon-terminated names.
            $name .= ';';
        }

        # Return one or two character tokens for the character(s) corresponding to the
        # character reference name (as given by the second column of the named character
        # references table).
        // ENT_QUOTES is required: without it &quot; and &apos; are left undecoded.
        return html_entity_decode('&' . $name, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    }

    # If no match can be made, then no characters are consumed, and nothing is
    # returned. In this case, if the characters after the U+0026 AMPERSAND character
    # (&) consist of a sequence of one or more alphanumeric ASCII characters followed
    # by a U+003B SEMICOLON character (;), then this is a parse error.
    if (preg_match('/^[A-Za-z0-9]+;/', $sequence, $matches) === 1) {
        ParseError::trigger(ParseError::INVALID_NAMED_ENTITY, $matches[0]);
    }

    return '&';
}

/**
 * Finds the longest prefix of $sequence that is an identifier of the named
 * character references table.
 *
 * DEVIATION: Instead of carrying the table, semicolon-terminated names are
 * validated by asking PHP's HTML5 entity decoder whether it recognises them;
 * names without a semicolon are checked against LEGACY_NAMED_REFERENCES.
 *
 * @param string $sequence Characters following the ampersand (ASCII alphanumerics and ';')
 *
 * @return string The matched identifier (including ';' when present), or ''
 */
protected function matchNamedCharacterReference(string $sequence): string {
    $semicolon = strpos($sequence, ';');
    if ($semicolon !== false) {
        // Only the text up to the first semicolon can form a terminated name, and
        // when it does it is necessarily the longest possible match.
        $candidate = '&' . substr($sequence, 0, $semicolon + 1);
        if (html_entity_decode($candidate, ENT_QUOTES | ENT_HTML5, 'UTF-8') !== $candidate) {
            return substr($candidate, 1);
        }
        $sequence = (string) substr($sequence, 0, $semicolon);
    }

    // Legacy names are between two and six characters long; try longest first.
    for ($length = min(strlen($sequence), 6); $length >= 2; $length--) {
        $candidate = substr($sequence, 0, $length);
        if (in_array($candidate, self::LEGACY_NAMED_REFERENCES, true)) {
            return $candidate;
        }
    }

    return '';
}
/**
 * Scans forward from the pointer and collects a run of characters, in the
 * manner of strspn() ($while = true) or strcspn() ($while = false).
 *
 * @param string $match          The character set, written as a string
 * @param bool   $while          true: collect while characters are in the set;
 *                               false: collect until a character in the set is met
 * @param bool   $advancePointer true: leave the pointer just after the run and
 *                               update line/column; false: restore the pointer
 *                               and leave line/column untouched
 * @param int    $limit          Maximum run length; 0 means no limit
 *
 * @return string The collected run, possibly ''
 */
protected function span(string $match, bool $while = true, bool $advancePointer = true, int $limit = 0): string {
    // Break the matching characters into a lookup table keyed by character.
    // Unicode friendly. A $match that is not valid UTF-8 yields an empty set.
    $chars = preg_split('//u', $match, -1, PREG_SPLIT_NO_EMPTY);
    $set = ($chars === false) ? [] : array_flip($chars);

    // Positions are tracked explicitly so the pointer can be put back exactly,
    // however the loop ends (rejected character, limit reached, end of input).
    $start = $this->data->posChar();
    $end = $start;

    $count = 0;
    $string = '';
    while (true) {
        $char = $this->data->nextChar();
        if ($char === '') {
            break;
        }

        // strspn stops at the first character outside the set, strcspn at the
        // first character inside it.
        if (isset($set[$char]) !== $while) {
            break;
        }

        // Line and column describe consumed input only; a look-ahead must not
        // disturb them.
        if ($advancePointer) {
            if ($char === "\n") {
                $this->newlines[] = $this->data->posChar();
                $this->_column = 1;
                $this->_line++;
            } else {
                $this->_column++;
            }
        }

        $string .= $char;
        $end = $this->data->posChar();
        $count++;

        if ($count === $limit) {
            break;
        }
    }

    // Step back over the rejected character (if one was read) and, when only
    // peeking, over the whole run as well.
    $target = $advancePointer ? $end : $start;
    $offset = $target - $this->data->posChar();
    if ($offset !== 0) {
        $this->data->seek($offset);
    }

    return $string;
}
/**
 * Read-only access to the position counters through the magic properties
 * "column" and "line". Any other property name yields null.
 *
 * @param mixed $property Name of the property being read
 *
 * @return int|null The requested counter, or null for an unknown name
 */
public function __get($property) {
    // Loose comparison is deliberate: it is what a switch statement performs.
    if ($property == 'column') {
        return $this->_column;
    }

    if ($property == 'line') {
        return $this->_line;
    }

    return null;
}
}