errorHandler = $errorHandler ?? new ParseError; if ($filePath !== 'STDIN') { $this->filePath = realpath($filePath); $data = file_get_contents($this->filePath); } else { $this->filePath = $filePath; } // DEVIATION: The spec has steps for parsing and determining the character // encoding. At this moment this implementation won't determine a character // encoding and will just assume UTF-8. # One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present # in the input stream. # Note: The handling of U+0000 NULL characters varies based on where the # characters are found. In general, they are ignored except where doing so could # plausibly introduce an attack vector. This handling is, by necessity, spread # across both the tokenization stage and the tree construction stage. // DEVIATION: Just going to remove NULL characters. There is no scripting involved // in this implementation and therefore no attack vector possible due to it. $data = preg_replace(['/^\xEF\xBB\xBF/','/\x00/'], '', $data); // Won't provide line or column counts for this as it's done before that // information is available. It will be rare that this is triggered. $data = preg_replace_callback('/(?:[\x01-\x08\x0B\x0E-\x1F\x7F]|\xC2[\x80-\x9F]|\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF])|\xEF\xB7[\x90-\xAF]|\xEF\xBF[\xBE\xBF]|[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF])/u', function($matches) { $this->error(ParseError::INVALID_CONTROL_OR_NONCHARACTERS); return ''; }, $data); // Normalize line breaks. Convert CRLF and CR to LF. // Break the string up into a traversable object. 
$this->data = new \MensBeam\Intl\Encoding\UTF8(str_replace(["\r\n", "\r"], "\n", $data));
    }

    /**
     * Consumes characters from the input stream, advancing the pointer.
     *
     * The line and column counters are updated for every character read, and
     * the pointer position reached right after each line feed is recorded so
     * that unconsume() can restore the column when rewinding across lines.
     *
     * @param int $length Number of characters to consume; must be positive.
     * @return string The characters read. nextChar() yields an empty string at
     *                the end of input (see span()), so the result may be
     *                shorter than $length.
     */
    public function consume(int $length = 1): string {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        for ($i = 0, $string = ''; $i < $length; $i++) {
            $char = $this->data->nextChar();

            if ($char === "\n") {
                $this->newlines[] = $this->data->posChar();
                $this->_column = 1;
                $this->_line++;
            } else {
                $this->_column++;
            }

            $string .= $char;
        }

        if (self::$debug) {
            echo "\nConsume\n==========\n";
            echo "Length: $length\n";
            echo "Data: ";
            var_export($string);
            echo "\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }

        return $string;
    }

    /**
     * Moves the pointer back over previously consumed characters and restores
     * the line and column counters accordingly.
     *
     * @param int $length Number of characters to give back; must be positive.
     */
    public function unconsume(int $length = 1) {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        $this->data->seek(0 - $length);

        // Look at what has just been given back to find out whether any line
        // boundaries were crossed.
        $string = $this->data->peekChar($length);
        $numOfNewlines = substr_count($string, "\n");

        if ($numOfNewlines > 0) {
            $this->_line -= $numOfNewlines;

            // The line feeds that were given back lie ahead of the pointer
            // again, so their recorded positions must be forgotten; otherwise
            // they would be recorded a second time when re-consumed.
            array_splice($this->newlines, -$numOfNewlines);

            // consume() stores the pointer position reached right after each
            // line feed, i.e. where the following line starts. The last
            // remaining entry is therefore the start of the current line; with
            // no entries left the pointer is back on the first line, which
            // starts at position 0.
            $lineStart = ($this->newlines) ? end($this->newlines) : 0;
            $this->_column = 1 + $this->data->posChar() - $lineStart;
        } else {
            $this->_column -= $length;
        }

        if (self::$debug) {
            echo "\nUnconsume\n==========\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }
    }

    /**
     * Consumes characters for as long as they are found in $match.
     *
     * @param string $match Set of characters to accept, given as a string.
     * @param int $limit Maximum number of characters to take; 0 for no limit.
     * @return string The consumed characters.
     */
    public function consumeWhile(string $match, int $limit = 0): string {
        return $this->span($match, true, true, $limit);
    }

    /**
     * Consumes characters until one found in $match is encountered.
     *
     * @param string $match Set of characters to stop at, given as a string.
     * @param int $limit Maximum number of characters to take; 0 for no limit.
     * @return string The consumed characters.
     */
    public function consumeUntil(string $match, int $limit = 0): string {
        return $this->span($match, false, true, $limit);
    }

    /**
     * Returns upcoming characters without updating the line and column
     * counters.
     *
     * @param int $length Number of characters to look at; must be positive.
     * @return string The peeked characters.
     */
    public function peek(int $length = 1): string {
        assert($length > 0, new Exception(Exception::DATA_INVALID_DATA_CONSUMPTION_LENGTH, $length));

        $string = $this->data->peekChar($length);

        if (self::$debug) {
            echo "\nPeek\n==========\n";
            echo "Data: ";
            var_export($string);
            echo "\n";
            echo "Pointer: {$this->data->posChar()}\n";
            echo "==========\n\n";
        }

        return $string;
    }

    /**
     * Like consumeWhile(), but the line and column counters are left alone and
     * the pointer is rewound afterwards.
     *
     * @param string $match Set of characters to accept, given as a string.
     * @param int $limit Maximum number of characters to take; 0 for no limit.
     * @return string The peeked characters.
     */
    public function peekWhile(string $match, int $limit = 0): string {
        return $this->span($match, true, false, $limit);
    }

    /**
     * Like consumeUntil(), but the line and column counters are left alone and
     * the pointer is rewound afterwards.
     *
     * @param string $match Set of characters to stop at, given as a string.
     * @param int $limit Maximum number of characters to take; 0 for no limit.
     * @return string The peeked characters.
     */
    public function peekUntil(string $match, int $limit = 0): string {
        return $this->span($match, false, false, $limit);
    }

    /**
     * Attempts to consume a character reference following a U+0026 AMPERSAND.
     *
     * @param string|null $allowedCharacter The "additional allowed character"
     *                                      of the spec, if there is one.
     * @param bool $inAttribute Whether the reference is being consumed as part
     *                          of an attribute value.
     * @return string The decoded character(s), or '&' when no character
     *                reference could be consumed.
     */
    public function consumeCharacterReference(string $allowedCharacter = null, bool $inAttribute = false): string {
        $char = $this->peek();

        // OPTIMIZATION: When this spec states to return a character token of any kind this
        // method will just return the character. The token will be emitted from
        // Parser::parse() instead. Likewise, if the spec states to return nothing this
        // method will instead return '&' because every single use of "tokenizing a
        // character reference" in the spec this emits a '&' character token upon failure.

        # The behavior depends on the identity of the next character (the one immediately
        # after the U+0026 AMPERSAND character), as follows:

        # U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED (LF), U+000C FORM FEED (FF),
        # U+0020 SPACE, U+003C LESS-THAN SIGN, U+0026 AMPERSAND, EOF, The additional
        # allowed character, if there is one.
        # Not a character reference. No characters are consumed, and nothing is returned.
        # (This is not an error, either.)
        if (
            $char === "\x09"
            || $char === "\x0A"
            || $char === "\x0C"
            || $char === ' '
            || $char === '<'
            || $char === '&'
            || $char === ''
            || (!is_null($allowedCharacter) && $char === $allowedCharacter)
        ) {
            return '&';
        }

        # U+0023 NUMBER SIGN (#)
        if ($char === '#') {
            # Consume the U+0023 NUMBER SIGN.
            $this->consume();
            $char = $this->peek();

            # The behavior further depends on the character after the U+0023 NUMBER SIGN:
            # U+0078 LATIN SMALL LETTER X, U+0058 LATIN CAPITAL LETTER X
            $isHex = ($char === 'x' || $char === 'X');
            if ($isHex) {
                # Consume the X.
                $this->consume();

                # Consume as many characters as match the range of ASCII hex digits.
                $number = $this->consumeWhile(self::HEX);

                # If no characters match the range, then don't consume any characters (and
                # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
                # character). This is a parse error; nothing is returned.
                // Compared against '' because a lone "0" is falsy in PHP.
                if ($number === '') {
                    $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $this->peek(), 'hexadecimal digit');
                    $this->unconsume(2);
                    return '&';
                }
            } else {
                # Consume as many characters as match the range of ASCII digits.
                $number = $this->consumeWhile(self::DIGIT);

                # If no characters match the range, then don't consume any characters (and
                # unconsume the U+0023 NUMBER SIGN character and, if appropriate, the X
                # character). This is a parse error; nothing is returned.
                if ($number === '') {
                    $peek = $this->peek();
                    if ($peek !== '') {
                        $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $peek, 'decimal digit');
                    } else {
                        $this->error(ParseError::UNEXPECTED_EOF);
                    }

                    $this->unconsume();
                    return '&';
                }
            }

            # Otherwise, if the next character is a U+003B SEMICOLON, consume that too. If it
            # isn't, there is a parse error.
            $char = $this->peek();
            if ($char === ';') {
                $this->consume();
            } elseif ($char === '') {
                $this->error(ParseError::UNEXPECTED_EOF);
            } else {
                $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $char, 'semicolon terminator');
            }

            # If one or more characters match the range, then take them all and interpret the
            # string of characters as a number (either hexadecimal or decimal as appropriate).
            // The digits are turned into an integer once, here, so every comparison
            // below is numeric. Very long digit strings would overflow into a float;
            // since everything above U+10FFFF is equally invalid such values are
            // clamped to one past the Unicode ceiling.
            $value = ($isHex) ? hexdec($number) : (float) $number;
            $codePoint = ($value > 0x10FFFF) ? 0x110000 : (int) $value;

            # If that number is one of the numbers in the first column of the following table,
            # then this is a parse error. Find the row with that number in the first column,
            # and return a character token for the Unicode character given in the second
            # column of that row.
            $replacements = [
                # 0x80 U+20AC EURO SIGN (€)
                0x80 => '€',
                # 0x82 U+201A SINGLE LOW-9 QUOTATION MARK (‚)
                0x82 => '‚',
                # 0x83 U+0192 LATIN SMALL LETTER F WITH HOOK (ƒ)
                0x83 => 'ƒ',
                # 0x84 U+201E DOUBLE LOW-9 QUOTATION MARK („)
                0x84 => '„',
                # 0x85 U+2026 HORIZONTAL ELLIPSIS (…)
                0x85 => '…',
                # 0x86 U+2020 DAGGER (†)
                0x86 => '†',
                # 0x87 U+2021 DOUBLE DAGGER (‡)
                0x87 => '‡',
                # 0x88 U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
                0x88 => 'ˆ',
                # 0x89 U+2030 PER MILLE SIGN (‰)
                0x89 => '‰',
                # 0x8A U+0160 LATIN CAPITAL LETTER S WITH CARON (Š)
                0x8A => 'Š',
                # 0x8B U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
                0x8B => '‹',
                # 0x8C U+0152 LATIN CAPITAL LIGATURE OE (Œ)
                0x8C => 'Œ',
                # 0x8E U+017D LATIN CAPITAL LETTER Z WITH CARON (Ž)
                0x8E => 'Ž',
                # 0x91 U+2018 LEFT SINGLE QUOTATION MARK (‘)
                0x91 => '‘',
                # 0x92 U+2019 RIGHT SINGLE QUOTATION MARK (’)
                0x92 => '’',
                # 0x93 U+201C LEFT DOUBLE QUOTATION MARK (“)
                0x93 => '“',
                # 0x94 U+201D RIGHT DOUBLE QUOTATION MARK (”)
                0x94 => '”',
                # 0x95 U+2022 BULLET (•)
                0x95 => '•',
                # 0x96 U+2013 EN DASH (–)
                0x96 => '–',
                # 0x97 U+2014 EM DASH (—)
                0x97 => '—',
                # 0x98 U+02DC SMALL TILDE (˜)
                0x98 => '˜',
                # 0x99 U+2122 TRADE MARK SIGN (™)
                0x99 => '™',
                # 0x9A U+0161 LATIN SMALL LETTER S WITH CARON (š)
                0x9A => 'š',
                # 0x9B U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
                0x9B => '›',
                # 0x9C U+0153 LATIN SMALL LIGATURE OE (œ)
                0x9C => 'œ',
                # 0x9E U+017E LATIN SMALL LETTER Z WITH CARON (ž)
                0x9E => 'ž',
                # 0x9F U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
                0x9F => 'Ÿ',
            ];

            if (isset($replacements[$codePoint])) {
                $this->error(ParseError::INVALID_NUMERIC_ENTITY, $number);
                // Consume the ampersand but return the value instead.
                // NOTE(review): at this point the digits (and any semicolon) have
                // already been consumed, so this looks like it swallows the character
                // following the reference. Kept as found; confirm against the caller.
                $this->consume();
                return $replacements[$codePoint];
            }

            # Otherwise, if the number is in the range 0xD800 to 0xDFFF or is greater than
            # 0x10FFFF, then this is a parse error. Return a U+FFFD REPLACEMENT CHARACTER
            # character token.
            // The spec's table also maps 0x00 to U+FFFD. NULL characters are stripped
            // from the document itself, but a reference such as "&#0;" still
            // evaluates to zero and must not produce a NULL character.
            if ($codePoint === 0 || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF) || $codePoint > 0x10FFFF) {
                $this->error(ParseError::INVALID_CODEPOINT, $number);
                return "\u{FFFD}";
            }

            # Additionally, if the number is in the range 0x0001 to 0x0008, 0x000D to 0x001F,
            # 0x007F to 0x009F, 0xFDD0 to 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF,
            # 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
            # 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF,
            # 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
            # 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, or 0x10FFFF, then this is a parse error.
            // The code point is known to be at most 0x10FFFF here, so the mask test
            // covers exactly the xFFFE/xFFFF pair of every plane listed above.
            if (
                ($codePoint >= 0x0001 && $codePoint <= 0x0008)
                || ($codePoint >= 0x000D && $codePoint <= 0x001F)
                || ($codePoint >= 0x007F && $codePoint <= 0x009F)
                || ($codePoint >= 0xFDD0 && $codePoint <= 0xFDEF)
                || $codePoint === 0x000B
                || ($codePoint & 0xFFFE) === 0xFFFE
            ) {
                $this->error(ParseError::INVALID_CODEPOINT, $number);
                // Consume the ampersand.
                // NOTE(review): same concern as for the table above; this consumes
                // one character past the reference. Kept as found.
                $this->consume();
                return '&';
            }

            # Otherwise, return a character token for the Unicode character whose code point
            # is that number.
            return \MensBeam\Intl\Encoding\UTF8::encode($codePoint);
        }

        # Consume the maximum number of characters possible, with the consumed characters
        # matching one of the identifiers in the first column of the named character
        # references table (in a case-sensitive manner).
        // Implementing this by peeking ahead 33 characters that match 0-9, A-Z, a-z, and
        // ';'. 33 is the string length of the longest named character reference
        // (calculated using `max(array_map('mb_strlen', array_keys($referenceTable)));`).
        // It then checks the sequence of characters by checking them against a regular
        // expression which is generated by a script that grabs the JSON of the character
        // reference table from the spec and creates a somewhat optimized regular
        // expression.
        $sequence = $this->peekWhile(self::DIGIT.self::ALPHA.';', 33);
        $namedReference = '/^(?:[Aa]acute(?:;)?|[Aa]breve;|acd;|acE;|[Aa]circ(?:;)?|acute(?:;)?|[Aa]cy;|[Aa][Ee]lig(?:;)?|[Aa]fr;|af;|[Aa]grave(?:;)?|alefsym;|aleph;|[Aa]lpha;|[Aa]macr;|amalg;|[aA][mM][pP](?:;)?|andand;|andd;|andslope;|andv;|[Aa]nd;|ange;|angmsdaa;|angmsdab;|angmsdac;|angmsdad;|angmsdae;|angmsdaf;|angmsdag;|angmsdah;|angmsd;|angrtvbd;|angrtvb;|angrt;|angsph;|angst;|angzarr;|[Aa]ogon;|[Aa]opf;|apacir;|ap[Ee];|apid;|apos;|ApplyFunction;|approxeq;|[Aa]ring(?:;)?|[Aa]scr;|Assign;|ast;|asympeq;|asymp;|[Aa]tilde(?:;)?|[Aa]uml(?:;)?|awconint;|awint;|backcong;|backepsilon;|backprime;|backsimeq;|backsim;|Backslash;|barvee;|Barv;|barwedge;|[bB]arwed;|bbrktbrk;|bbrk;|bcong;|[Bb]cy;|bdquo;|[bB]ecause;|becaus;|bemptyv;|bepsi;|Bernoullis;|bernou;|[Bb]eta;|beth;|between;|[Bb]fr;|bigcap;|bigcirc;|bigcup;|bigodot;|bigoplus;|bigotimes;|bigsqcup;|bigstar;|bigtriangledown;|bigtriangleup;|biguplus;|bigvee;|bigwedge;|bkarow;|blacklozenge;|blacksquare;|blacktriangledown;|blacktriangleleft;|blacktriangleright;|blacktriangle;|ac;|angle;|ang;|blank;|blk12;|blk14;|blk34;|block;|bnequiv;|bne;|b[Nn]ot;|[Bb]opf;|bottom;|bot;|bowtie;|boxbox;|box[dD][lL];|box[dD][rR];|box[hH][dD];|box[hH][uU];|box[hH];|boxminus;|boxplus;|boxtimes;|box[uU][lL];|box[uU][rR];|box[vV][hH];|box[vV][lL];|box[vV][rR];|box[vV];|bprime;|[bB]reve;|brvbar(?:;)?|[bB]scr;|bsemi;|bsime;|bsim;|bsolb;|bsolhsub;|bsol;|bullet;|bull;|[Bb]umpeq;|bump[Ee];|bump;|[Cc]acute;|capand;|capbrcup;|capcap;|capcup;|capdot;|CapitalDifferentialD;|caps;|[cC]ap;|caret;|[Cc]caron;|caron;|Cayleys;|ccaps;|[Cc]cedil(?:;)?|[Cc]circ;|Cconint;|ccupssm;|ccups;|[Cc]dot;|Cedilla;|cedil(?:;)?|cemptyv;|[cC]enter[dD]ot;|cent(?:;)?|[cC]fr;|[Cc][Hh]cy;|checkmark;|check;|[Cc]hi;|circeq;|circlearrowleft;|circlearrowright;|circledast;|circledcirc;|circleddash;|CircleDot;|circledR;|circledS;|CircleMinus;|CirclePlus;|CircleTimes;|circ;|cir[Ee];|cirfnint;|cirmid;|cirscir;|cir;|ClockwiseContourIntegral;|CloseCurlyDoubleQuote;|CloseCurlyQuote;|clubsuit;|clubs;|coloneq;|[Cc]olone;|[cC]olon;|commat;|comma;|compfn;|complement;|complexes;|comp;|congdot;|Congruent;|cong;|cwconint;|[cC]onint;|ContourIntegral;|[cC]opf;|Coproduct;|coprod;|copysr;|[cC][oO][pP][yY](?:;)?|CounterClockwiseContourIntegral;|crarr;|[cC]ross;|[Cc]scr;|csube;|csub;|csupe;|csup;|ctdot;|cudarrl;|cudarrr;|cuepr;|cuesc;|cularrp;|cularr;|cupbrcap;|[cC]up[cC]ap;|cupcup;|cupdot;|cupor;|cups;|[cC]up;|curarrm;|curarr;|curlyeqprec;|curlyeqsucc;|curlyvee;|curlywedge;|curren(?:;)?|curvearrowleft;|curvearrowright;|cuvee;|cuwed;|cwint;|cylcty;|[dD]agger;|daleth;|[dD][aA]rr;|[Dd]ashv;|dash;|dbkarow;|dblac;|[Dd]caron;|[Dd]cy;|ddagger;|ddarr;|DDotrahd;|ddotseq;|[Dd][Dd];|deg(?:;)?|[Dd]elta;|Del;|demptyv;|dfisht;|[Dd]fr;|dharl;|dharr;|dHar;|DiacriticalAcute;|DiacriticalDot;|DiacriticalDoubleAcute;|DiacriticalGrave;|DiacriticalTilde;|diamondsuit;|[dD]iamond;|diams;|diam;|die;|DifferentialD;|digamma;|disin;|divideontimes;|divide(?:;)?|divonx;|div;|[Dd][Jj]cy;|dlcorn;|dlcrop;|dollar;|[Dd]opf;|DotDot;|doteqdot;|DotEqual;|doteq;|dotminus;|dotplus;|dotsquare;|doublebarwedge;|DoubleContourIntegral;|DoubleDot;|[Dd]ot;|DoubleDownArrow;|DoubleLeftArrow;|DoubleLeftRightArrow;|DoubleLeftTee;|DoubleLongLeftArrow;|DoubleLongLeftRightArrow;|DoubleLongRightArrow;|DoubleRightArrow;|DoubleRightTee;|DoubleUpArrow;|DoubleUpDownArrow;|DoubleVerticalBar;|DownArrowBar;|DownArrowUpArrow;|downdownarrows;|[dD]own[aA]rrow;|DownBreve;|downharpoonleft;|downharpoonright;|DownLeftRightVector;|DownLeftTeeVector;|DownLeftVectorBar;|DownLeftVector;|DownRightTeeVector;|DownRightVectorBar;|DownRightVector;|DownTeeArrow;|DownTee;|drbkarow;|drcorn;|drcrop;|[Dd]scr;|[Dd][Ss]cy;|dsol;|[Dd]strok;|dtdot;|dtrif;|dtri;|duarr;|duhar;|dwangle;|[Dd][Zz]cy;|dzigrarr;|[Ee]acute(?:;)?|easter;|[Ee]caron;|[Ee]circ(?:;)?|ecir;|ecolon;|[Ee]cy;|eDDot;|[Ee][dD]ot;|ee;|efDot;|[Ee]fr;|[Ee]grave(?:;)?|egsdot;|egs;|eg;|Element;|elinters;|ell;|elsdot;|els;|el;|[Ee]macr;|emptyset;|EmptySmallSquare;|EmptyVerySmallSquare;|emptyv;|empty;|emsp13;|emsp14;|emsp;|[Ee][Nn][Gg];|ensp;|[Ee]ogon;|[Ee]opf;|eparsl;|epar;|eplus;|[Ee]psilon;|epsiv;|epsi;|eqcirc;|eqcolon;|eqsim;|eqslantgtr;|eqslantless;|equals;|EqualTilde;|Equal;|equest;|Equilibrium;|equivDD;|equiv;|eqvparsl;|erarr;|erDot;|[eE]scr;|esdot;|[Ee]sim;|[Ee]ta;|[Ee][Tt][Hh](?:;)?|[Ee]uml(?:;)?|euro;|excl;|Exists;|exist;|expectation;|[eE]xponential[eE];|fallingdotseq;|[Ff]cy;|female;|ffilig;|fflig;|ffllig;|[Ff]fr;|filig;|FilledSmallSquare;|FilledVerySmallSquare;|fjlig;|flat;|fllig;|fltns;|fnof;|[Ff]opf;|[fF]or[aA]ll;|forkv;|fork;|Fouriertrf;|fpartint;|frac12(?:;)?|frac13;|frac14(?:;)?|frac15;|frac16;|frac18;|frac23;|frac25;|frac34(?:;)?|frac35;|frac38;|frac45;|frac56;|frac58;|frac78;|frasl;|frown;|[fF]scr;|gacute;|[Gg]ammad;|[Gg]amma;|gap;|[Gg]breve;|Gcedil;|[Gg]circ;|[Gg]cy;|[Gg]dot;|g[Ee]l;|geqq;|geqslant;|geq;|gescc;|gesdotol;|gesdoto;|gesdot;|gesles;|gesl;|ges;|g[eE];|[Gg]fr;|ggg;|[gG]g;|gimel;|[Gg][Jj]cy;|gla;|glE;|glj;|gl;|gnapprox;|gnap;|gneqq;|gneq;|gn[eE];|gnsim;|[Gg]opf;|grave;|GreaterEqualLess;|GreaterEqual;|GreaterFullEqual;|GreaterGreater;|GreaterLess;|GreaterSlantEqual;|GreaterTilde;|[Gg]scr;|gsime;|gsiml;|gsim;|gtcc;|gtcir;|gtdot;|gtlPar;|gtquest;|gtrapprox;|gtrarr;|gtrdot;|gtreqless;|gtreqqless;|gtrless;|gtrsim;|[gG][tT](?:;)?|gvertneqq;|gvnE;|Hacek;|hairsp;|half;|hamilt;|[Hh][Aa][Rr][Dd]cy;|harrcir;|harrw;|h[aA]rr;|Hat;|hbar;|[Hh]circ;|heartsuit;|hearts;|hellip;|hercon;|[hH]fr;|HilbertSpace;|hksearow;|hkswarow;|hoarr;|homtht;|hookleftarrow;|hookrightarrow;|[hH]opf;|horbar;|HorizontalLine;|[hH]scr;|hslash;|[Hh]strok;|HumpDownHump;|HumpEqual;|hybull;|hyphen;|[Ii]acute(?:;)?|[Ii]circ(?:;)?|[Ii]cy;|ic;|Idot;|[Ii][Ee]cy;|iexcl(?:;)?|iff;|[iI]fr;|[Ii]grave(?:;)?|iiiint;|iiint;|iinfin;|iiota;|ii;|[Ii][Jj]lig;|[Ii]macr;|image;|ImaginaryI;|imagline;|imagpart;|imath;|imof;|imped;|Implies;|Im;|incare;|infintie;|infin;|inodot;|intcal;|integers;|Integral;|intercal;|Intersection;|intlarhk;|intprod;|[iI]nt;|InvisibleComma;|InvisibleTimes;|in;|[Ii][Oo]cy;|[Ii]ogon;|[Ii]opf;|[Ii]ota;|iprod;|iquest(?:;)?|[iI]scr;|isindot;|isinE;|isinsv;|isins;|isinv;|isin;|[Ii]tilde;|it;|[Ii]ukcy;|[Ii]uml(?:;)?|[Jj]circ;|[Jj]cy;|[Jj]fr;|jmath;|[Jj]opf;|[Jj]scr;|[Jj]sercy;|[Jj]ukcy;|kappav;|[Kk]appa;|[Kk]cedil;|[Kk]cy;|[Kk]fr;|kgreen;|[Kk][Hh]cy;|[Kk][Jj]cy;|[Kk]opf;|[Kk]scr;|lAarr;|[Ll]acute;|laemptyv;|lagran;|[Ll]ambda;|langd;|langle;|[lL]ang;|Laplacetrf;|lap;|laquo(?:;)?|larrbfs;|larrb;|larrfs;|larrhk;|larrlp;|larrpl;|larrsim;|larrtl;|[lL][aA]rr;|l[aA]tail;|lates;|late;|lat;|l[bB]arr;|lbbrk;|lbrace;|lbrack;|lbrke;|lbrksld;|lbrkslu;|[Ll]caron;|[Ll]cedil;|lceil;|lcub;|[Ll]cy;|ldca;|ldquor;|ldquo;|ldrdhar;|ldrushar;|ldsh;|LeftAngleBracket;|LeftArrowBar;|LeftArrowRightArrow;|leftarrowtail;|[lL]eft[aA]rrow;|LeftCeiling;|LeftDoubleBracket;|LeftDownTeeVector;|LeftDownVectorBar;|LeftDownVector;|LeftFloor;|leftharpoondown;|leftharpoonup;|leftleftarrows;|leftrightarrows;|[lL]eft[rR]ight[aA]rrow;|leftrightharpoons;|leftrightsquigarrow;|LeftRightVector;|LeftTeeArrow;|LeftTeeVector;|LeftTee;|leftthreetimes;|LeftTriangleBar;|LeftTriangleEqual;|LeftTriangle;|LeftUpDownVector;|LeftUpTeeVector;|LeftUpVectorBar;|LeftUpVector;|LeftVectorBar;|LeftVector;|l[Ee]g;|leqq;|leqslant;|leq;|lescc;|lesdotor;|lesdoto;|lesdot;|lesges;|lesg;|lessapprox;|lessdot;|lesseqgtr;|lesseqqgtr;|LessEqualGreater;|LessFullEqual;|LessGreater;|lessgtr;|LessLess;|lesssim;|LessSlantEqual;|LessTilde;|les;|l[eE];|lfisht;|lfloor;|[Ll]fr;|lgE;|lg;|lhard;|lharul;|lharu;|lHar;|lhblk;|[Ll][Jj]cy;|llarr;|llcorner;|Lleftarrow;|llhard;|lltri;|[lL]l;|[Ll]midot;|lmoustache;|lmoust;|lnapprox;|lnap;|lneqq;|lneq;|ln[eE];|lnsim;|loang;|loarr;|lobrk;|[lL]ong[lL]eft[aA]rrow;|[lL]ong[lL]eft[rR]ight[aA]rrow;|longmapsto;|[lL]ong[rR]ight[aA]rrow;|looparrowleft;|looparrowright;|lopar;|[Ll]opf;|loplus;|lotimes;|lowast;|lowbar;|LowerLeftArrow;|LowerRightArrow;|lozenge;|lozf;|loz;|lparlt;|lpar;|lrarr;|lrcorner;|lrhard;|lrhar;|lrm;|lrtri;|lsaquo;|[lL]scr;|[lL]sh;|lsime;|lsimg;|lsim;|lsqb;|lsquor;|lsquo;|[Ll]strok;|ltcc;|ltcir;|ltdot;|lthree;|ltimes;|ltlarr;|ltquest;|ltrie;|ltrif;|ltri;|ltrPar;|[lL][tT](?:;)?|lurdshar;|luruhar;|lvertneqq;|lvnE;|macr(?:;)?|male;|maltese;|malt;|mapstodown;|mapstoleft;|mapstoup;|mapsto;|[Mm]ap;|marker;|mcomma;|[Mm]cy;|mdash;|mDDot;|measuredangle;|MediumSpace;|Mellintrf;|[Mm]fr;|mho;|micro(?:;)?|midast;|midcir;|middot(?:;)?|mid;|minusb;|minusdu;|minusd;|MinusPlus;|minus;|mlcp;|mldr;|mnplus;|models;|[Mm]opf;|mp;|[Mm]scr;|mstpos;|multimap;|mumap;|[mM]u;|nabla;|[nN]acute;|nang;|napE;|napid;|napos;|precnapprox;|succnapprox;|napprox;|approx;|nap;|naturals;|natural;|natur;|nbsp(?:;)?|nbumpe;|nbump;|ncap;|ap;|[nN]caron;|[nN]cedil;|ncongdot;|ncong;|ncup;|[nN]cy;|ndash;|nearhk;|nearrow;|ne[Aa]rr;|nedot;|NegativeMediumSpace;|NegativeThickSpace;|NegativeThinSpace;|NegativeVeryThinSpace;|nequiv;|nesear;|nesim;|NestedGreaterGreater;|NestedLessLess;|NewLine;|nexists;|nexist;|ne;|[nN]fr;|ngeqq;|ngeqslant;|ngeq;|nges;|ng[eE];|nGg;|ngsim;|ngtr;|nGtv;|n[gG]t;|nh[Aa]rr;|nhpar;|nisd;|nis;|[nN][jJ]cy;|nl[Aa]rr;|nldr;|n[Ll]eftarrow;|n[Ll]eftrightarrow;|nleqq;|nleqslant;|nleq;|nless;|nles;|nl[eE];|nLl;|nlsim;|nltrie;|nltri;|nLtv;|n[lL]t;|nmid;|NoBreak;|NonBreakingSpace;|[Nn]opf;|NotCongruent;|NotCupCap;|NotDoubleVerticalBar;|NotElement;|NotEqualTilde;|NotEqual;|NotExists;|NotGreaterEqual;|NotGreaterFullEqual;|NotGreaterGreater;|NotGreaterLess;|NotGreaterSlantEqual;|NotGreaterTilde;|NotGreater;|NotHumpDownHump;|NotHumpEqual;|notindot;|notinE;|notinva;|notinvb;|notinvc;|notin;|NotLeftTriangleBar;|NotLeftTriangleEqual;|NotLeftTriangle;|NotLessEqual;|NotLessGreater;|NotLessLess;|NotLessSlantEqual;|NotLessTilde;|NotLess;|NotNestedGreaterGreater;|NotNestedLessLess;|notniva;|notnivb;|notnivc;|niv;|notni;|ni;|NotPrecedesEqual;|NotPrecedesSlantEqual;|NotPrecedes;|NotReverseElement;|NotRightTriangleBar;|NotRightTriangleEqual;|NotRightTriangle;|NotSquareSubsetEqual;|NotSquareSubset;|NotSquareSupersetEqual;|NotSquareSuperset;|NotSubsetEqual;|NotSubset;|NotSucceedsEqual;|NotSucceedsSlantEqual;|NotSucceedsTilde;|NotSucceeds;|NotSupersetEqual;|NotSuperset;|NotTildeEqual;|NotTildeFullEqual;|NotTildeTilde;|NotTilde;|NotVerticalBar;|[nN]ot(?:;)?|nparallel;|nparsl;|npart;|npar;|npolint;|nprcue;|npreceq;|nprec;|npre;|npr;|nrarrc;|nrarrw;|nr[Aa]rr;|n[Rr]ightarrow;|nrtrie;|nrtri;|nsccue;|nsce;|[nN]scr;|nsc;|nshortmid;|nshortparallel;|nsimeq;|nsime;|nsim;|nsmid;|nspar;|nsqsube;|nsqsupe;|nsub[eE];|nsubseteqq;|nsubseteq;|nsubset;|nsub;|nsucceq;|nsucc;|nsup[eE];|nsupseteqq;|nsupseteq;|nsupset;|nsup;|ntgl;|[nN]tilde(?:;)?|ntlg;|ntrianglelefteq;|ntriangleleft;|ntrianglerighteq;|ntriangleright;|numero;|numsp;|num;|[nN]u;|nvap;|n[Vv][Dd]ash;|nvge;|nvgt;|nvHarr;|nvinfin;|nvlArr;|nvle;|nvltrie;|nvlt;|nvrArr;|nvrtrie;|nvsim;|nwarhk;|nwarrow;|nw[Aa]rr;|nwnear;|[oO]acute(?:;)?|oast;|[oO]circ(?:;)?|ocir;|[oO]cy;|odash;|[oO]dblac;|odiv;|odot;|odsold;|[oO][eE]lig;|ofcir;|[oO]fr;|ogon;|[oO]grave(?:;)?|ogt;|ohbar;|ohm;|oint;|olarr;|olcir;|olcross;|oline;|olt;|[oO]macr;|[oO]mega;|[oO]micron;|omid;|ominus;|[oO]opf;|opar;|OpenCurlyDoubleQuote;|OpenCurlyQuote;|operp;|oplus;|orarr;|orderof;|order;|ordf(?:;)?|ordm(?:;)?|ord;|origof;|oror;|orslope;|orv;|[oO]scr;|[oO]slash(?:;)?|osol;|oS;|[oO]tilde(?:;)?|otimesas;|[oO]times;|[oO]uml(?:;)?|ovbar;|OverBar;|OverBrace;|OverBracket;|OverParenthesis;|parallel;|para(?:;)?|parsim;|parsl;|PartialD;|part;|par;|[pP]cy;|percnt;|period;|permil;|perp;|pertenk;|[pP]fr;|phiv;|[pP]hi;|phmmat;|phone;|pitchfork;|piv;|[pP]i;|planckh;|planck;|plankv;|plusacir;|plusb;|pluscir;|plusdo;|plusdu;|pluse;|PlusMinus;|plusmn(?:;)?|plussim;|plustwo;|plus;|pm;|Poincareplane;|pointint;|[Pp]opf;|pound(?:;)?|prap;|prcue;|precapprox;|preccurlyeq;|PrecedesEqual;|PrecedesSlantEqual;|PrecedesTilde;|Precedes;|preceq;|precneqq;|precnsim;|precsim;|prec;|pr[Ee];|primes;|[Pp]rime;|prnap;|prnE;|prnsim;|Product;|prod;|profalar;|profline;|profsurf;|Proportional;|Proportion;|propto;|prop;|prsim;|prurel;|[pP]r;|[pP]scr;|[pP]si;|puncsp;|[qQ]fr;|qint;|[Qq]opf;|qprime;|[qQ]scr;|quaternions;|quatint;|questeq;|quest;|[Qq][Uu][Oo][Tt](?:;)?|rAarr;|race;|[rR]acute;|radic;|raemptyv;|rangd;|range;|rangle;|[Rr]ang;|raquo(?:;)?|rarrap;|rarrbfs;|rarrb;|rarrc;|rarrfs;|rarrhk;|rarrlp;|rarrpl;|rarrsim;|[rR]arrtl;|rarrw;|[rR][Aa]rr;|r[Aa]tail;|rationals;|ratio;|[Rr][Bb]arr;|rbbrk;|rbrace;|rbrack;|rbrke;|rbrksld;|rbrkslu;|[rR]caron;|[rR]cedil;|rceil;|rcub;|[rR]cy;|rdca;|rdldhar;|rdquor;|rdquo;|rdsh;|realine;|realpart;|reals;|real;|rect;|[Rr][Ee][Gg](?:;)?|ReverseElement;|ReverseEquilibrium;|ReverseUpEquilibrium;|Re;|rfisht;|rfloor;|[Rr]fr;|rhard;|rharul;|rharu;|rHar;|rhov;|[rR]ho;|RightAngleBracket;|RightArrowBar;|RightArrowLeftArrow;|rightarrowtail;|[Rr]ight[aA]rrow;|RightCeiling;|RightDoubleBracket;|RightDownTeeVector;|RightDownVectorBar;|RightDownVector;|RightFloor;|rightharpoondown;|rightharpoonup;|rightleftarrows;|rightleftharpoons;|rightrightarrows;|rightsquigarrow;|RightTeeArrow;|RightTeeVector;|RightTee;|rightthreetimes;|RightTriangleBar;|RightTriangleEqual;|RightTriangle;|RightUpDownVector;|RightUpTeeVector;|RightUpVectorBar;|RightUpVector;|RightVectorBar;|RightVector;|[oO]r;|ring;|risingdotseq;|rlarr;|rlhar;|rlm;|rmoustache;|rmoust;|rnmid;|roang;|roarr;|robrk;|ropar;|[Rr]opf;|roplus;|rotimes;|RoundImplies;|rpargt;|rpar;|rppolint;|rrarr;|Rrightarrow;|rsaquo;|[Rr]scr;|[Rr]sh;|rsqb;|rsquor;|rsquo;|rthree;|rtimes;|rtrie;|rtrif;|rtriltri;|rtri;|RuleDelayed;|ruluhar;|rx;|[sS]acute;|sbquo;|scap;|[sS]caron;|sccue;|[sS]cedil;|sc[Ee];|[sS]circ;|scnap;|scnE;|scnsim;|scpolint;|scsim;|[sS]cy;|[sS]c;|sdotb;|sdote;|sdot;|searhk;|searrow;|se[Aa]rr;|sect(?:;)?|semi;|seswar;|setminus;|setmn;|sext;|sfrown;|[sS]fr;|sharp;|[sS][hH][cC][hH]cy;|[sS][hH]cy;|ShortDownArrow;|ShortLeftArrow;|shortmid;|shortparallel;|ShortRightArrow;|ShortUpArrow;|shy(?:;)?|sigmaf;|sigmav;|[sS]igma;|simdot;|simeq;|sime;|simgE;|simg;|simlE;|siml;|simne;|simplus;|simrarr;|sim;|slarr;|SmallCircle;|smallsetminus;|smashp;|smeparsl;|smid;|smile;|smtes;|smte;|smt;|[sS][oO][fF][tT]cy;|solbar;|solb;|sol;|[sS]opf;|spadesuit;|spades;|spar;|sqcaps;|sqcap;|sqcups;|sqcup;|Sqrt;|sqsube;|sqsubseteq;|sqsubset;|sqsub;|sqsupe;|sqsupseteq;|sqsupset;|sqsup;|SquareIntersection;|SquareSubsetEqual;|SquareSubset;|SquareSupersetEqual;|SquareSuperset;|SquareUnion;|[Ss]quare;|squarf;|squf;|squ;|srarr;|[sS]scr;|ssetmn;|ssmile;|sstarf;|starf;|[sS]tar;|straightepsilon;|straightphi;|strns;|subdot;|subedot;|sub[eE];|submult;|subn[eE];|subplus;|subrarr;|subseteqq;|SubsetEqual;|subseteq;|subsetneqq;|subsetneq;|[Ss]ubset;|subsim;|subsub;|subsup;|[Ss]ub;|succapprox;|succcurlyeq;|SucceedsEqual;|SucceedsSlantEqual;|SucceedsTilde;|Succeeds;|succeq;|succneqq;|succnsim;|succsim;|succ;|SuchThat;|[Ss]um;|sung;|sup1(?:;)?|sup2(?:;)?|sup3(?:;)?|supdot;|supdsub;|supedot;|SupersetEqual;|Superset;|sup[eE];|suphsol;|suphsub;|suplarr;|supmult;|supn[eE];|supplus;|supseteqq;|supseteq;|supsetneqq;|supsetneq;|[Ss]upset;|supsim;|supsub;|supsup;|[Ss]up;|swarhk;|swarrow;|sw[Aa]rr;|swnwar;|szlig(?:;)?|Tab;|target;|[tT]au;|tbrk;|[tT]caron;|[tT]cedil;|[tT]cy;|tdot;|telrec;|[tT]fr;|there4;|[Tt]herefore;|thetasym;|thetav;|[tT]heta;|thickapprox;|thicksim;|ThickSpace;|ThinSpace;|thinsp;|thkap;|thksim;|[tT][hH][oO][rR][nN](?:;)?|TildeEqual;|TildeFullEqual;|TildeTilde;|timesbar;|timesb;|timesd;|times(?:;)?|tint;|toea;|topbot;|topcir;|topfork;|[tT]opf;|top;|tosa;|tprime;|[Tt][Rr][Aa][Dd][Ee];|triangledown;|trianglelefteq;|triangleleft;|triangleq;|trianglerighteq;|triangleright;|triangle;|tridot;|trie;|triminus;|TripleDot;|triplus;|trisb;|tritime;|trpezium;|[tT]scr;|[tT][sS]cy;|[tT][sS][hH]cy;|[tT]strok;|twixt;|twoheadleftarrow;|twoheadrightarrow;|[uU]acute(?:;)?|Uarrocir;|[uU][Aa]rr;|[uU]brcy;|[uU]breve;|[uU]circ(?:;)?|[uU]cy;|udarr;|[uU]dblac;|udhar;|ufisht;|[uU]fr;|[uU]grave(?:;)?|uharl;|uharr;|uHar;|uhblk;|ulcorner;|ulcorn;|ulcrop;|ultri;|[uU]macr;|uml(?:;)?|UnderBar;|UnderBrace;|UnderBracket;|UnderParenthesis;|UnionPlus;|Union;|[uU]ogon;|[uU]opf;|UpArrowBar;|UpArrowDownArrow;|[Uu]p[aA]rrow;|[Uu]p[dD]own[aA]rrow;|UpEquilibrium;|upharpoonleft;|upharpoonright;|uplus;|UpperLeftArrow;|UpperRightArrow;|upsih;|[uU]psilon;|[Uu]psi;|UpTeeArrow;|UpTee;|upuparrows;|urcorner;|urcorn;|urcrop;|[uU]ring;|urtri;|[uU]scr;|utdot;|[uU]tilde;|[Tt]ilde;|utrif;|utri;|uuarr;|[uU]uml(?:;)?|uwangle;|vangrt;|varepsilon;|varkappa;|varnothing;|varphi;|varpi;|varpropto;|varrho;|v[Aa]rr;|varsigma;|varsubsetneqq;|varsubsetneq;|varsupsetneqq;|varsupsetneq;|vartheta;|vartriangleleft;|vartriangleright;|vBarv;|[Vv][bB]ar;|[vV]cy;|Vdashl;|[Vv][Dd]ash;|veebar;|veeeq;|[Vv]ee;|vellip;|[Vv]erbar;|VerticalBar;|VerticalLine;|VerticalSeparator;|VerticalTilde;|[Vv]ert;|VeryThinSpace;|[vV]fr;|vltri;|vnsub;|vnsup;|[vV]opf;|vprop;|vrtri;|[vV]scr;|vsubn[eE];|vsupn[eE];|Vvdash;|vzigzag;|[wW]circ;|wedbar;|wedgeq;|xwedge;|[Ww]edge;|weierp;|[wW]fr;|[wW]opf;|wp;|wreath;|wr;|[wW]scr;|xcap;|xcirc;|xcup;|xdtri;|[xX]fr;|xh[Aa]rr;|[xX]i;|xl[Aa]rr;|xmap;|xnis;|xodot;|[xX]opf;|xoplus;|xotime;|xr[Aa]rr;|[xX]scr;|xsqcup;|xuplus;|xutri;|xvee;|[yY]acute(?:;)?|[yY][aA]cy;|[yY]circ;|[yY]cy;|yen(?:;)?|[yY]fr;|[yY][iI]cy;|[yY]opf;|[yY]scr;|[yY][uU]cy;|[Yy]uml(?:;)?|[zZ]acute;|[zZ]caron;|[zZ]cy;|[zZ]dot;|zeetrf;|ZeroWidthSpace;|[zZ]eta;|[Zz]fr;|[zZ][hH]cy;|zigrarr;|[Zz]opf;|[zZ]scr;|zwj;|zwnj;)/';

        if ($sequence !== '' && preg_match($namedReference, $sequence, $matches)) {
            $sequence = $matches[0];
            $lastChar = substr($sequence, -1);

            # If the character reference is being consumed as part of an attribute, and the
            # last character matched is not a U+003B SEMICOLON character (;), and the next
            # character is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
            # character, then, for historical reasons, all the characters that were matched
            # after the U+0026 AMPERSAND character (&) must be unconsumed, and nothing is
            # returned. However, if this next character is in fact a U+003D EQUALS SIGN
            # character (=), then this is a parse error, because some legacy user agents will
            # misinterpret the markup in those cases.
            // OPTIMIZATION: Not consuming here until this stuff is checked because there's no
            // sense in consuming characters and then turning right back around and unconsuming
            // them. Will consume after this step instead.
            // NOTE(review): nothing has been consumed since the sequence was peeked,
            // so this looks at the character under the pointer rather than the one
            // following the matched name. Kept as found because the correct offset
            // depends on where span() leaves the pointer; confirm and adjust.
            $next = $this->peek();
            if ($inAttribute && $lastChar !== ';' && ($next === '=' || ctype_alnum($next))) {
                if ($next === '=') {
                    $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $next, 'semicolon terminator');
                }

                // Consume the ampersand.
                $this->consume();
                return '&';
            }

            // Add 1 to the string length because the & isn't included in the matched
            // sequence.
            $this->consume(strlen($sequence) + 1);

            if ($lastChar !== ';') {
                // Used for PHP's entity decoder. Described below.
                $sequence .= ';';
                $this->error(ParseError::ENTITY_UNEXPECTED_CHARACTER, $lastChar, 'semicolon terminator');
            }

            # Return one or two character tokens for the character(s) corresponding to the
            # character reference name (as given by the second column of the named character
            # references table).
            // DEVIATION: Since the regular expression above checks the validity of the
            // regular expression there isn't a need for the table. Can use PHP's built in
            // decoder at least until there's entities in the table that aren't in the spec's.
            // ENT_QUOTES is required: ENT_HTML5 on its own implies ENT_NOQUOTES,
            // which would leave &quot; and &apos; undecoded.
            return html_entity_decode('&'.$sequence, ENT_QUOTES | ENT_HTML5, 'UTF-8');
        }

        # If no match can be made, then no characters are consumed, and nothing is
        # returned. In this case, if the characters after the U+0026 AMPERSAND character
        # (&) consist of a sequence of one or more alphanumeric ASCII characters followed
        # by a U+003B SEMICOLON character (;), then this is a parse error.
        // The peeked sequence is what has to be tested here; a single character can
        // never satisfy the pattern. Names longer than the 33 peeked characters go
        // undetected.
        if (preg_match('/^[A-Za-z0-9]+;/', $sequence, $unknown)) {
            $this->error(ParseError::INVALID_NAMED_ENTITY, $unknown[0]);
        }

        // Consume the ampersand.
        $this->consume();
        return '&';
    }

    /**
     * Collects a run of characters from the stream, in the manner of strspn()
     * and strcspn().
     *
     * @param string $match Set of characters to test against, given as a string.
     * @param bool $while True to collect while characters are in the set;
     *                    false to collect until one of them is encountered.
     * @param bool $advancePointer True to keep the pointer after the collected
     *                             run and update the line and column counters;
     *                             false to rewind the pointer afterwards.
     * @param int $limit Maximum number of characters to collect; 0 for no limit.
     * @return string The collected characters.
     */
    protected function span(string $match, bool $while = true, bool $advancePointer = true, int $limit = 0): string {
        // Break the matching characters into an array of characters. Unicode friendly.
        // NOTE(review): the text between the start of this pattern and the first
        // nextChar() call was damaged in the copy under review; the pattern, the
        // two initialisations and the loop header were reconstructed from how
        // $match, $count, $string and $char are used below. Verify against
        // version control.
        $match = preg_split('/(?<!^)(?!$)/u', $match);

        $count = 0;
        $string = '';

        while (true) {
            $char = $this->data->nextChar();
            if ($char === '') {
                break;
            }

            $inArray = in_array($char, $match, true);

            // strspn
            if ($while && !$inArray) {
                break;
            }
            // strcspn
            elseif (!$while && $inArray) {
                break;
            }

            if ($advancePointer) {
                if ($char === "\n") {
                    $this->newlines[] = $this->data->posChar();
                    $this->_column = 1;
                    $this->_line++;
                } else {
                    $this->_column++;
                }
            }

            $string .= $char;
            $count++;

            if ($count === $limit) {
                break;
            }
        }

        // If the end is reached the pointer isn't moved when the last character
        // is checked, so it only needs to be moved backwards if not wanting the
        // pointer to move.
        // NOTE(review): the rewind distances are kept exactly as found. They are
        // the same whether the loop stopped on a rejected character or on $limit,
        // although one character fewer has been read in the latter case; confirm
        // against the semantics of UTF8::seek().
        if ($char === '') {
            if (!$advancePointer) {
                $this->data->seek(0 - $count - 1);
            }
        } else {
            $this->data->seek(($advancePointer) ? -1 : 0 - $count - 2);
        }

        if (self::$debug) {
            echo ($advancePointer) ? "\nconsume" : "\npeek";
            echo ($while) ? 'While' : 'Until';
            echo "\n==========\nPattern: ";
            var_export(str_replace(["\t", "\n", "\x0c", "\x0d"], ['\t', '\n', '\x0c', '\x0d'], implode('', $match)));
            echo "\nData: ";
            var_export($string);
            echo "\nPointer: {$this->data->posChar()}\n==========\n\n";
        }

        return $string;
    }

    /**
     * Provides read-only access to the position counters.
     *
     * @param string $property One of 'column', 'line' or 'pointer'.
     * @return mixed The requested value, or null for any other property name.
     */
    public function __get($property) {
        switch ($property) {
            case 'column':
                return $this->_column;
            case 'line':
                return $this->_line;
            case 'pointer':
                return $this->data->posChar();
            default:
                return null;
        }
    }
}