string = $data;
        $this->errorHandler = $errorHandler;
        $config = $config ?? new Config;
        $encodingOrContentType = (string) $encodingOrContentType;
        $prescanBytes = (int) ($config->encodingPrescanBytes ?? 1024);
        $fallbackEncoding = (string) $config->encodingFallback;
        // don't track the current line/column position if error reporting has been suppressed
        $this->track = (bool) $this->errorHandler;

        # 13.2.3.2 Determining the character encoding
        # User agents must use the following algorithm, called the encoding
        # sniffing algorithm, to determine the character encoding to use
        # when decoding a document in the first pass. This algorithm takes
        # as input any out-of-band metadata available to the user agent
        # (e.g. the Content-Type metadata of the document) and all the bytes
        # available so far, and returns a character encoding and a confidence
        # that is either tentative or certain.
        // NOTE: We implement steps 1, 2, 4, 5, and 9
        if ($encoding = Charset::fromBOM($data)) {
            # If the result of BOM sniffing is an encoding, return that
            # encoding with confidence certain.
            $this->encodingCertain = true;
        } elseif ($encoding = Charset::fromCharset($encodingOrContentType)) {
            # If the user has explicitly instructed the user agent to override
            # the document's character encoding with a specific encoding,
            # optionally return that encoding with the confidence certain.
            $this->encodingCertain = true;
        } elseif ($encoding = Charset::fromTransport($encodingOrContentType)) {
            # If the transport layer specifies a character encoding, and it is
            # supported, return that encoding with the confidence certain.
            $this->encodingCertain = true;
        } elseif ($encoding = Charset::fromPrescan($data, $prescanBytes)) {
            # Optionally prescan the byte stream to determine its encoding.
            # The aforementioned algorithm either aborts unsuccessfully or
            # returns a character encoding. If it returns a character
            # encoding, then return the same encoding, with confidence
            # tentative.
            $this->encodingCertain = false;
        } else {
            # Otherwise, return an implementation-defined or user-specified
            # default character encoding, with the confidence tentative.
            $encoding = Charset::fromCharset($fallbackEncoding) ?? "windows-1252";
            $this->encodingCertain = false;
        }
        $this->encoding = $encoding;
        $this->data = Encoding::createDecoder($encoding, $data, false, true);
    }

    /**
     * Reads the next character from the decoder, normalizing newlines.
     *
     * A CR+LF pair and a bare CR are both returned as a single "\n". When
     * position tracking is enabled the line/column counters are updated and
     * control characters, surrogates and noncharacters are reported through
     * $this->error().
     *
     * @return string The character read, or an empty string at end of input
     */
    public function consume(): string
    {
        $char = $this->data->nextChar();

        # Before the tokenization stage, the input stream must be
        # preprocessed by normalizing newlines.
        # Thus, newlines in HTML DOMs are represented by U+000A LF characters,
        # and there are never any U+000D CR characters in the input to the tokenization stage.
        if ($char === "\r") {
            if ($this->data->peekChar() === "\n") {
                // CR+LF pair: skip the CR and remember that this position was normalized
                $char = $this->data->nextChar();
                $this->normalized[$this->data->posChar()] = true;
            } else {
                // bare CR: silently change the character to LF;
                // the bare CR will be trivial to process when seeking backwards
                $char = "\n";
            }
        } elseif ($char === '') {
            $this->eof = true;
        }

        // nothing more to do when positions are not tracked or at end of input
        if (!$this->track || $char === '') {
            return $char;
        }

        if ($char === "\n") {
            // remember the column the line ended at so it can be restored when moving backwards
            $this->newlines[$this->data->posChar()] = $this->_column;
            $this->_column = 0;
            $this->_line++;
            return $char;
        }

        $this->_column++;
        $len = strlen($char);
        $here = $this->data->posChar();
        // only inspect positions beyond the last reported error so that
        // re-consuming a character does not report the same error twice
        if ($this->lastError < $here) {
            // look for erroneous characters; $len is the byte length of the UTF-8 sequence
            if ($len === 1) {
                $ord = ord($char);
                // C0 controls other than NUL, TAB, LF and FF, plus DEL
                if (($ord < 0x20 && !in_array($ord, [0x0, 0x9, 0xA, 0xC], true)) || $ord === 0x7F) {
                    $this->error(ParseError::CONTROL_CHARACTER_IN_INPUT_STREAM);
                    $this->lastError = $here;
                }
            } elseif ($len === 2) {
                // 0xC2 0x80..0x9F encodes the C1 controls U+0080..U+009F
                if (ord($char[0]) === 0xC2) {
                    $ord = ord($char[1]);
                    if ($ord >= 0x80 && $ord <= 0x9F) {
                        $this->error(ParseError::CONTROL_CHARACTER_IN_INPUT_STREAM);
                        $this->lastError = $here;
                    }
                }
            } elseif ($len === 3) {
                $head = ord($char[0]);
                if ($head === 0xED) {
                    // 0xED 0xA0 0x80 .. 0xED 0xBF 0xBF encodes the surrogates U+D800..U+DFFF
                    $tail = (ord($char[1]) << 8) + ord($char[2]);
                    if ($tail >= 0xA080 && $tail <= 0xBFBF) {
                        $this->error(ParseError::SURROGATE_IN_INPUT_STREAM);
                        $this->lastError = $here;
                    }
                } elseif ($head === 0xEF) {
                    $tail = (ord($char[1]) << 8) + ord($char[2]);
                    if (($tail >= 0xB790 && $tail <= 0xB7AF) || $tail >= 0xBFBE) {
                        // U+FDD0..U+FDEF, U+FFFE and U+FFFF
                        $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
                        $this->lastError = $here;
                    } elseif ($tail === 0xBFBD && $this->data->posErr === $here) {
                        // U+FFFD located at the decoder's reported error position
                        // NOTE(review): presumably a replacement character the decoder
                        // substituted for invalid input -- confirm against the decoder
                        $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
                        $this->lastError = $here;
                    }
                }
            } elseif ($len === 4) {
                // the last two bytes 0xBF 0xBE / 0xBF 0xBF mark U+xFFFE / U+xFFFF in any supplementary plane
                $tail = (ord($char[2]) << 8) + ord($char[3]);
                if ($tail >= 0xBFBE) {
                    $this->error(ParseError::NONCHARACTER_IN_INPUT_STREAM);
                    $this->lastError = $here;
                }
                // remember supplementary-plane characters; they count as two columns when moving backwards
                $this->astrals[$here] = true;
            }
        }
        return $char;
    }

    /**
     * Moves the decoder backwards by the given number of characters.
     *
     * If end of input had been reached, the first step only clears the EOF
     * flag and does not move the decoder.
     *
     * @param int $length The number of characters to step back; must be greater than zero
     * @param bool $retreatPointer Whether to recalculate the line and column positions as well
     */
    public function unconsume(int $length = 1, bool $retreatPointer = true): void
    {
        // NOTE: zero is rejected as well, despite the wording of the message
        assert($length > 0, new \Exception("Value must be non-negative"));
        if ($this->eof) {
            $length--;
            $this->eof = false;
        }
        while ($length-- > 0) {
            $here = $this->data->posChar();
            // if the previous character was a normalized CR+LF pair, we need to go back two
            if (isset($this->normalized[$here])) {
                // NOTE: This case is never encountered by the parser
                $this->data->seek(-1); // @codeCoverageIgnore
            }
            // recalculate line and column positions, if requested
            if ($retreatPointer && $this->track) {
                // NOTE: These cases are never encountered by the parser
                // @codeCoverageIgnoreStart
                // isset() rather than a truthiness test: an empty line records
                // its newline at column 0, which must still count as a newline
                if (isset($this->newlines[$here])) {
                    $this->_column = $this->newlines[$here] + 1;
                    $this->_line--;
                } elseif ($this->astrals[$here] ?? false) {
                    // supplementary plane characters count as two
                    $this->_column--;
                }
                // @codeCoverageIgnoreEnd
                $this->_column--;
            }
            $this->data->seek(-1);
        }
    }

    /**
     * Consumes characters using the decoder's asciiSpan() method and advances
     * the column counter by the number of characters consumed.
     *
     * @param string $match The characters passed to asciiSpan() (presumably the set of characters to accept)
     * @param int|null $limit Passed through to asciiSpan() (presumably the maximum number of characters to consume)
     * @return string The value returned by asciiSpan()
     */
    public function consumeWhile(string $match, ?int $limit = null): string
    {
        $start = $this->data->posChar();
        $out = $this->data->asciiSpan($match, $limit);
        if ($this->track) {
            $this->_column += ($this->data->posChar() - $start);
        }
        return $out;
    }

    /**
     * Consumes characters using the decoder's asciiSpanNot() method and
     * advances the column counter by the number of characters consumed.
     *
     * CR and LF are always added to the stop set; when tracking is enabled,
     * control characters are added as well.
     *
     * @param string $match The characters passed to asciiSpanNot() (presumably the set of characters to stop at)
     * @param int|null $limit Passed through to asciiSpanNot() (presumably the maximum number of characters to consume)
     * @return string The value returned by asciiSpanNot()
     */
    public function consumeUntil(string $match, ?int $limit = null): string
    {
        $start = $this->data->posChar();
        if ($this->track) {
            // control characters produce parse errors
            $match .= "\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F";
        }
        $out = $this->data->asciiSpanNot($match."\r\n", $limit);
        if ($this->track) {
            $this->_column += ($this->data->posChar() - $start);
        }
        return $out;
    }

    /**
     * Returns the result of the decoder's peekChar() for the given length.
     *
     * @param int $length The number of characters to peek at; must be greater than zero
     */
    public function peek(int $length = 1): string
    {
        // NOTE: zero is rejected as well, despite the wording of the message
        assert($length > 0, new \Exception("Value must be non-negative"));
        return $this->data->peekChar($length);
    }

    /**
     * Returns an indexed array with the line and column positions of the
     * requested offset from the current position
     *
     * When position tracking is disabled, [0, 0] is returned.
     *
     * @param int $relativePos The offset, in characters, relative to the current position
     * @return array The line number followed by the column number
     */
    public function whereIs(int $relativePos): array
    {
        if ($this->track) {
            if ($this->eof) {
                // at end of input the offset is shifted forward by one,
                // and by one more if the current position is marked as an astral character
                $relativePos++;
                if ($this->astrals[$this->data->posChar()] ?? false) {
                    $relativePos++;
                }
            }
            if ($relativePos === 0) {
                if (!$this->_column && $this->_line > 1) {
                    // column zero of a later line: report the end of the previous line instead
                    return [$this->_line - 1, $this->newlines[$this->data->posChar()] + 1];
                } else {
                    return [$this->_line, $this->_column];
                }
            } elseif ($relativePos < 0) {
                $pos = $this->data->posChar();
                $line = $this->_line;
                $col = $this->_column;
                do {
                    // If the current position is the start of a line,
                    // get the column position of the end of the previous line
                    // NOTE: These cases are never encountered by the parser
                    // @codeCoverageIgnoreStart
                    if (isset($this->newlines[$pos])) {
                        $line--;
                        $col = $this->newlines[$pos] + 1;
                        // If the newline was a normalized CR+LF pair,
                        // go back one extra character
                        if (isset($this->normalized[$pos])) {
                            $pos--;
                        }
                    } elseif ($this->astrals[$pos] ?? false) {
                        // supplementary plane characters count as two
                        $col--;
                    }
                    // @codeCoverageIgnoreEnd
                    $col--;
                    $pos--;
                } while (++$relativePos < 0);
                return [$line, $col];
            } else {
                // looking ahead: assume the offset stays on the current line
                return [$this->_line, $this->_column + $relativePos];
            }
        } else {
            return [0, 0]; // @codeCoverageIgnore
        }
    }

    /**
     * Provides read-only access to the 'column', 'line' and 'pointer'
     * pseudo-properties; any other name yields null.
     */
    public function __get($property)
    {
        switch ($property) {
            case 'column':
                return $this->_column; // @codeCoverageIgnore
            case 'line':
                return $this->_line; // @codeCoverageIgnore
            case 'pointer':
                return $this->data->posChar();
            default:
                return null; // @codeCoverageIgnore
        }
    }

    /**
     * Changes the encoding used to decode the input while parsing is under way.
     *
     * If the bytes consumed so far read the same in the new encoding, the
     * decoder is replaced and parsing continues from the same position.
     * Otherwise a fresh decoder is created and an exception is thrown so that
     * the caller can start parsing over.
     *
     * @param string $encoding The new encoding; must already be a name as returned by Charset::fromCharset()
     * @throws EncodingChangeException if the decoder could not be switched on the fly
     */
    public function changeEncoding(string $encoding): void
    {
        $newEncoding = $encoding;
        $oldEncoding = $this->encoding;
        assert(!$this->encodingCertain, new \Exception("Encoding is already certain"));
        assert($oldEncoding === Charset::fromCharset($oldEncoding), new \Exception("Current encoding '{$this->encoding}' is invalid"));
        assert($newEncoding === Charset::fromCharset($newEncoding), new \Exception("Invalid encoding name '$encoding'"));

        # When the parser requires the user agent to change the encoding,
        # it must run the following steps. This might happen if the encoding
        # sniffing algorithm described above failed to find a character encoding,
        # or if it found a character encoding that was not the actual
        # encoding of the file.

        # If the encoding that is already being used to interpret the input
        # stream is UTF-16BE/LE, then set the confidence to certain and
        # return. The new encoding is ignored; if it was anything but the
        # same encoding, then it would be clearly incorrect.
        if (in_array($oldEncoding, ["UTF-16LE", "UTF-16BE"], true)) {
            $this->encodingCertain = true;
            return;
        }
        # If the new encoding is UTF-16BE/LE, then change it to UTF-8.
        if (in_array($newEncoding, ["UTF-16LE", "UTF-16BE"], true)) {
            $newEncoding = "UTF-8";
        }
        # If the new encoding is x-user-defined, then change it to windows-1252.
        if ($newEncoding === "x-user-defined") {
            $newEncoding = "windows-1252";
        }
        # If the new encoding is identical or equivalent to the encoding
        # that is already being used to interpret the input stream, then
        # set the confidence to certain and return. This happens when the
        # encoding information found in the file matches what the encoding
        # sniffing algorithm determined to be the encoding, and in the second
        # pass through the parser if the first pass found that the encoding
        # sniffing algorithm described in the earlier section failed to find
        # the right encoding.
        if ($newEncoding === $oldEncoding) {
            $this->encodingCertain = true;
            return;
        }
        # If all the bytes up to the last byte converted by the current decoder
        # have the same Unicode interpretations in both the current encoding
        # and the new encoding, and if the user agent supports changing the
        # converter on the fly, then the user agent may change to the new
        # converter for the encoding on the fly.
        # Set the document's character encoding and the encoding used to
        # convert the input stream to the new encoding, set the confidence
        # to certain, and return.
        // This implementation considers the bytes to have the same
        // interpretation if they are all ASCII. This does require special
        // handling for those encodings which are not quite ASCII-compatible
        // (only ISO 2022-JP), but is relatively simple to confirm
        $this->encoding = $newEncoding;
        $this->encodingCertain = true;
        $bytes = $this->data->posByte();
        $chars = $this->data->posChar();
        if ($bytes === $chars) {
            if ($newEncoding === "ISO-2022-JP") {
                // 0x0E, 0x0F, and 0x1B are excluded from the ASCII range as these
                // have different interpretation in ISO-2022-JP encoding
                $disqualifying = '/[\x0E\x0F\x1B\x80-\xFF]/';
            } else {
                $disqualifying = '/[\x80-\xFF]/';
            }
            // Search the consumed prefix for a disqualifying byte rather than
            // matching with a "{n}" quantifier: PCRE rejects counts above 65535,
            // which would make the check fail for any larger amount of input.
            // The strict comparison treats a PCRE error as "not the same".
            if (preg_match($disqualifying, substr($this->string, 0, $bytes)) === 0) {
                // The bytes are the same; change the encoding, seek to the same location, and continue parsing
                $this->data = Encoding::createDecoder($newEncoding, $this->string, false, true);
                $this->data->seek($chars);
                return;
            }
        }
        // If the bytes are not the same we have to throw everything out and start over
        // The simplest way, ugly though it is, is to throw an exception to unwind all
        // the way back to the invocation of the parser
        $this->data = Encoding::createDecoder($newEncoding, $this->string, false, true);
        throw new EncodingChangeException;
    }
}