Browse Source

Implement XML infoset coercion

ns
J. King 3 years ago
parent
commit
f50c46952e
  1. 39
      lib/DOM/Document.php
  2. 26
      lib/DOM/Element.php
  3. 26
      lib/DOM/traits/EscapeString.php
  4. 29
      lib/TreeBuilder.php
  5. 28
      tests/cases/TestTreeConstructor.php

39
lib/DOM/Document.php

@ -96,12 +96,13 @@ class Document extends \DOMDocument {
$e->content = $this->createDocumentFragment();
}
return $e;
} catch (\DOMException) {
} catch (\DOMException $e) {
// The element name is invalid for XML
// Replace any offending characters with "UHHHHHH" where H is the
// uppercase hexadecimal digits of the character's code point
$this->mangledElements = true;
$name = $this->CoerceName($name);
$name = $this->coerceName($name);
return parent::createElement($name, $value);
}
}
@ -113,13 +114,39 @@ class Document extends \DOMDocument {
$e->content = $this->createDocumentFragment();
}
return $e;
} catch (\DOMException) {
throw $e;
} catch (\DOMException $e) {
// The element name is invalid for XML
// Replace any offending characters with "UHHHHHH" where H is the
// Replace any offending characters with "UHHHHHH" where H are the
// uppercase hexadecimal digits of the character's code point
$this->mangledElements = true;
$qualifiedName = $this->CoerceName($qualifiedName);
$qualifiedName = $this->coerceName($qualifiedName);
return parent::createElementNS($namespaceURI, $qualifiedName, $value);
}
}
// Creates an attribute node, falling back to an XML-safe name when the
// requested name is rejected by the underlying DOM implementation.
public function createAttribute($name) {
    try {
        $attr = parent::createAttribute($name);
    } catch (\DOMException $e) {
        // The attribute name is invalid for XML
        // Record that attribute-name mangling has happened, then retry with
        // every offending character replaced by "UHHHHHH", where H are the
        // uppercase hexadecimal digits of the character's code point
        $this->mangledAttributes = true;
        $attr = parent::createAttribute($this->coerceName($name));
    }
    return $attr;
}
// Namespace-aware counterpart of createAttribute(): creates the attribute
// and retries with a coerced qualified name if the first attempt throws.
public function createAttributeNS($namespaceURI, $qualifiedName) {
    try {
        $attr = parent::createAttributeNS($namespaceURI, $qualifiedName);
    } catch (\DOMException $e) {
        // The attribute name is invalid for XML
        // Record that attribute-name mangling has happened, then retry with
        // every offending character replaced by "UHHHHHH", where H are the
        // uppercase hexadecimal digits of the character's code point
        $this->mangledAttributes = true;
        $attr = parent::createAttributeNS($namespaceURI, $this->coerceName($qualifiedName));
    }
    return $attr;
}

26
lib/DOM/Element.php

@ -12,6 +12,32 @@ class Element extends \DOMElement {
protected const SELF_CLOSING_ELEMENTS = ['area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'];
// Sets an attribute on this element, retrying with an XML-safe name when
// the requested name is rejected by the underlying DOM implementation.
public function setAttribute($name, $value) {
    try {
        parent::setAttribute($name, $value);
        return;
    } catch (\DOMException $e) {
        // The attribute name is invalid for XML
        // Flag the owner document so that later consumers know attribute
        // names may have been mangled
        $this->ownerDocument->mangledAttributes = true;
    }
    // Replace any offending characters with "UHHHHHH" where H are the
    // uppercase hexadecimal digits of the character's code point
    parent::setAttribute($this->coerceName($name), $value);
}
// Namespace-aware counterpart of setAttribute(): sets the attribute and
// retries with a coerced qualified name if the first attempt throws.
public function setAttributeNS($namespaceURI, $qualifiedName, $value) {
    try {
        parent::setAttributeNS($namespaceURI, $qualifiedName, $value);
        return;
    } catch (\DOMException $e) {
        // The attribute name is invalid for XML
        // Flag the owner document so that later consumers know attribute
        // names may have been mangled
        $this->ownerDocument->mangledAttributes = true;
    }
    // Replace any offending characters with "UHHHHHH" where H are the
    // uppercase hexadecimal digits of the character's code point
    parent::setAttributeNS($namespaceURI, $this->coerceName($qualifiedName), $value);
}
public function isMathMLTextIntegrationPoint(): bool {
return (
$this->namespaceURI === Parser::MATHML_NAMESPACE && (

26
lib/DOM/traits/EscapeString.php

@ -23,12 +23,30 @@ trait EscapeString {
return $string;
}
// Coerces an arbitrary name into one which is valid as an XML name.
//
// Every character which is not allowed in an XML name is replaced with
// "UHHHHHH", where H are the uppercase hexadecimal digits of the
// character's code point, zero-padded to six digits. The first character
// is held to the stricter NameStartChar production and is escaped the
// same way if it does not qualify.
//
// Returns the coerced name; a name which is already valid is returned
// unchanged.
protected function coerceName(string $name): string {
    // This matches the inverse of the production of NameChar in XML 1.0,
    // with the added exclusion of ":" from allowed characters
    // See https://www.w3.org/TR/REC-xml/#NT-NameStartChar
    // The result is checked because preg_match_all() fails (rather than
    // matching nothing) on malformed UTF-8, leaving no match list to walk
    if (preg_match_all('/[^\-\.0-9\x{B7}\x{300}-\x{36F}\x{203F}-\x{2040}A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u', $name, $m)) {
        // Each distinct offending character is replaced everywhere at once
        foreach (array_unique($m[0], \SORT_STRING) as $c) {
            $esc = "U".str_pad(strtoupper(dechex(\IntlChar::ord($c))), 6, "0", \STR_PAD_LEFT);
            $name = str_replace($c, $esc, $name);
        }
    }
    // Apply stricter rules to the first character
    if (preg_match('/^[^A-Za-z_\x{C0}-\x{D6}\x{D8}-\x{F6}\x{F8}-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}]/u', $name, $m)) {
        $c = $m[0];
        $esc = "U".str_pad(strtoupper(dechex(\IntlChar::ord($c))), 6, "0", \STR_PAD_LEFT);
        // $c may be multi-byte, so strip it by its byte length
        $name = $esc.substr($name, strlen($c));
    }
    return $name;
}
// Reverses coerceName(): every "UHHHHHH" sequence (an uppercase "U"
// followed by exactly six uppercase hexadecimal digits) is replaced with
// the character whose code point those digits encode.
//
// The name is decoded in a single pass, so text produced by one
// replacement is never itself reinterpreted as an escape sequence.
// Sequences which do not encode a valid code point (e.g. "UFFFFFF") are
// left untouched, since IntlChar::chr() yields null for them.
protected function uncoerceName(string $name): string {
    $decoded = preg_replace_callback('/U[0-9A-F]{6}/', function(array $match): string {
        $c = \IntlChar::chr(hexdec(substr($match[0], 1)));
        return $c ?? $match[0];
    }, $name);
    // preg_replace_callback() returns null on a PCRE error; fall back to
    // the input rather than violating the declared return type
    return $decoded ?? $name;
}
}

29
lib/TreeBuilder.php

@ -3,7 +3,7 @@ declare(strict_types=1);
namespace dW\HTML5;
class TreeBuilder {
use ParseErrorEmitter;
use ParseErrorEmitter, EscapeString;
public $debugLog = "";
@ -292,6 +292,12 @@ class TreeBuilder {
$this->debugLog .= "EMITTED: ".constant(get_class($token)."::NAME")."\n";
return true;
})());
// If element name coercion has occurred at some earlier point,
// we must coerce all end tag names to match mangled start tags
if ($token instanceof EndTagToken && $this->DOM->mangledElements) {
$token->name = $this->coerceName($token->name);
}
// Loop used for reprocessing.
$iterations = 0;
while (true) {
@ -373,6 +379,14 @@ class TreeBuilder {
return true;
})());
// If attribute name coercion has occurred at some earlier point,
// we must coerce all attributes on html and body start tags in
// case they are relocated to existing elements
if ($token instanceof StartTagToken && $this->DOM->mangledAttributes && in_array($token->name, ["html", "body"])) {
foreach ($token->attributes as $attr) {
$attr->name = $this->coerceName($attr->name);
}
}
# 13.2.6.4. The rules for parsing tokens in HTML content
# 13.2.6.4.1. The "initial" insertion mode
if ($insertionMode === self::INITIAL_MODE) {
@ -4198,7 +4212,18 @@ class TreeBuilder {
$element = $document->createElementNS($namespace, $localName);
# Append each attribute in the given token to element.
foreach ($token->attributes as $attr) {
$element->setAttributeNS(null, $attr->name, $attr->value);
$ns = null;
if ($namespace) {
// Determine the namespace URI for the prefix, if any
if (strpos($attr->name, "xml:") === 0) {
$ns = Parser::XML_NAMESPACE;
} elseif (strpos($attr->name, "xmlns:") === 0) {
$ns = Parser::XMLNS_NAMESPACE;
} elseif (strpos($attr->name, "xlink:") === 0) {
$ns = Parser::XLINK_NAMESPACE;
}
}
$element->setAttributeNS($ns, $attr->name, $attr->value);
}
# If element has an xmlns attribute in the XMLNS namespace whose value
# is not exactly the same as the element's namespace, that is a

28
tests/cases/TestTreeConstructor.php

@ -22,6 +22,8 @@ use dW\HTML5\TreeBuilder;
* @covers \dW\HTML5\Stack
*/
class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
use \dW\HTML5\EscapeString;
protected $out;
protected $depth;
@ -102,19 +104,6 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
}
}
}
if (in_array($data, [
//'<!DOCTYPE html><html xml:lang=bar><html xml:lang=foo>',
//'<!DOCTYPE html><body xlink:href=foo><svg xlink:href=foo></svg>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo></g></svg>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo /></svg>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo />bar</svg>',
//'<!DOCTYPE html><body xlink:href=foo><math xlink:href=foo></math>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo></mi></math>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo /></math>',
//'<!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo />bar</math>',
])) {
$skip = 'Requires implementation of the "Coercing an HTML DOM into an infoset" specification section';
}
return [$exp, $patched, $skip];
}
@ -167,13 +156,20 @@ class TestTreeConstructor extends \PHPUnit\Framework\TestCase {
} else {
$prefix = "";
}
$this->push("<".$prefix.$e->localName.">");
$localName = $this->uncoerceName($e->localName);
$this->push("<".$prefix.$localName.">");
$this->depth++;
$attr = [];
foreach ($e->attributes as $a) {
$attr[$a->name] = $a->value;
$prefix = "";
if ($a->namespaceURI) {
$prefix = Parser::NAMESPACE_MAP[$a->namespaceURI];
assert((bool) $prefix, new \Exception("Prefix for namespace {$a->namespaceURI} is not defined"));
$prefix .= " ";
}
$attr[$prefix.$this->uncoerceName($a->name)] = $a->value;
}
ksort($attr);
ksort($attr, \SORT_STRING);
foreach ($attr as $k => $v) {
$this->push($k.'="'.$v.'"');
}

Loading…
Cancel
Save