Skip to content

Commit 6fd1629

Browse files
committed
Use new parsers for DOMParser in PHP 8.4 when available
Because the new HTML parser uses UTF-8 as its fallback encoding, we have adjusted the configured fallback encoding of our own parser to match.
1 parent 024011f commit 6fd1629

File tree

7 files changed

+175
-52
lines changed

7 files changed

+175
-52
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ Like the standard interface, it will parse either HTML or XML documents. This im
4040

4141
- Any XML MIME content-type (e.g. `application/rss+xml`) is acceptable, not just the restricted list mandated by the interface
4242
- MIME content-types may include a `charset` parameter to specify an authoritative encoding of the document
43-
- If no `charset` is provided encoding will be detected from document hints; the default encoding for HTML is `windows-1252` and for XML `UTF-8`
43+
- If no `charset` is provided, encoding will be detected from document hints; the default encoding is `UTF-8`
4444
- `InvalidArgumentException` is thrown in place of JavaScript's `TypeError`
4545

4646
### Parsing into existing documents

lib/DOMParser.php

+47-17
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
declare(strict_types=1);
77
namespace MensBeam\HTML;
88

9+
use MensBeam\HTML\Parser\Config;
910
use MensBeam\Mime\MimeType;
1011
use MensBeam\Intl\Encoding;
1112

@@ -46,7 +47,7 @@ class DOMParser {
4647
"csbig5", "x-x-big5", "x-euc-jp", "ms932", "windows-31j", "x-sjis",
4748
"cseuckr", "euc-kr", "replacement",
4849
];
49-
/** @var array A List of canonical encoding names DOMDocument does not understand, with liases to labels it does understand */
50+
/** @var array A List of canonical encoding names DOMDocument does not understand, with aliases to labels it does understand */
5051
const ENCODING_ALIAS_MAP = [
5152
'windows-1258' => "x-cp1258",
5253
'GBK' => "x-gbk",
@@ -65,34 +66,62 @@ class DOMParser {
6566
* detection
6667
*
6768
* For the XML parser, if `$string` cannot be parsed, then the returned
68-
* `DOMDocument` will contain elements describing the resulting error
69+
* document will contain elements describing the resulting error
6970
*
7071
* If no encoding is specified and none can be detected from the document,
71-
* the default encoding is Windows-1252 for HTML and UTF-8 for XML
72+
* the default encoding is UTF-8 for both HTML and XML
73+
*
74+
* @return \DOMDocument|\Dom\HTMLDocument|\Dom\XMLDocument
7275
*/
73-
public function parseFromString(string $string, string $type): \DOMDocument {
74-
// start by parsing the type
76+
public function parseFromString(string $string, string $type) {
77+
// parse the Content-Type
7578
$t = MimeType::parseBytes($type);
79+
// determine authoritative encoding from BOM or Content-Type
80+
$encoding = Encoding::sniffBOM($string) ?? $t->params['charset'] ?? "";
81+
$label = Encoding::matchLabel($encoding);
82+
if ($label) {
83+
$encoding = $label['name'];
84+
} else {
85+
$encoding = null;
86+
}
7687
// parse the string as either HTML or XML
7788
if ($t->isHtml) {
78-
// for HTML we invoke our parser which has its own handling for everything
79-
return $this->createDocumentHtml($string, $type);
89+
// if we're using PHP 8.4, we can use the modern built-in parser
90+
if ($this->useNewParsers()) {
91+
return \Dom\HTMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT, $encoding);
92+
}
93+
// otherwise we invoke our parser which has its own handling for everything
94+
$c = new Config;
95+
$c->encodingFallback = "UTF-8";
96+
return Parser::parse($string, $encoding, $c)->document;
8097
} elseif ($t->isXml) {
81-
// for XML we have to jump through a few hoops to deal with
82-
// encoding
83-
return $this->createDocumentXml($this->fixXmlEncoding($string, $t->params['charset'] ?? ""));
98+
// for XML we have to jump through a few hoops to deal with errors,
99+
// as well as with encoding, so we put this in
100+
// another function.
101+
return $this->createDocumentXml($string, $encoding);
84102
} else {
85103
throw new \InvalidArgumentException("\$type must be \"text/html\" or an XML type");
86104
}
87105
}
88106

89-
protected function createDocumentHtml(string $string, string $type): \DOMDocument {
90-
return Parser::parse($string, $type)->document;
107+
protected function useNewParsers(): bool {
108+
return class_exists(\Dom\Document::class);
91109
}
92110

93-
protected function createDocumentXml(string $string): \DOMDocument {
94-
$document = new \DOMDocument;
95-
if (!$document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
111+
protected function createDocumentXml(string $string, ?string $encoding) {
112+
$string = $this->fixXmlEncoding($string, $encoding ?? "");
113+
try {
114+
if ($this->useNewParsers()) {
115+
return \Dom\XMLDocument::createFromString($string, \LIBXML_NOERROR | \LIBXML_COMPACT);
116+
} else {
117+
$document = new \DOMDocument;
118+
if ($document->loadXML($string, \LIBXML_NONET | \LIBXML_BIGLINES | \LIBXML_COMPACT |\LIBXML_NOWARNING | \LIBXML_NOERROR)) {
119+
return $document;
120+
} else {
121+
throw new \Exception;
122+
}
123+
}
124+
} catch (\Exception $e) {
96125
$err = libxml_get_last_error();
97126
$message = trim(htmlspecialchars($err->message, \ENT_NOQUOTES | \ENT_SUBSTITUTE | \ENT_XML1, "UTF-8"));
98127
$string = <<<XMLDOC
@@ -104,9 +133,8 @@ protected function createDocumentXml(string $string): \DOMDocument {
104133
column="{$err->column}"
105134
>{$err->code}: "$message" on line {$err->line}, column {$err->column}</parsererror>
106135
XMLDOC;
107-
return $this->createDocumentXml($string);
136+
return $this->createDocumentXml($string, "UTF-8");
108137
}
109-
return $document;
110138
}
111139

112140
protected function fixXmlEncoding(string $string, string $encoding) {
@@ -162,6 +190,8 @@ protected function fixXmlEncoding(string $string, string $encoding) {
162190
} elseif ($charset === "UTF-16LE") {
163191
// if the string is UTF-16LE, adding a BOM is sufficient
164192
return self::BOM_UTF16LE.$string;
193+
} elseif ($charset === "replacement") {
194+
return "\u{FFFD}";
165195
} elseif ($charset) {
166196
// otherwise substitute the encoding declaration if any
167197
return "<?xml".$xmlVersion." encoding=\"$charset\"".$xmlStandalone."?>".substr($string, strlen($xmlDeclaration));

tests/cases/TestDOMParser.php

+17-19
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,18 @@
1212
* @covers \MensBeam\HTML\DOMParser
1313
*/
1414
class TestDOMParser extends \PHPUnit\Framework\TestCase {
15+
protected $p;
16+
17+
public function setUp(): void {
18+
$this->p = \Phake::partialMock(DOMParser::class);
19+
\Phake::when($this->p)->useNewParsers->thenReturn(false);
20+
}
21+
1522
/** @dataProvider provideDocuments */
1623
public function testParseADocument(string $input, string $type, string $exp): void {
17-
$p = new DOMParser;
18-
$document = $p->parseFromString($input, $type);
24+
$document = $this->p->parseFromString($input, $type);
1925
$this->assertSame($exp, $document->documentElement->textContent);
20-
$this->assertSame("html", $document->documentElement->tagName);
26+
$this->assertSame("html", $document->documentElement->localName);
2127
}
2228

2329
public function provideDocuments(): iterable {
@@ -27,7 +33,7 @@ public function provideDocuments(): iterable {
2733
};
2834
return [
2935
["Test", "text/html", "Test"],
30-
["Ol\xE9", "text/html", "Ol\u{E9}"],
36+
["Ol\u{E9}", "text/html", "Ol\u{E9}"],
3137
["Ol\u{E9}", "text/html;charset=utf8", "Ol\u{E9}"],
3238
["<meta charset=utf8>Ol\u{E9}", "text/html", "Ol\u{E9}"],
3339
["<html>Test</html>", "text/xml", "Test"],
@@ -37,10 +43,6 @@ public function provideDocuments(): iterable {
3743
["<?xml version='1.0' encoding='windows-1252'?><html>Ol\xE9</html>", "text/xml", "Ol\u{E9}"],
3844
["<html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
3945
["<html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
40-
["<?xml version='1.1' encoding='windows-1252'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
41-
["<?xml version='1.1' encoding='utf8'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
42-
["<?xml version='1.1'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
43-
["<?xml version='1.1' ?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
4446
["<?xml version='1.0' standalone='yes'?><html>Ol\u{E9}</html>", "text/xml;charset=UTF-8", "Ol\u{E9}"],
4547
["<?xml version='1.0' standalone='yes'?><html>Ol\xE9</html>", "text/xml;charset=windows-1252", "Ol\u{E9}"],
4648
["<?xml version='1.0'?><html>Ol\u{E9}</html>", "text/xml;charset=bogus", "Ol\u{E9}"],
@@ -59,33 +61,29 @@ public function provideDocuments(): iterable {
5961

6062
public function testFailToParseADocument(): void {
6163
$in = "<html>Test</html><!--Test-->Test";
62-
$p = new DOMParser;
63-
$d = $p->parseFromString($in, "text/xml");
64-
$this->assertSame("parsererror", $d->documentElement->tagName);
64+
$d = $this->p->parseFromString($in, "text/xml");
65+
$this->assertSame("parsererror", $d->documentElement->localName);
6566
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
6667
$this->assertNotSame("", trim($d->documentElement->textContent));
6768
}
6869

6970
public function testParseWithIncorrectType(): void {
7071
$in = "<html>Ol\u{E9}</html>";
71-
$p = new DOMParser;
7272
$this->expectException(\InvalidArgumentException::class);
73-
$p->parseFromString($in, "text/plain");
73+
$this->p->parseFromString($in, "text/plain");
7474
}
7575

7676
public function testParseWithInvalidEncodingInHeader(): void {
7777
$in = "<html>Test</html>";
78-
$p = new DOMParser;
79-
$d = $p->parseFromString($in, "text/xml;charset=csiso2022kr");
80-
$this->assertSame("parsererror", $d->documentElement->tagName);
78+
$d = $this->p->parseFromString($in, "text/xml;charset=csiso2022kr");
79+
$this->assertSame("parsererror", $d->documentElement->localName);
8180
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
8281
$this->assertNotSame("", trim($d->documentElement->textContent));
8382
}
8483
public function testParseWithInvalidEncodingInDocument(): void {
8584
$in = "<?xml version='1.0' encoding='bogus'?><html>Test</html>";
86-
$p = new DOMParser;
87-
$d = $p->parseFromString($in, "text/xml");
88-
$this->assertSame("parsererror", $d->documentElement->tagName);
85+
$d = $this->p->parseFromString($in, "text/xml");
86+
$this->assertSame("parsererror", $d->documentElement->localName);
8987
$this->assertSame("http://www.mozilla.org/newlayout/xml/parsererror.xml", $d->documentElement->namespaceURI);
9088
$this->assertNotSame("", trim($d->documentElement->textContent));
9189
}

tests/cases/TestDOMParserNew.php

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
<?php
2+
/** @license MIT
3+
* Copyright 2017 , Dustin Wilson, J. King et al.
4+
* See LICENSE and AUTHORS files for details */
5+
6+
declare(strict_types=1);
7+
namespace MensBeam\HTML\TestCase;
8+
9+
use MensBeam\HTML\DOMParser;
10+
11+
/**
12+
* @covers \MensBeam\HTML\DOMParser
13+
* @requires PHP >= 8.4
14+
*/
15+
class TestDOMParserNew extends TestDOMParser {
16+
protected $p;
17+
18+
public function setUp(): void {
19+
$this->p = \Phake::partialMock(DOMParser::class);
20+
\Phake::when($this->p)->useNewParsers->thenReturn(true);
21+
}
22+
}

tests/phpunit.dist.xml

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
</testsuite>
3131
<testsuite name="DOMParser">
3232
<file>cases/TestDOMParser.php</file>
33+
<file>cases/TestDOMParserNew.php</file>
3334
</testsuite>
3435
<testsuite name="Serializer">
3536
<file>cases/TestSerializer.php</file>

vendor-bin/phpunit/composer.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
"require": {
3-
"phpunit/phpunit": "^8.5 | ^9.0"
3+
"phpunit/phpunit": "^8.5 | ^9.0",
4+
"phake/phake": "^4.4"
45
}
56
}

vendor-bin/phpunit/composer.lock

+85-14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)