Skip to content

Commit 7fc6ab0

Browse files
authored
Merge pull request #550 from osapon/master
Improved charset tag recognition accuracy.
2 parents a1da3d4 + 751ea8b commit 7fc6ab0

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed

src/Document.php

+20 -3
Original file line number | Diff line number | Diff line change
@@ -28,14 +28,31 @@ public function __construct(Extractor $extractor)
2828

2929
$encoding = null;
3030
$contentType = $extractor->getResponse()->getHeaderLine('content-type');
31-
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match);
31+
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match);
3232
if (!empty($match[1])) {
3333
$encoding = trim($match[1], ',');
34-
} elseif (!empty($html)) {
35-
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match);
34+
try {
35+
$ret = mb_encoding_aliases($encoding);
36+
if ($ret === false) {
37+
$encoding = null;
38+
}
39+
} catch (\ValueError $exception) {
40+
$encoding = null;
41+
}
42+
}
43+
if (is_null($encoding) && !empty($html)) {
44+
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match);
3645
if (!empty($match[1])) {
3746
$encoding = trim($match[1], ',');
3847
}
48+
try {
49+
$ret = mb_encoding_aliases($encoding);
50+
if ($ret === false) {
51+
$encoding = null;
52+
}
53+
} catch (\ValueError $exception) {
54+
$encoding = null;
55+
}
3956
}
4057
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
4158
$this->initXPath();

0 commit comments

Comments
 (0)