Skip to content

Commit 1d6aa33

Browse files
authored
Merge pull request #544 from osapon/master
(fix): Improved recognition of text encoding
2 parents: 85dccc6 + d6ac21f; commit: 1d6aa33

7 files changed

+2607
-1
lines changed

.gitattributes

+2
Original file line number | Diff line number | Diff line change
@@ -7,3 +7,5 @@
77
.php_cs.dist export-ignore
88
.travis.yml export-ignore
99
phpunit.xml.dist export-ignore
10+
/tests/cache/4pda.to.2022-12-04-406834-sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty.php working-tree-encoding=windows-1251 diff=windows-1251
11+
/tests/cache/www.itmedia.co.jp.news-articles-2410-28-news159.html.php working-tree-encoding=sjis diff=sjis

src/Document.php

+12-1
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,18 @@ public function __construct(Extractor $extractor)
2626
$html = str_replace('<br>', "\n<br>", $html);
2727
$html = str_replace('<br ', "\n<br ", $html);
2828

29-
$this->document = !empty($html) ? Parser::parse($html) : new DOMDocument();
29+
$encoding = null;
30+
$contentType = $extractor->getResponse()->getHeaderLine('content-type');
31+
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match);
32+
if (!empty($match[1])) {
33+
$encoding = $match[1];
34+
} elseif (!empty($html)) {
35+
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match);
36+
if (!empty($match[1])) {
37+
$encoding = $match[1];
38+
}
39+
}
40+
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
3041
$this->initXPath();
3142
}
3243

tests/PagesTest.php

+2
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@ public function urlDataProvider(): array
5252
['http://www.ustream.tv/channel/red-shoes-billiards-60803-camera-1'],
5353
['http://www.viddler.com/v/bdce8c7'],
5454
['http://www.wired.com/?p=2064839'],
55+
['https://www.itmedia.co.jp/news/articles/2410/28/news159.html'],
56+
['https://4pda.to/2022/12/04/406834/sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty/'],
5557
];
5658
}
5759

tests/cache/4pda.to.2022-12-04-406834-sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty.php

+470
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)