11# -*- coding: utf-8 -*-
2-
2+ import json
33from urllib .parse import urlparse
44from magic_html .extractors .article_extractor import ArticleExtractor
55from magic_html .extractors .weixin_extractor import WeixinExtractor
66from magic_html .extractors .forum_extractor import ForumExtractor
7+ from magic_html .extractors .custom_extractor import CustomExtractor
78
89
910class GeneralExtractor :
11+ def __init__ (self , config_path = "" ):
12+ if config_path :
13+ """
14+ demo rule config file json:
15+ {
16+ "www.***.com": {
17+ "clean": ["//script", "//style"],
18+ "title": {
19+ "mode": "xpath",
20+ "value": "//div[@class='media-body']/h4/text()"
21+ },
22+ "content": {
23+ "mode": "xpath",
24+ "value": "//div[@class='message break-all']"
25+ }
26+ }
27+ }
28+ """
29+ try :
30+ with open (config_path , 'r' , encoding = 'utf-8' ) as f :
31+ self .rule = json .loads (f .read ())
32+ except :
33+ pass
34+ else :
35+ self .rule = {}
36+
1037 def extract (self , html = "" , ** kwargs ) -> dict :
1138 base_url = kwargs .get ("base_url" , "" )
1239 html_type = kwargs .pop ("html_type" , None )
@@ -16,6 +43,16 @@ def extract(self, html="", **kwargs) -> dict:
1643 elif html_type == "weixin" :
1744 return WeixinExtractor ().extract (html = html , ** kwargs )
1845 if base_url :
19- if urlparse (base_url ).netloc == "mp.weixin.qq.com" :
46+ netloc = urlparse (base_url ).netloc
47+ if netloc in self .rule :
48+ try :
49+ new_kwargs = dict ()
50+ new_kwargs ["rule" ] = self .rule [netloc ]
51+ new_kwargs .update (kwargs )
52+ return CustomExtractor ().extract (html = html , ** new_kwargs )
53+ except :
54+ # 当自定义规则不能覆盖站点所有板块时,使用
55+ return ArticleExtractor ().extract (html = html , ** kwargs )
56+ if netloc == "mp.weixin.qq.com" :
2057 return WeixinExtractor ().extract (html = html , ** kwargs )
2158 return ArticleExtractor ().extract (html = html , ** kwargs )
0 commit comments