Skip to content

Commit c210c32

Browse files
authored
Merge pull request #1 from sixgad/dev
feat: add customextractor
2 parents 9315d8e + 29e3e33 commit c210c32

2 files changed

Lines changed: 96 additions & 2 deletions

File tree

magic_html/__init__.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,39 @@
11
# -*- coding: utf-8 -*-
2-
2+
import json
33
from urllib.parse import urlparse
44
from magic_html.extractors.article_extractor import ArticleExtractor
55
from magic_html.extractors.weixin_extractor import WeixinExtractor
66
from magic_html.extractors.forum_extractor import ForumExtractor
7+
from magic_html.extractors.custom_extractor import CustomExtractor
78

89

910
class GeneralExtractor:
11+
def __init__(self, config_path=""):
12+
if config_path:
13+
"""
14+
demo rule config file json:
15+
{
16+
"www.***.com": {
17+
"clean": ["//script", "//style"],
18+
"title": {
19+
"mode": "xpath",
20+
"value": "//div[@class='media-body']/h4/text()"
21+
},
22+
"content": {
23+
"mode": "xpath",
24+
"value": "//div[@class='message break-all']"
25+
}
26+
}
27+
}
28+
"""
29+
try:
30+
with open(config_path, 'r', encoding='utf-8') as f:
31+
self.rule = json.loads(f.read())
32+
except:
33+
pass
34+
else:
35+
self.rule = {}
36+
1037
def extract(self, html="", **kwargs) -> dict:
1138
base_url = kwargs.get("base_url", "")
1239
html_type = kwargs.pop("html_type", None)
@@ -16,6 +43,16 @@ def extract(self, html="", **kwargs) -> dict:
1643
elif html_type == "weixin":
1744
return WeixinExtractor().extract(html=html, **kwargs)
1845
if base_url:
19-
if urlparse(base_url).netloc == "mp.weixin.qq.com":
46+
# The host part of base_url is the key used to look up a custom rule in self.rule.
netloc = urlparse(base_url).netloc
47+
if netloc in self.rule:
48+
try:
49+
new_kwargs = dict()
50+
new_kwargs["rule"] = self.rule[netloc]
51+
new_kwargs.update(kwargs)
52+
return CustomExtractor().extract(html=html, **new_kwargs)
53+
except:
54+
# When the custom rule does not cover every section of the site, fall back to the generic article extractor.
return ArticleExtractor().extract(html=html, **kwargs)
56+
if netloc == "mp.weixin.qq.com":
2057
return WeixinExtractor().extract(html=html, **kwargs)
2158
return ArticleExtractor().extract(html=html, **kwargs)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# -*- coding:utf-8 -*-
2+
import re
3+
4+
from magic_html.utils import *
5+
from magic_html.extractors.base_extractor import BaseExtractor
6+
from magic_html.extractors.title_extractor import TitleExtractor
7+
8+
9+
class CustomExtractor(BaseExtractor):
    """Extract a page's title and main content using a caller-supplied rule.

    A rule is a dict with optional ``"clean"`` (list of XPath expressions
    whose matches are removed), optional ``"title"`` and required
    ``"content"`` entries; the latter two are dicts whose ``"value"`` is an
    XPath expression.
    """

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by each XPath in *clean_rules*.

        Returns the same *tree* object after the removals.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate ``extract_rule["value"]`` as an XPath against *tree*.

        Returns:
            For expressions containing ``/text()``: all matches joined into
            one stripped string (empty string when nothing matches).
            Otherwise: the first match.

        Raises:
            KeyError: *extract_rule* has no ``"value"`` entry.
            IndexError: a non-text expression matched nothing.
        """
        xpath = extract_rule["value"]
        matches = tree.xpath(xpath)
        if "/text()" in xpath:
            return "".join(matches).strip()
        if not matches:
            # Same exception type as the bare [0] index, with a usable message.
            raise IndexError(f"custom rule XPath matched no nodes: {xpath!r}")
        return matches[0]

    def extract(self, html="", base_url="", rule=None) -> dict:
        """Apply *rule* to *html* and return the extracted article.

        Args:
            html: Raw HTML of the page.
            base_url: URL of the page; replaced by the document's
                ``<base href>`` when that value contains ``"http"``.
            rule: Extraction rule for this site (see the class docstring).
                ``None`` is treated as an empty rule.

        Returns:
            A dict with keys ``"xp_num"`` (always ``"custom"``),
            ``"drop_list"`` (always ``False``), ``"html"``, ``"title"`` and
            ``"base_url"``.

        Raises:
            ValueError: the HTML could not be loaded, or the ``"content"``
                rule is missing or could not be applied.
        """
        # None sentinel instead of a mutable {} default shared across calls.
        if rule is None:
            rule = {}

        tree = load_html(html)
        if tree is None:
            raise ValueError("html could not be loaded into a tree")

        # Prefer the document's own <base href> when it looks like a URL.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        # Title: use the rule when it defines one, otherwise the generic extractor.
        if "title" not in rule:
            title = TitleExtractor().process(tree)
        else:
            title = self.use_extract_rule(tree, rule["title"])

        # Article body: any failure is reported as ValueError so the caller
        # can fall back; KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as exc:
            raise ValueError("custom content rule could not be applied") from exc
        body_html = tostring(body_tree, encoding=str)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
        }

0 commit comments

Comments
 (0)