Merge pull request #1 from sixgad/dev

sixgad · web-flow · commit c210c32cfe0b · 2024-05-22T14:24:22.000+08:00
feat: add customextractor
diff --git a/magic_html/__init__.py b/magic_html/__init__.py
@@ -1,12 +1,39 @@
 # -*- coding: utf-8 -*-
-
+import json
 from urllib.parse import urlparse
 from magic_html.extractors.article_extractor import ArticleExtractor
 from magic_html.extractors.weixin_extractor import WeixinExtractor
 from magic_html.extractors.forum_extractor import ForumExtractor
+from magic_html.extractors.custom_extractor import CustomExtractor
 
 
 class GeneralExtractor:
+    def __init__(self, config_path=""):
+        """Load optional per-site rules from a JSON file; self.rule is always set.
+
+        demo rule config file json:
+        {
+            "www.***.com": {
+                "clean": ["//script", "//style"],
+                "title": {
+                    "mode": "xpath",
+                    "value": "//div[@class='media-body']/h4/text()"
+                },
+                "content": {
+                    "mode": "xpath",
+                    "value": "//div[@class='message break-all']"
+                }
+            }
+        }
+        """
+        self.rule = {}
+        if config_path:
+            try:
+                with open(config_path, 'r', encoding='utf-8') as f:
+                    self.rule = json.load(f)
+            except (OSError, ValueError):
+                pass  # unreadable or invalid config: keep the empty rule set
+
     def extract(self, html="", **kwargs) -> dict:
         base_url = kwargs.get("base_url", "")
         html_type = kwargs.pop("html_type", None)
@@ -16,6 +43,16 @@ def extract(self, html="", **kwargs) -> dict:
             elif html_type == "weixin":
                 return WeixinExtractor().extract(html=html, **kwargs)
         if base_url:
-            if urlparse(base_url).netloc == "mp.weixin.qq.com":
+            netloc = urlparse(base_url).netloc
+            if netloc in self.rule:
+                try:
+                    new_kwargs = dict()
+                    new_kwargs["rule"] = self.rule[netloc]
+                    new_kwargs.update(kwargs)
+                    return CustomExtractor().extract(html=html, **new_kwargs)
+                except Exception:
+                    # Used when the custom rule does not cover every section of the site.
+                    return ArticleExtractor().extract(html=html, **kwargs)
+            if netloc == "mp.weixin.qq.com":
                 return WeixinExtractor().extract(html=html, **kwargs)
         return ArticleExtractor().extract(html=html, **kwargs)
diff --git a/magic_html/extractors/custom_extractor.py b/magic_html/extractors/custom_extractor.py
@@ -0,0 +1,57 @@
+# -*- coding:utf-8 -*-
+import re
+
+from magic_html.utils import *
+from magic_html.extractors.base_extractor import BaseExtractor
+from magic_html.extractors.title_extractor import TitleExtractor
+
+
+class CustomExtractor(BaseExtractor):
+    """Extract a page's title and content with user-supplied XPath rules."""
+
+    def use_clean_rule(self, tree, clean_rules):
+        """Call remove_node on every node matched by clean_rules; return tree."""
+        for clean_rule in clean_rules:
+            for node in tree.xpath(clean_rule):
+                self.remove_node(node)
+        return tree
+
+    def use_extract_rule(self, tree, extract_rule):
+        """Return joined text for a "/text()" rule, else the first matched node."""
+        if "/text()" in extract_rule["value"]:
+            return "".join(tree.xpath(extract_rule["value"])).strip()
+        return tree.xpath(extract_rule["value"])[0]
+
+    def extract(self, html="", base_url="", rule=None) -> dict:
+        """Apply rule to html; raise ValueError if loading or the content rule fails."""
+        rule = {} if rule is None else rule  # no shared mutable default
+        tree = load_html(html)
+        if tree is None:
+            raise ValueError("load_html returned no tree")
+        # A <base href> containing "http" overrides the caller's base_url.
+        base_href = tree.xpath("//base/@href")
+        if base_href and "http" in base_href[0]:
+            base_url = base_href[0]
+        if "clean" in rule:
+            tree = self.use_clean_rule(tree, rule["clean"])
+
+        # Title: use the custom rule when given, else TitleExtractor.
+        if "title" not in rule:
+            title = TitleExtractor().process(tree)
+        else:
+            title = self.use_extract_rule(tree, rule["title"])
+
+        # Content region: a missing or non-matching rule becomes ValueError.
+        try:
+            body_tree = self.use_extract_rule(tree, rule["content"])
+        except Exception as err:
+            raise ValueError("content rule failed") from err
+        body_html = tostring(body_tree, encoding=str)
+
+        return {
+            "xp_num": "custom",
+            "drop_list": False,
+            "html": body_html,
+            "title": title,
+            "base_url": base_url
+        }