Skip to content

Commit

Permalink
Splitting named scrapers into their own files
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Feb 21, 2024
1 parent cf5be7d commit 110b64c
Show file tree
Hide file tree
Showing 10 changed files with 373 additions and 325 deletions.
325 changes: 0 additions & 325 deletions minet/scrape/classes/named.py

This file was deleted.

23 changes: 23 additions & 0 deletions minet/scrape/classes/named/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Dict, Type
from .types import NamedScraper

from .canonical import CanonicalScraper
from .europresse import EuropresseScraper
from .images import ImagesScraper
from .metas import MetasScraper
from .rss import RssScraper
from .title import TitleScraper
from .urls import UrlsScraper

# Registry of all built-in named scrapers, keyed by each class' `name`
# attribute so they can be looked up from scraper definitions.
NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = dict(
    (scraper_class.name, scraper_class)
    for scraper_class in (
        TitleScraper,
        CanonicalScraper,
        UrlsScraper,
        ImagesScraper,
        MetasScraper,
        RssScraper,
        EuropresseScraper,
    )
)
31 changes: 31 additions & 0 deletions minet/scrape/classes/named/canonical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup

from .types import NamedScraper


class CanonicalScraper(NamedScraper):
    """Named scraper extracting a page's canonical url from its
    `<link rel="canonical">` tag, if any.
    """

    name = "canonical"
    fieldnames = ["canonical_url"]
    plural = False
    output_type = "scalar"
    # Restrict parsing to the relevant <link> tags only
    strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})

    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
        # Only match tags actually carrying a href attribute
        link = soup.select_one("link[rel=canonical][href]")

        if link is not None:
            href = link.get("href")

            if href is not None:
                href = cast(str, href).strip()

                # An empty href (after stripping) is treated as absent
                if href:
                    return href

        return None
Loading

0 comments on commit 110b64c

Please sign in to comment.