Adding builtin scraper for Europresse
bmaz committed Feb 22, 2024
1 parent da98a7a commit 7a6d336
Showing 12 changed files with 324 additions and 206 deletions.
1 change: 1 addition & 0 deletions docs/cli.md
@@ -1262,6 +1262,7 @@ Builtin scrapers:
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
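The shipped examples are collapsed in this diff view. A plausible invocation of the new scraper, assuming minet's usual -i/-o conventions and a CSV column pointing at downloaded Europresse HTML files (the exact flags are an assumption, not part of this diff):

    minet scrape europresse -i europresse_pages.csv -o articles.csv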
1 change: 1 addition & 0 deletions minet/cli/scrape/__init__.py
@@ -52,6 +52,7 @@ def resolve_arguments(cli_args):
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
206 changes: 0 additions & 206 deletions minet/scrape/classes/named.py

This file was deleted.

23 changes: 23 additions & 0 deletions minet/scrape/classes/named/__init__.py
@@ -0,0 +1,23 @@
from typing import Dict, Type
from .types import NamedScraper

from .canonical import CanonicalScraper
from .europresse import EuropresseScraper
from .images import ImagesScraper
from .metas import MetasScraper
from .rss import RssScraper
from .title import TitleScraper
from .urls import UrlsScraper

NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
s.name: s
for s in [
TitleScraper,
CanonicalScraper,
UrlsScraper,
ImagesScraper,
MetasScraper,
RssScraper,
EuropresseScraper,
]
}
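
The registry maps each scraper's name attribute to its class, so a caller can resolve a scraper from a plain string such as a CLI argument. A minimal sketch of that lookup (the surrounding wiring is not part of this diff, and no-argument construction is an assumption):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named import NAMED_SCRAPERS

    # Hypothetical document carrying a canonical link
    html = '<html><head><link rel="canonical" href="https://example.com/a"></head></html>'

    scraper = NAMED_SCRAPERS["canonical"]()  # assumed default-constructible
    soup = BeautifulSoup(html, "html.parser")
    print(scraper.scrape(soup))  # -> https://example.com/a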
31 changes: 31 additions & 0 deletions minet/scrape/classes/named/canonical.py
@@ -0,0 +1,31 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup

from .types import NamedScraper


class CanonicalScraper(NamedScraper):
name = "canonical"
fieldnames = ["canonical_url"]
plural = False
output_type = "scalar"
strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
link_elem = soup.select_one("link[rel=canonical][href]")

if link_elem is None:
return None

url = link_elem.get("href")

if url is None:
return None

url = cast(str, url).strip()

if not url:
return None

return url
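
Note the strainer class attribute: it lets a caller restrict BeautifulSoup to parsing only the tags the scraper actually needs, which is much faster on large pages. A sketch of that pattern, assuming this is the intended use (the calling code is not shown in this diff):

    from bs4 import BeautifulSoup

    scraper = CanonicalScraper()
    html = "<html>...</html>"  # hypothetical page source

    # Only <link rel="canonical"> elements get parsed at all
    soup = BeautifulSoup(html, "html.parser", parse_only=scraper.strainer)
    print(scraper.scrape(soup))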
74 changes: 74 additions & 0 deletions minet/scrape/classes/named/europresse.py
@@ -0,0 +1,74 @@
from typing import Any

import warnings
from datetime import datetime
from html import unescape
from bs4 import SoupStrainer, BeautifulSoup, MarkupResemblesLocatorWarning

from .types import NamedScraper


def extract_content(content):
    # Europresse embeds the article body as escaped HTML: unescape it, then
    # strip the remaining markup and surrounding whitespace. Parsing a bare
    # string makes bs4 emit a MarkupResemblesLocatorWarning, harmless here,
    # so it is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
        return BeautifulSoup(unescape(content), "html.parser").get_text().strip()


def extract_date(doc_id):
    # Europresse document ids are "·"-separated; the second field is the
    # publication date in YYYYMMDD form.
    return datetime.strptime(doc_id.split("·")[1], "%Y%m%d").date().isoformat()


def extract_media(media):
    # Keep only the media name, dropping anything after a comma, a newline
    # or a long run of padding spaces.
    return media.split(",", 1)[0].split("\n", 1)[0].split(" " * 16, 1)[0].strip()


def select_and_strip(elem, selector):
selected_elem = elem.select_one(selector)

if selected_elem is None:
return ""

return selected_elem.get_text().strip()


class EuropresseScraper(NamedScraper):
name = "europresse"
fieldnames = ["id", "title", "content", "url", "date", "media", "media_id"]
plural = True
output_type = "collection"
strainer = SoupStrainer(name="article")

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
articles = []
selectors = {
"title": ".titreArticle",
"id": ".publiC-lblNodoc",
"media": ".DocPublicationName",
}

        for elem in soup.select("article"):
            row = {}

            # The article body lives in .docOcurrContainer. A link whose text
            # contains "Cet article est paru dans" ("this article appeared
            # in") points back to the original publication, so keep its href
            # as the article's url.
            content = elem.select_one(".docOcurrContainer")
            if content is None:
                content = ""
            else:
                urls = content.select("a")
                for u in urls:
                    if "Cet article est paru dans" in u.get_text():
                        row["url"] = u.get("href")
                        break
                content = content.get_text()

            row["content"] = extract_content(content)

            for field, selector in selectors.items():
                row[field] = select_and_strip(elem, selector)

            # The "·"-separated document id also encodes the publication date
            # (second field) and a media identifier (third field).
            row["date"] = extract_date(row["id"])
            row["media"] = extract_media(row["media"])
            row["media_id"] = row["id"].split("·")[2]

articles.append(row)

return articles
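
A minimal sketch of running this scraper over a saved Europresse export (the file path is hypothetical, and no-argument construction is assumed):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named.europresse import EuropresseScraper

    scraper = EuropresseScraper()

    with open("europresse_export.html", encoding="utf-8") as f:  # hypothetical path
        # parse_only keeps just the <article> subtrees, per the class strainer
        soup = BeautifulSoup(f.read(), "html.parser", parse_only=scraper.strainer)

    for article in scraper.scrape(soup):
        print(article["date"], article["media"], article["title"])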
42 changes: 42 additions & 0 deletions minet/scrape/classes/named/images.py
@@ -0,0 +1,42 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup
from ural import should_follow_href
from urllib.parse import urljoin

from .types import NamedScraper


class ImagesScraper(NamedScraper):
name = "images"
fieldnames = ["src"]
plural = True
output_type = "list"
strainer = SoupStrainer(name="img")

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
img_elems = soup.select("img[src]")
base_url = context.get("url") if context is not None else None

urls = []

for img in img_elems:
url = img.get("src")

if url is None:
continue

url = cast(str, url).strip()

if not url:
continue

if not should_follow_href(url):
continue

if base_url:
url = urljoin(base_url, url)

urls.append(url)

return urls
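
The context argument is what resolves relative src values: when the caller supplies the page url, urljoin makes each src absolute. A short sketch (hypothetical markup and url):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named.images import ImagesScraper

    scraper = ImagesScraper()
    soup = BeautifulSoup('<img src="/logo.png">', "html.parser")

    print(scraper.scrape(soup))  # ['/logo.png']
    print(scraper.scrape(soup, context={"url": "https://example.com/page"}))
    # ['https://example.com/logo.png']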
