Adding builtin scraper for Europresse
bmaz committed Feb 20, 2024
1 parent da98a7a commit cf5be7d
Showing 3 changed files with 122 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/cli.md
@@ -1262,6 +1262,7 @@ Builtin scrapers:
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
1 change: 1 addition & 0 deletions minet/cli/scrape/__init__.py
@@ -52,6 +52,7 @@ def resolve_arguments(cli_args):
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
121 changes: 120 additions & 1 deletion minet/scrape/classes/named.py
@@ -1,14 +1,39 @@
from typing import Optional, List, Any, Dict, Type, cast

from bs4 import SoupStrainer, BeautifulSoup
from bs4 import SoupStrainer, BeautifulSoup, MarkupResemblesLocatorWarning
from datetime import datetime
from html import unescape
import locale
from urllib.parse import urljoin
from ural import should_follow_href, could_be_rss
import warnings

from minet.scrape.analysis import ScraperAnalysisOutputType
from minet.scrape.utils import ensure_soup
from minet.scrape.types import AnyScrapableTarget
from minet.scrape.classes.base import ScraperBase

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

DAYS_OF_WEEK_FR = [
    "lundi",
    "mardi",
    "mercredi",
    "jeudi",
    "vendredi",
    "samedi",
    "dimanche"
]
DAYS_OF_WEEK_EN = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday"
]


class NamedScraper(ScraperBase):
    name: str
@@ -192,6 +217,99 @@ def scrape(self, soup: BeautifulSoup, context=None):

        return rss_urls

def extract_date(doc_header):
    date = ""
    date_index = 0
    found_date = False
    doc_header_list = doc_header.split(" ")

    for enum, word in enumerate(doc_header_list):

        if word.lower() in DAYS_OF_WEEK_FR:
            found_date = True
            date_index = enum
            loc = locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')
        elif word.strip(",").lower() in DAYS_OF_WEEK_EN:
            found_date = True
            date_index = enum
            loc = locale.setlocale(locale.LC_ALL, 'en_US.utf8')

        if found_date:

            if enum in range(date_index, date_index + 3):
                date += word + " "

            elif enum == date_index + 3:
                date += word

                try:
                    if loc[:2] == "fr":
                        formatted_date = datetime.strptime(date, "%A %d %B %Y")
                    else:
                        formatted_date = datetime.strptime(date, "%A, %B %d, %Y")

                    return formatted_date.date().isoformat()

                except ValueError:
                    return extract_date(" ".join(doc_header_list[enum:]))
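For illustration, here is a minimal usage sketch of the helper above, assuming the fr_FR.utf8 and en_US.utf8 locales are installed on the system; the sample headers are invented, not taken from real Europresse exports:

# Hedged sketch, not part of the commit: requires both locales to be available.
extract_date("Le Monde, mercredi 14 février 2024 842 mots")
# -> '2024-02-14' (French branch, parsed with "%A %d %B %Y")
extract_date("The Guardian, Wednesday, February 14, 2024")
# -> '2024-02-14' (English branch, parsed with "%A, %B %d, %Y")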

def select_and_strip(elem, selector):
    selected_elem = elem.select_one(selector)

    if selected_elem is None:
        return ""

    return selected_elem.get_text().strip()
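A quick sketch of the fallback behaviour, using invented markup:

# Hedged sketch, not part of the commit.
snippet = BeautifulSoup("<p class='titreArticle'> Un titre </p>", "html.parser")
select_and_strip(snippet, ".titreArticle")     # -> 'Un titre'
select_and_strip(snippet, ".publiC-lblNodoc")  # -> '' (selector matches nothing)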

class EuropresseScraper(NamedScraper):
    name = "europresse"
    fieldnames = ["id", "title", "content", "url", "date", "media"]
    plural = True
    output_type = "collection"
    strainer = SoupStrainer(name="article")

    def scrape(self, soup: BeautifulSoup, context=None) -> Any:

        articles = []
        selectors = {
            "title": ".titreArticle",
            "id": ".publiC-lblNodoc",
            "date": ".DocHeader",
            "media": ".DocPublicationName"
        }

        for elem in soup.select("article"):

            row = {}

            content = elem.select_one(".docOcurrContainer")
            if content is None:
                content = ""
            else:
                urls = content.select("a")
                for u in urls:
                    if "Cet article est paru dans" in u.get_text():
                        row["url"] = u.get("href")
                        break
                content = content.get_text()

            for field, selector in selectors.items():
                row[field] = select_and_strip(elem, selector)

            row["content"] = BeautifulSoup(unescape(content), "html.parser").get_text().strip()
            row["date"] = extract_date(row["date"])
            row["media"] = row["media"]\
                .split(",")[0]\
                .split("\n")[0]\
                .split(" ")[0]\
                .strip()

            articles.append(row)

        return articles
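To see the whole pipeline end to end, here is a hedged sketch that runs the scraper on an invented Europresse-like snippet; it assumes the fr_FR.utf8 locale is available and that NamedScraper subclasses can be instantiated without arguments:

# Hedged sketch, not part of the commit; the markup and values are made up.
html = """
<article>
  <span class="publiC-lblNodoc">news-20240214-0001</span>
  <div class="DocPublicationName">Libération</div>
  <div class="DocHeader">mercredi 14 février 2024 842 mots</div>
  <p class="titreArticle">Un titre d'exemple</p>
  <div class="docOcurrContainer">Le corps de l'article.</div>
</article>
"""
soup = BeautifulSoup(html, "html.parser", parse_only=EuropresseScraper.strainer)
rows = EuropresseScraper().scrape(soup)  # assumes a no-argument constructor
# rows[0]["title"]   -> "Un titre d'exemple"
# rows[0]["date"]    -> "2024-02-14"
# rows[0]["media"]   -> "Libération"
# rows[0]["content"] -> "Le corps de l'article."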


NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
    s.name: s
@@ -202,5 +320,6 @@ def scrape(self, soup: BeautifulSoup, context=None):
        ImagesScraper,
        MetasScraper,
        RssScraper,
        EuropresseScraper
    ]
}
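With the class registered, the builtin name maps directly to the new scraper, so a lookup such as the following should hold (a small sanity-check sketch, not code from the commit):

scraper_cls = NAMED_SCRAPERS["europresse"]
assert scraper_cls is EuropresseScraper
assert scraper_cls.fieldnames == ["id", "title", "content", "url", "date", "media"]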
