Adding builtin scraper for Europresse
bmaz committed Feb 20, 2024
1 parent da98a7a commit cf5be7d
Showing 3 changed files with 122 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/cli.md
@@ -1262,6 +1262,7 @@ Builtin scrapers:
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
1 change: 1 addition & 0 deletions minet/cli/scrape/__init__.py
@@ -52,6 +52,7 @@ def resolve_arguments(cli_args):
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
121 changes: 120 additions & 1 deletion minet/scrape/classes/named.py
@@ -1,14 +1,39 @@
from typing import Optional, List, Any, Dict, Type, cast

from bs4 import SoupStrainer, BeautifulSoup
from bs4 import SoupStrainer, BeautifulSoup, MarkupResemblesLocatorWarning
from datetime import datetime
from html import unescape
import locale
from urllib.parse import urljoin
from ural import should_follow_href, could_be_rss
import warnings

from minet.scrape.analysis import ScraperAnalysisOutputType
from minet.scrape.utils import ensure_soup
from minet.scrape.types import AnyScrapableTarget
from minet.scrape.classes.base import ScraperBase

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

DAYS_OF_WEEK_FR = [
    "lundi",
    "mardi",
    "mercredi",
    "jeudi",
    "vendredi",
    "samedi",
    "dimanche"
]
DAYS_OF_WEEK_EN = [
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday"
]


class NamedScraper(ScraperBase):
    name: str
@@ -192,6 +217,99 @@ def scrape(self, soup: BeautifulSoup, context=None):

        return rss_urls

def extract_date(doc_header):
    date = ""
    date_index = 0
    found_date = False
    doc_header_list = doc_header.split(" ")

    for enum, word in enumerate(doc_header_list):

        if word.lower() in DAYS_OF_WEEK_FR:
            found_date = True
            date_index = enum
            loc = locale.setlocale(locale.LC_ALL, 'fr_FR.utf8')
        elif word.strip(",").lower() in DAYS_OF_WEEK_EN:
            found_date = True
            date_index = enum
            loc = locale.setlocale(locale.LC_ALL, 'en_US.utf8')

        if found_date:

            if enum in range(date_index, date_index + 3):
                date += word + " "

            elif enum == date_index + 3:
                date += word

                try:
                    if loc[:2] == "fr":
                        formatted_date = datetime.strptime(date, "%A %d %B %Y")
                    else:
                        formatted_date = datetime.strptime(date, "%A, %B %d, %Y")

                    return formatted_date.date().isoformat()

                except ValueError:
                    return extract_date(" ".join(doc_header_list[enum:]))
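For illustration, here is a minimal usage sketch of the helper above, assuming the fr_FR.utf8 and en_US.utf8 locales are installed on the system; the sample headers are invented, not taken from real Europresse exports:

# Hedged sketch, not part of the commit: requires both locales to be available.
extract_date("Le Monde, mercredi 14 février 2024 842 mots")
# -> '2024-02-14' (French branch, parsed with "%A %d %B %Y")
extract_date("The Guardian, Wednesday, February 14, 2024")
# -> '2024-02-14' (English branch, parsed with "%A, %B %d, %Y")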

def select_and_strip(elem, selector):
    selected_elem = elem.select_one(selector)

    if selected_elem is None:
        return ""

    return selected_elem.get_text().strip()
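A quick sketch of the fallback behaviour, using invented markup:

# Hedged sketch, not part of the commit.
snippet = BeautifulSoup("<p class='titreArticle'> Un titre </p>", "html.parser")
select_and_strip(snippet, ".titreArticle")     # -> 'Un titre'
select_and_strip(snippet, ".publiC-lblNodoc")  # -> '' (selector matches nothing)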

class EuropresseScraper(NamedScraper):
    name = "europresse"
    fieldnames = ["id", "title", "content", "url", "date", "media"]
    plural = True
    output_type = "collection"
    strainer = SoupStrainer(name="article")

    def scrape(self, soup: BeautifulSoup, context=None) -> Any:

        articles = []
        selectors = {
            "title": ".titreArticle",
            "id": ".publiC-lblNodoc",
            "date": ".DocHeader",
            "media": ".DocPublicationName"
        }

        for elem in soup.select("article"):

            row = {}

            content = elem.select_one(".docOcurrContainer")
            if content is None:
                content = ""
            else:
                urls = content.select("a")
                for u in urls:
                    if "Cet article est paru dans" in u.get_text():
                        row["url"] = u.get("href")
                        break
                content = content.get_text()

            for field, selector in selectors.items():
                row[field] = select_and_strip(elem, selector)

            row["content"] = BeautifulSoup(unescape(content), "html.parser").get_text().strip()
            row["date"] = extract_date(row["date"])
            row["media"] = row["media"]\
                .split(",")[0]\
                .split("\n")[0]\
                .split(" ")[0]\
                .strip()

            articles.append(row)

        return articles
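To see the whole pipeline end to end, here is a hedged sketch that runs the scraper on an invented Europresse-like snippet; it assumes the fr_FR.utf8 locale is available and that NamedScraper subclasses can be instantiated without arguments:

# Hedged sketch, not part of the commit; the markup and values are made up.
html = """
<article>
  <span class="publiC-lblNodoc">news-20240214-0001</span>
  <div class="DocPublicationName">Libération</div>
  <div class="DocHeader">mercredi 14 février 2024 842 mots</div>
  <p class="titreArticle">Un titre d'exemple</p>
  <div class="docOcurrContainer">Le corps de l'article.</div>
</article>
"""
soup = BeautifulSoup(html, "html.parser", parse_only=EuropresseScraper.strainer)
rows = EuropresseScraper().scrape(soup)  # assumes a no-argument constructor
# rows[0]["title"]   -> "Un titre d'exemple"
# rows[0]["date"]    -> "2024-02-14"
# rows[0]["media"]   -> "Libération"
# rows[0]["content"] -> "Le corps de l'article."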


NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
    s.name: s
@@ -202,5 +320,6 @@ def scrape(self, soup: BeautifulSoup, context=None):
        ImagesScraper,
        MetasScraper,
        RssScraper,
        EuropresseScraper
    ]
}
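With the class registered, the builtin name maps directly to the new scraper, so a lookup such as the following should hold (a small sanity-check sketch, not code from the commit):

scraper_cls = NAMED_SCRAPERS["europresse"]
assert scraper_cls is EuropresseScraper
assert scraper_cls.fieldnames == ["id", "title", "content", "url", "date", "media"]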
