Adding builtin scraper for Europresse
bmaz committed Feb 22, 2024
1 parent da98a7a commit 7a6d336
Showing 12 changed files with 324 additions and 206 deletions.
1 change: 1 addition & 0 deletions docs/cli.md
@@ -1262,6 +1262,7 @@ Builtin scrapers:
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
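The shipped examples are collapsed in this diff view. A plausible invocation of the new scraper, assuming minet's usual -i/-o conventions and a CSV column pointing at downloaded Europresse HTML files (the exact flags are an assumption, not part of this diff):

    minet scrape europresse -i europresse_pages.csv -o articles.csv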
1 change: 1 addition & 0 deletions minet/cli/scrape/__init__.py
@@ -52,6 +52,7 @@ def resolve_arguments(cli_args):
with the correct base url if --url-column is valid.
. "images": scrape all the relevant <img> tag src urls. Will join them
with the correct base url if --url-column is valid.
. "europresse": scrape the articles from europresse HTML files.
Examples:
206 changes: 0 additions & 206 deletions minet/scrape/classes/named.py

This file was deleted.

23 changes: 23 additions & 0 deletions minet/scrape/classes/named/__init__.py
@@ -0,0 +1,23 @@
from typing import Dict, Type
from .types import NamedScraper

from .canonical import CanonicalScraper
from .europresse import EuropresseScraper
from .images import ImagesScraper
from .metas import MetasScraper
from .rss import RssScraper
from .title import TitleScraper
from .urls import UrlsScraper

NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
s.name: s
for s in [
TitleScraper,
CanonicalScraper,
UrlsScraper,
ImagesScraper,
MetasScraper,
RssScraper,
EuropresseScraper,
]
}
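
The registry maps each scraper's name attribute to its class, so a caller can resolve a scraper from a plain string such as a CLI argument. A minimal sketch of that lookup (the surrounding wiring is not part of this diff, and no-argument construction is an assumption):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named import NAMED_SCRAPERS

    # Hypothetical document carrying a canonical link
    html = '<html><head><link rel="canonical" href="https://example.com/a"></head></html>'

    scraper = NAMED_SCRAPERS["canonical"]()  # assumed default-constructible
    soup = BeautifulSoup(html, "html.parser")
    print(scraper.scrape(soup))  # -> https://example.com/a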
31 changes: 31 additions & 0 deletions minet/scrape/classes/named/canonical.py
@@ -0,0 +1,31 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup

from .types import NamedScraper


class CanonicalScraper(NamedScraper):
name = "canonical"
fieldnames = ["canonical_url"]
plural = False
output_type = "scalar"
strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
link_elem = soup.select_one("link[rel=canonical][href]")

if link_elem is None:
return None

url = link_elem.get("href")

if url is None:
return None

url = cast(str, url).strip()

if not url:
return None

return url
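
Note the strainer class attribute: it lets a caller restrict BeautifulSoup to parsing only the tags the scraper actually needs, which is much faster on large pages. A sketch of that pattern, assuming this is the intended use (the calling code is not shown in this diff):

    from bs4 import BeautifulSoup

    scraper = CanonicalScraper()
    html = "<html>...</html>"  # hypothetical page source

    # Only <link rel="canonical"> elements get parsed at all
    soup = BeautifulSoup(html, "html.parser", parse_only=scraper.strainer)
    print(scraper.scrape(soup))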
74 changes: 74 additions & 0 deletions minet/scrape/classes/named/europresse.py
@@ -0,0 +1,74 @@
from typing import Any

import warnings
from datetime import datetime
from html import unescape
from bs4 import SoupStrainer, BeautifulSoup, MarkupResemblesLocatorWarning

from .types import NamedScraper


def extract_content(content):
    # Europresse embeds the article body as escaped HTML: unescape it, then
    # strip the remaining markup and surrounding whitespace. Parsing a bare
    # string makes bs4 emit a MarkupResemblesLocatorWarning, harmless here,
    # so it is silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
        return BeautifulSoup(unescape(content), "html.parser").get_text().strip()


def extract_date(doc_id):
    # Europresse document ids are "·"-separated; the second field is the
    # publication date in YYYYMMDD form.
    return datetime.strptime(doc_id.split("·")[1], "%Y%m%d").date().isoformat()


def extract_media(media):
    # Keep only the media name, dropping anything after a comma, a newline
    # or a long run of padding spaces.
    return media.split(",", 1)[0].split("\n", 1)[0].split(" " * 16, 1)[0].strip()


def select_and_strip(elem, selector):
selected_elem = elem.select_one(selector)

if selected_elem is None:
return ""

return selected_elem.get_text().strip()


class EuropresseScraper(NamedScraper):
name = "europresse"
fieldnames = ["id", "title", "content", "url", "date", "media", "media_id"]
plural = True
output_type = "collection"
strainer = SoupStrainer(name="article")

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
articles = []
selectors = {
"title": ".titreArticle",
"id": ".publiC-lblNodoc",
"media": ".DocPublicationName",
}

        for elem in soup.select("article"):
            row = {}

            # The article body lives in .docOcurrContainer. A link whose text
            # contains "Cet article est paru dans" ("this article appeared
            # in") points back to the original publication, so keep its href
            # as the article's url.
            content = elem.select_one(".docOcurrContainer")
            if content is None:
                content = ""
            else:
                urls = content.select("a")
                for u in urls:
                    if "Cet article est paru dans" in u.get_text():
                        row["url"] = u.get("href")
                        break
                content = content.get_text()

            row["content"] = extract_content(content)

            for field, selector in selectors.items():
                row[field] = select_and_strip(elem, selector)

            # The "·"-separated document id also encodes the publication date
            # (second field) and a media identifier (third field).
            row["date"] = extract_date(row["id"])
            row["media"] = extract_media(row["media"])
            row["media_id"] = row["id"].split("·")[2]

articles.append(row)

return articles
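
A minimal sketch of running this scraper over a saved Europresse export (the file path is hypothetical, and no-argument construction is assumed):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named.europresse import EuropresseScraper

    scraper = EuropresseScraper()

    with open("europresse_export.html", encoding="utf-8") as f:  # hypothetical path
        # parse_only keeps just the <article> subtrees, per the class strainer
        soup = BeautifulSoup(f.read(), "html.parser", parse_only=scraper.strainer)

    for article in scraper.scrape(soup):
        print(article["date"], article["media"], article["title"])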
42 changes: 42 additions & 0 deletions minet/scrape/classes/named/images.py
@@ -0,0 +1,42 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup
from ural import should_follow_href
from urllib.parse import urljoin

from .types import NamedScraper


class ImagesScraper(NamedScraper):
name = "images"
fieldnames = ["src"]
plural = True
output_type = "list"
strainer = SoupStrainer(name="img")

def scrape(self, soup: BeautifulSoup, context=None) -> Any:
img_elems = soup.select("img[src]")
base_url = context.get("url") if context is not None else None

urls = []

for img in img_elems:
url = img.get("src")

if url is None:
continue

url = cast(str, url).strip()

if not url:
continue

if not should_follow_href(url):
continue

if base_url:
url = urljoin(base_url, url)

urls.append(url)

return urls
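
The context argument is what resolves relative src values: when the caller supplies the page url, urljoin makes each src absolute. A short sketch (hypothetical markup and url):

    from bs4 import BeautifulSoup

    from minet.scrape.classes.named.images import ImagesScraper

    scraper = ImagesScraper()
    soup = BeautifulSoup('<img src="/logo.png">', "html.parser")

    print(scraper.scrape(soup))  # ['/logo.png']
    print(scraper.scrape(soup, context={"url": "https://example.com/page"}))
    # ['https://example.com/logo.png']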
