From 7a6d336eaf8b3c221566922a0867befd4be65ae7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9atrice=20Mazoyer?=
Date: Thu, 22 Feb 2024 11:06:58 +0100
Subject: [PATCH] Adding builtin scraper for Europresse

---
 docs/cli.md                              |   1 +
 minet/cli/scrape/__init__.py             |   1 +
 minet/scrape/classes/named.py            | 206 -----------------------
 minet/scrape/classes/named/__init__.py   |  23 +++
 minet/scrape/classes/named/canonical.py  |  31 ++++
 minet/scrape/classes/named/europresse.py |  74 ++++++++
 minet/scrape/classes/named/images.py     |  42 +++++
 minet/scrape/classes/named/metas.py      |  30 ++++
 minet/scrape/classes/named/rss.py        |  35 ++++
 minet/scrape/classes/named/title.py      |  21 +++
 minet/scrape/classes/named/types.py      |  24 +++
 minet/scrape/classes/named/urls.py       |  42 +++++
 12 files changed, 324 insertions(+), 206 deletions(-)
 delete mode 100644 minet/scrape/classes/named.py
 create mode 100644 minet/scrape/classes/named/__init__.py
 create mode 100644 minet/scrape/classes/named/canonical.py
 create mode 100644 minet/scrape/classes/named/europresse.py
 create mode 100644 minet/scrape/classes/named/images.py
 create mode 100644 minet/scrape/classes/named/metas.py
 create mode 100644 minet/scrape/classes/named/rss.py
 create mode 100644 minet/scrape/classes/named/title.py
 create mode 100644 minet/scrape/classes/named/types.py
 create mode 100644 minet/scrape/classes/named/urls.py

diff --git a/docs/cli.md b/docs/cli.md
index 79f8a2796d..637ab09e34 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -1262,6 +1262,7 @@ Builtin scrapers:
   with the correct base url if --url-column is valid.
 . "images": scrape all the relevant <img> tag src urls. Will join them
   with the correct base url if --url-column is valid.
+. "europresse": scrape the articles from europresse HTML files.
 
 Examples:
 
diff --git a/minet/cli/scrape/__init__.py b/minet/cli/scrape/__init__.py
index 0751d91eb9..bf835c32b8 100644
--- a/minet/cli/scrape/__init__.py
+++ b/minet/cli/scrape/__init__.py
@@ -52,6 +52,7 @@ def resolve_arguments(cli_args):
       with the correct base url if --url-column is valid.
     . "images": scrape all the relevant <img> tag src urls. Will join them
       with the correct base url if --url-column is valid.
+    . "europresse": scrape the articles from europresse HTML files.
 
     Examples:
 
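For reviewers wanting to try this outside the CLI: every builtin scraper is also a callable class, since NamedScraper.__call__ feeds the raw target through ensure_soup before delegating to scrape. A minimal sketch of driving the new scraper from Python (the file name is hypothetical, and it assumes ensure_soup accepts a raw HTML string, as the AnyScrapableTarget annotation suggests):

    from minet.scrape.classes.named import EuropresseScraper

    scraper = EuropresseScraper()

    # One dict per <article> element found in the Europresse export
    with open("europresse_export.html", encoding="utf-8") as f:
        articles = scraper(f.read())

    for article in articles:
        print(article["id"], article["date"], article["media"])
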
diff --git a/minet/scrape/classes/named.py b/minet/scrape/classes/named.py
deleted file mode 100644
index a6f6d6fe88..0000000000
--- a/minet/scrape/classes/named.py
+++ /dev/null
@@ -1,206 +0,0 @@
-from typing import Optional, List, Any, Dict, Type, cast
-
-from bs4 import SoupStrainer, BeautifulSoup
-from urllib.parse import urljoin
-from ural import should_follow_href, could_be_rss
-
-from minet.scrape.analysis import ScraperAnalysisOutputType
-from minet.scrape.utils import ensure_soup
-from minet.scrape.types import AnyScrapableTarget
-from minet.scrape.classes.base import ScraperBase
-
-
-class NamedScraper(ScraperBase):
-    name: str
-    fieldnames: List[str]
-    plural: bool
-    tabular = True
-    output_type: ScraperAnalysisOutputType
-    strainer: Optional[SoupStrainer]
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        raise NotImplementedError
-
-    def __call__(self, html: AnyScrapableTarget, context=None) -> Any:
-        soup = ensure_soup(html, strainer=self.strainer)
-        return self.scrape(soup, context=context)
-
-
-class TitleScraper(NamedScraper):
-    name = "title"
-    fieldnames = ["title"]
-    plural = False
-    output_type = "scalar"
-    strainer = SoupStrainer(name="title")
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        title_elem = soup.find(name="title")
-
-        if title_elem is None:
-            return None
-
-        return title_elem.get_text().strip()
-
-
-class CanonicalScraper(NamedScraper):
-    name = "canonical"
-    fieldnames = ["canonical_url"]
-    plural = False
-    output_type = "scalar"
-    strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        link_elem = soup.select_one("link[rel=canonical][href]")
-
-        if link_elem is None:
-            return None
-
-        url = link_elem.get("href")
-
-        if url is None:
-            return None
-
-        url = cast(str, url).strip()
-
-        if not url:
-            return None
-
-        return url
-
-
-class UrlsScraper(NamedScraper):
-    name = "urls"
-    fieldnames = ["url"]
-    plural = True
-    output_type = "list"
-    strainer = SoupStrainer(name="a")
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        a_elems = soup.select("a[href]")
-        base_url = context.get("url") if context is not None else None
-
-        urls = []
-
-        for a in a_elems:
-            url = a.get("href")
-
-            if url is None:
-                continue
-
-            url = cast(str, url).strip()
-
-            if not url:
-                continue
-
-            if not should_follow_href(url):
-                continue
-
-            if base_url:
-                url = urljoin(base_url, url)
-
-            urls.append(url)
-
-        return urls
-
-
-class ImagesScraper(NamedScraper):
-    name = "images"
-    fieldnames = ["src"]
-    plural = True
-    output_type = "list"
-    strainer = SoupStrainer(name="img")
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        img_elems = soup.select("img[src]")
-        base_url = context.get("url") if context is not None else None
-
-        urls = []
-
-        for img in img_elems:
-            url = img.get("src")
-
-            if url is None:
-                continue
-
-            url = cast(str, url).strip()
-
-            if not url:
-                continue
-
-            if not should_follow_href(url):
-                continue
-
-            if base_url:
-                url = urljoin(base_url, url)
-
-            urls.append(url)
-
-        return urls
-
-
-class MetasScraper(NamedScraper):
-    name = "metas"
-    fieldnames = [
-        "name",
-        "property",
-        "http-equiv",
-        "itemprop",
-        "content",
-        "charset",
-    ]
-    plural = True
-    output_type = "collection"
-    strainer = SoupStrainer(name="meta")
-
-    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
-        meta_elems = soup.find_all(name="meta")
-
-        metas = []
-
-        for meta_elem in meta_elems:
-            metas.append({name: meta_elem.get(name) for name in self.fieldnames})
-
-        return metas
-
-
-class RssScraper(NamedScraper):
-    name = "rss"
-    fieldnames = ["rss_url"]
-    plural = True
-    output_type = "list"
-    strainer = SoupStrainer(name=["a", "link"])
-
-    def scrape(self, soup: BeautifulSoup, context=None):
-        rss_urls = []
-        base_url = context.get("url") if context is not None else ""
-
-        for link in soup.find_all():
-            if link.name == "link":
-                type_attr = link.attrs.get("type", None)
-                if (
-                    type_attr == "application/rss+xml"
-                    or type_attr == "application/atom+xml"
-                ):
-                    href = link.attrs.get("href", None)
-                    if href:
-                        rss_urls.append(urljoin(base_url, href))
-            else:
-                href = link.attrs.get("href", None)
-                url = urljoin(base_url, href)
-                if could_be_rss(url):
-                    rss_urls.append(url)
-
-        return rss_urls
-
-
-NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
-    s.name: s
-    for s in [
-        TitleScraper,
-        CanonicalScraper,
-        UrlsScraper,
-        ImagesScraper,
-        MetasScraper,
-        RssScraper,
-    ]
-}
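The deletion above is organizational: each class moves verbatim into its own module under minet/scrape/classes/named/, and the NAMED_SCRAPERS registry keeps the same name-to-class mapping, so name-based lookup is unaffected. A sketch of the lookup pattern, where html stands in for any fetched document:

    from minet.scrape.classes.named import NAMED_SCRAPERS

    # "europresse" is the only new key; the rest are unchanged
    scraper = NAMED_SCRAPERS["europresse"]()
    articles = scraper(html)
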
diff --git a/minet/scrape/classes/named/__init__.py b/minet/scrape/classes/named/__init__.py
new file mode 100644
index 0000000000..f1515aa8c2
--- /dev/null
+++ b/minet/scrape/classes/named/__init__.py
@@ -0,0 +1,23 @@
+from typing import Dict, Type
+from .types import NamedScraper
+
+from .canonical import CanonicalScraper
+from .europresse import EuropresseScraper
+from .images import ImagesScraper
+from .metas import MetasScraper
+from .rss import RssScraper
+from .title import TitleScraper
+from .urls import UrlsScraper
+
+NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = {
+    s.name: s
+    for s in [
+        TitleScraper,
+        CanonicalScraper,
+        UrlsScraper,
+        ImagesScraper,
+        MetasScraper,
+        RssScraper,
+        EuropresseScraper,
+    ]
+}
diff --git a/minet/scrape/classes/named/canonical.py b/minet/scrape/classes/named/canonical.py
new file mode 100644
index 0000000000..a580179786
--- /dev/null
+++ b/minet/scrape/classes/named/canonical.py
@@ -0,0 +1,31 @@
+from typing import Any, cast
+
+from bs4 import SoupStrainer, BeautifulSoup
+
+from .types import NamedScraper
+
+
+class CanonicalScraper(NamedScraper):
+    name = "canonical"
+    fieldnames = ["canonical_url"]
+    plural = False
+    output_type = "scalar"
+    strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        link_elem = soup.select_one("link[rel=canonical][href]")
+
+        if link_elem is None:
+            return None
+
+        url = link_elem.get("href")
+
+        if url is None:
+            return None
+
+        url = cast(str, url).strip()
+
+        if not url:
+            return None
+
+        return url
diff --git a/minet/scrape/classes/named/europresse.py b/minet/scrape/classes/named/europresse.py
new file mode 100644
index 0000000000..500ece9b3f
--- /dev/null
+++ b/minet/scrape/classes/named/europresse.py
@@ -0,0 +1,74 @@
+from typing import Any
+
+import warnings
+from datetime import datetime
+from html import unescape
+from bs4 import SoupStrainer, BeautifulSoup, MarkupResemblesLocatorWarning
+
+from .types import NamedScraper
+
+
+def extract_content(content):
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
+        return BeautifulSoup(unescape(content), "html.parser").get_text().strip()
+
+
+def extract_date(doc_id):
+    return datetime.strptime(doc_id.split("·")[1], "%Y%m%d").date().isoformat()
+
+
+def extract_media(media):
+    return media.split(",", 1)[0].split("\n", 1)[0].split(" " * 16, 1)[0].strip()
+
+
+def select_and_strip(elem, selector):
+    selected_elem = elem.select_one(selector)
+
+    if selected_elem is None:
+        return ""
+
+    return selected_elem.get_text().strip()
+
+
+class EuropresseScraper(NamedScraper):
+    name = "europresse"
+    fieldnames = ["id", "title", "content", "url", "date", "media", "media_id"]
+    plural = True
+    output_type = "collection"
+    strainer = SoupStrainer(name="article")
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        articles = []
+        selectors = {
+            "title": ".titreArticle",
+            "id": ".publiC-lblNodoc",
+            "media": ".DocPublicationName",
+        }
+
+        for elem in soup.select("article"):
+            row = {}
+
+            content = elem.select_one(".docOcurrContainer")
+            if content is None:
+                content = ""
+            else:
+                urls = content.select("a")
+                for u in urls:
+                    if "Cet article est paru dans" in u.get_text():
+                        row["url"] = u.get("href")
+                        break
+                content = content.get_text()
+
+            row["content"] = extract_content(content)
+
+            for field, selector in selectors.items():
+                row[field] = select_and_strip(elem, selector)
+
+            row["date"] = extract_date(row["id"])
+            row["media"] = extract_media(row["media"])
+            row["media_id"] = row["id"].split("·")[2]
+
+            articles.append(row)
+
+        return articles
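The helpers above encode the shape of Europresse exports: document ids appear to be middle-dot-delimited, with the publication date (YYYYMMDD) in the second slot and a media id in the third, which is what extract_date and the media_id field rely on; and extract_content re-parses the cell because the article body seems to be stored as escaped HTML, hence the unescape call and the silenced MarkupResemblesLocatorWarning. Under that id assumption (the value below is invented):

    from datetime import datetime

    doc_id = "news·20240222·X1"  # hypothetical Europresse document id
    date = datetime.strptime(doc_id.split("·")[1], "%Y%m%d").date().isoformat()
    assert date == "2024-02-22"
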
diff --git a/minet/scrape/classes/named/images.py b/minet/scrape/classes/named/images.py
new file mode 100644
index 0000000000..821f6247a4
--- /dev/null
+++ b/minet/scrape/classes/named/images.py
@@ -0,0 +1,42 @@
+from typing import Any, cast
+
+from bs4 import SoupStrainer, BeautifulSoup
+from ural import should_follow_href
+from urllib.parse import urljoin
+
+from .types import NamedScraper
+
+
+class ImagesScraper(NamedScraper):
+    name = "images"
+    fieldnames = ["src"]
+    plural = True
+    output_type = "list"
+    strainer = SoupStrainer(name="img")
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        img_elems = soup.select("img[src]")
+        base_url = context.get("url") if context is not None else None
+
+        urls = []
+
+        for img in img_elems:
+            url = img.get("src")
+
+            if url is None:
+                continue
+
+            url = cast(str, url).strip()
+
+            if not url:
+                continue
+
+            if not should_follow_href(url):
+                continue
+
+            if base_url:
+                url = urljoin(base_url, url)
+
+            urls.append(url)
+
+        return urls
diff --git a/minet/scrape/classes/named/metas.py b/minet/scrape/classes/named/metas.py
new file mode 100644
index 0000000000..8083fe6d3e
--- /dev/null
+++ b/minet/scrape/classes/named/metas.py
@@ -0,0 +1,30 @@
+from typing import Any
+
+from bs4 import SoupStrainer, BeautifulSoup
+
+from .types import NamedScraper
+
+
+class MetasScraper(NamedScraper):
+    name = "metas"
+    fieldnames = [
+        "name",
+        "property",
+        "http-equiv",
+        "itemprop",
+        "content",
+        "charset",
+    ]
+    plural = True
+    output_type = "collection"
+    strainer = SoupStrainer(name="meta")
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        meta_elems = soup.find_all(name="meta")
+
+        metas = []
+
+        for meta_elem in meta_elems:
+            metas.append({name: meta_elem.get(name) for name in self.fieldnames})
+
+        return metas
diff --git a/minet/scrape/classes/named/rss.py b/minet/scrape/classes/named/rss.py
new file mode 100644
index 0000000000..d8bedf5562
--- /dev/null
+++ b/minet/scrape/classes/named/rss.py
@@ -0,0 +1,35 @@
+from urllib.parse import urljoin
+from ural import could_be_rss
+from bs4 import SoupStrainer, BeautifulSoup
+
+from .types import NamedScraper
+
+
+class RssScraper(NamedScraper):
+    name = "rss"
+    fieldnames = ["rss_url"]
+    plural = True
+    output_type = "list"
+    strainer = SoupStrainer(name=["a", "link"])
+
+    def scrape(self, soup: BeautifulSoup, context=None):
+        rss_urls = []
+        base_url = context.get("url") if context is not None else ""
+
+        for link in soup.find_all():
+            if link.name == "link":
+                type_attr = link.attrs.get("type", None)
+                if (
+                    type_attr == "application/rss+xml"
+                    or type_attr == "application/atom+xml"
+                ):
+                    href = link.attrs.get("href", None)
+                    if href:
+                        rss_urls.append(urljoin(base_url, href))
+            else:
+                href = link.attrs.get("href", None)
+                url = urljoin(base_url, href)
+                if could_be_rss(url):
+                    rss_urls.append(url)
+
+        return rss_urls
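One behavior worth flagging in RssScraper: <link> elements are kept only when they explicitly declare an RSS or Atom MIME type, while plain <a> hrefs fall back to ural.could_be_rss, a URL-shape heuristic. An illustrative sketch of that heuristic (expected results, not verified here):

    from ural import could_be_rss

    could_be_rss("https://example.com/feed.xml")  # expected: True
    could_be_rss("https://example.com/about")     # expected: False
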
diff --git a/minet/scrape/classes/named/title.py b/minet/scrape/classes/named/title.py
new file mode 100644
index 0000000000..6de22ee9b2
--- /dev/null
+++ b/minet/scrape/classes/named/title.py
@@ -0,0 +1,21 @@
+from typing import Any
+
+from bs4 import SoupStrainer, BeautifulSoup
+
+from .types import NamedScraper
+
+
+class TitleScraper(NamedScraper):
+    name = "title"
+    fieldnames = ["title"]
+    plural = False
+    output_type = "scalar"
+    strainer = SoupStrainer(name="title")
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        title_elem = soup.find(name="title")
+
+        if title_elem is None:
+            return None
+
+        return title_elem.get_text().strip()
diff --git a/minet/scrape/classes/named/types.py b/minet/scrape/classes/named/types.py
new file mode 100644
index 0000000000..59aa9cede1
--- /dev/null
+++ b/minet/scrape/classes/named/types.py
@@ -0,0 +1,24 @@
+from typing import Optional, List, Any
+
+from bs4 import SoupStrainer, BeautifulSoup
+
+from minet.scrape.analysis import ScraperAnalysisOutputType
+from minet.scrape.utils import ensure_soup
+from minet.scrape.types import AnyScrapableTarget
+from minet.scrape.classes.base import ScraperBase
+
+
+class NamedScraper(ScraperBase):
+    name: str
+    fieldnames: List[str]
+    plural: bool
+    tabular = True
+    output_type: ScraperAnalysisOutputType
+    strainer: Optional[SoupStrainer]
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        raise NotImplementedError
+
+    def __call__(self, html: AnyScrapableTarget, context=None) -> Any:
+        soup = ensure_soup(html, strainer=self.strainer)
+        return self.scrape(soup, context=context)
diff --git a/minet/scrape/classes/named/urls.py b/minet/scrape/classes/named/urls.py
new file mode 100644
index 0000000000..aa0b036adc
--- /dev/null
+++ b/minet/scrape/classes/named/urls.py
@@ -0,0 +1,42 @@
+from typing import Any, cast
+
+from urllib.parse import urljoin
+from bs4 import SoupStrainer, BeautifulSoup
+from ural import should_follow_href
+
+from .types import NamedScraper
+
+
+class UrlsScraper(NamedScraper):
+    name = "urls"
+    fieldnames = ["url"]
+    plural = True
+    output_type = "list"
+    strainer = SoupStrainer(name="a")
+
+    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
+        a_elems = soup.select("a[href]")
+        base_url = context.get("url") if context is not None else None
+
+        urls = []
+
+        for a in a_elems:
+            url = a.get("href")
+
+            if url is None:
+                continue
+
+            url = cast(str, url).strip()
+
+            if not url:
+                continue
+
+            if not should_follow_href(url):
+                continue
+
+            if base_url:
+                url = urljoin(base_url, url)
+
+            urls.append(url)
+
+        return urls
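With the NamedScraper interface isolated in types.py, adding the next builtin scraper should only take a small module like the ones above plus one entry in NAMED_SCRAPERS. A hypothetical example following the same conventions (not part of this patch):

    from typing import Any

    from bs4 import SoupStrainer, BeautifulSoup

    from minet.scrape.classes.named.types import NamedScraper


    class HeadingsScraper(NamedScraper):
        name = "headings"
        fieldnames = ["heading"]
        plural = True
        output_type = "list"
        strainer = SoupStrainer(name="h1")

        def scrape(self, soup: BeautifulSoup, context=None) -> Any:
            # One entry per <h1>, stripped like the title scraper does
            return [h.get_text().strip() for h in soup.find_all(name="h1")]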