Skip to content

Commit

Permalink
Splitting named scrapers into their own files
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Feb 21, 2024
1 parent cf5be7d commit 110b64c
Show file tree
Hide file tree
Showing 10 changed files with 373 additions and 325 deletions.
325 changes: 0 additions & 325 deletions minet/scrape/classes/named.py

This file was deleted.

23 changes: 23 additions & 0 deletions minet/scrape/classes/named/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import Dict, Type
from .types import NamedScraper

from .canonical import CanonicalScraper
from .europresse import EuropresseScraper
from .images import ImagesScraper
from .metas import MetasScraper
from .rss import RssScraper
from .title import TitleScraper
from .urls import UrlsScraper

# Registry of all built-in named scrapers, keyed by each class' `name`
# attribute so they can be looked up from scraper definitions.
NAMED_SCRAPERS: Dict[str, Type[NamedScraper]] = dict(
    (scraper_class.name, scraper_class)
    for scraper_class in (
        TitleScraper,
        CanonicalScraper,
        UrlsScraper,
        ImagesScraper,
        MetasScraper,
        RssScraper,
        EuropresseScraper,
    )
)
31 changes: 31 additions & 0 deletions minet/scrape/classes/named/canonical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Any, cast

from bs4 import SoupStrainer, BeautifulSoup

from .types import NamedScraper


class CanonicalScraper(NamedScraper):
    """Named scraper extracting a page's canonical url from its
    `<link rel="canonical">` tag, if any.
    """

    name = "canonical"
    fieldnames = ["canonical_url"]
    plural = False
    output_type = "scalar"
    # Restrict parsing to the relevant <link> tags only
    strainer = SoupStrainer(name="link", attrs={"rel": "canonical"})

    def scrape(self, soup: BeautifulSoup, context=None) -> Any:
        # Only match tags actually carrying a href attribute
        link = soup.select_one("link[rel=canonical][href]")

        if link is not None:
            href = link.get("href")

            if href is not None:
                href = cast(str, href).strip()

                # An empty href (after stripping) is treated as absent
                if href:
                    return href

        return None
Loading

0 comments on commit 110b64c

Please sign in to comment.