Move WACZ crawl logic to a separate middleware
Wesley van Lee committed on Oct 7, 2024 · 1 parent b1fcbdc · commit 3ecbcb9
Showing 3 changed files with 80 additions and 33 deletions.
import re
from typing import Iterable, Self, Union
from urllib.parse import urlparse

from scrapy import Request, Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
from smart_open import open

from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer


class WaczCrawlMiddleware:
    """Spider middleware that, when WACZ_CRAWL is enabled, replaces the
    spider's start requests with a request for every entry in the WACZ index.
    """

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        self.stats = stats
        wacz_url = settings.get("WACZ_SOURCE_URL", None)

        if not wacz_url:
            raise NotConfigured

        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
        self.crawl = settings.get("WACZ_CRAWL", False)
        self.timeout = settings.getfloat("WACZ_TIMEOUT", 60)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        assert crawler.stats
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signals.spider_opened)
        return o

    def spider_opened(self, spider: Spider) -> None:
        if not self.crawl:
            return

        tp = {"timeout": self.timeout}
        self.wacz: Union[WaczFile, MultiWaczFile]

        if len(self.wacz_urls) == 1:
            spider.logger.info(f"[WACZDownloader] Opening WACZ {self.wacz_urls[0]}")
            self.wacz = WaczFile(open(self.wacz_urls[0], "rb", transport_params=tp))
        else:
            spider.logger.info(f"[WACZDownloader] Opening WACZs {self.wacz_urls}")
            self.wacz = MultiWaczFile(
                [open(u, "rb", transport_params=tp) for u in self.wacz_urls]
            )

    def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
        if not self.crawl:
            for request in start_requests:
                yield request
            # self.wacz is only opened when WACZ_CRAWL is enabled, so stop here
            return

        # ignore the original start requests, just yield all responses found
        for entry in self.wacz.iter_index():
            url = entry["url"]

            # filter out off-site responses
            if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
                continue

            # only accept whitelisted responses if requested by the spider
            if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
                continue

            self.stats.inc_value("wacz/start_request_count", spider=spider)

            # do not filter, so that every occurrence is handled; this can be
            # necessary since we don't yet have all information for the request
            yield record_transformer.request_for_record(
                entry,
                flags=["wacz_start_request"],
                meta={"wacz_index_entry": entry},
                dont_filter=True,
            )
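To route a crawl through an archive, the middleware has to be enabled in the project settings. A minimal sketch follows; the module path scrapy_webarchive.middleware and the priority 532 are assumptions (the diff does not show the new file's path), while the setting names come straight from __init__ above.

# settings.py -- a minimal sketch; the module path and priority are
# assumptions, only the setting names are taken from the middleware above.
SPIDER_MIDDLEWARES = {
    "scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
}

# One or more WACZ archives; multiple sources are comma-separated, and
# remote URLs work because smart_open handles the transport.
WACZ_SOURCE_URL = "https://example.com/archive.wacz"

# Replace the spider's own start requests with the entries in the WACZ index.
WACZ_CRAWL = True

# Timeout in seconds, passed to smart_open as a transport parameter.
WACZ_TIMEOUT = 60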
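The two hasattr checks in process_start_requests let a spider narrow what is replayed from the archive without touching the middleware. A hypothetical spider exercising both hooks (the name, domain, and regexp are illustrative):

from scrapy import Spider


class ArchiveSpider(Spider):
    name = "archive_example"

    # index entries whose hostname is not in allowed_domains are skipped
    allowed_domains = ["example.com"]

    # only entries whose URL matches this regexp become start requests
    archive_regexp = r"https://example\.com/products/"

    def parse(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}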