diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py
index c199b25..960d1bd 100644
--- a/scrapy_webarchive/downloadermiddlewares.py
+++ b/scrapy_webarchive/downloadermiddlewares.py
@@ -1,6 +1,5 @@
 import re
-from typing import Iterable, Union
-from urllib.parse import urlparse
+from typing import Union
 
 from scrapy import signals
 from scrapy.crawler import Crawler
@@ -56,35 +55,6 @@ def spider_opened(self, spider: Spider) -> None:
                 [open(u, "rb", transport_params=tp) for u in self.wacz_urls]
             )
 
-    def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
-        if not self.crawl:
-            for request in start_requests:
-                yield request
-
-        if self.crawl:
-            # ignore original start requests, just yield all responses found
-            for entry in self.wacz.iter_index():
-                url = entry["url"]
-
-                # filter out off-site responses
-                if urlparse(url).hostname not in spider.allowed_domains:
-                    continue
-
-                # only accept whitelisted responses if requested by spider
-                if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
-                    continue
-
-                self.stats.inc_value("wacz/start_request_count", spider=spider)
-
-                # do not filter to allow all occurences to be handled
-                # since we don't yet get all information for the request, this can be necessary
-                yield record_transformer.request_for_record(
-                    entry,
-                    flags=["wacz_start_request"],
-                    meta={"wacz_index_entry": entry},
-                    dont_filter=True,
-                )
-
     def process_request(self, request: Request, spider: Spider):
         # ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
         if hasattr(spider, "archive_blacklist_regexp") and re.search(
diff --git a/scrapy_webarchive/middleware.py b/scrapy_webarchive/middleware.py
new file mode 100644
index 0000000..67e8b86
--- /dev/null
+++ b/scrapy_webarchive/middleware.py
@@ -0,0 +1,78 @@
+import re
+from typing import Iterable, Self, Union
+from urllib.parse import urlparse
+
+from scrapy import Request, Spider, signals
+from scrapy.crawler import Crawler
+from scrapy.exceptions import NotConfigured
+from scrapy.settings import Settings
+from scrapy.statscollectors import StatsCollector
+from smart_open import open
+
+from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
+from scrapy_webarchive.warc import record_transformer
+
+
+class WaczCrawlMiddleware:
+    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
+        self.stats = stats
+        wacz_url = settings.get("WACZ_SOURCE_URL", None)
+
+        if not wacz_url:
+            raise NotConfigured
+
+        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
+        self.crawl = settings.get("WACZ_CRAWL", False)
+        self.timeout = settings.getfloat("WACZ_TIMEOUT", 60)
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler) -> Self:
+        assert crawler.stats
+        o = cls(crawler.settings, crawler.stats)
+        crawler.signals.connect(o.spider_opened, signals.spider_opened)
+        return o
+
+    def spider_opened(self, spider: Spider) -> None:
+        if not self.crawl:
+            return
+
+        tp = {"timeout": self.timeout}
+        self.wacz: Union[WaczFile, MultiWaczFile]
+
+        if len(self.wacz_urls) == 1:
+            spider.logger.info(f"[WACZDownloader] Opening WACZ {self.wacz_urls[0]}")
+            self.wacz = WaczFile(open(self.wacz_urls[0], "rb", transport_params=tp))
+        else:
+            spider.logger.info(f"[WACZDownloader] Opening WACZs {self.wacz_urls}")
+            self.wacz = MultiWaczFile(
+                [open(u, "rb", transport_params=tp) for u in self.wacz_urls]
+            )
+
+    def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
+        if not self.crawl:
+            for request in start_requests:
+                yield request
+            return
+
+        # ignore original start requests, just yield all responses found
+        for entry in self.wacz.iter_index():
+            url = entry["url"]
+
+            # filter out off-site responses
+            if hasattr(spider, 'allowed_domains') and urlparse(url).hostname not in spider.allowed_domains:
+                continue
+
+            # only accept whitelisted responses if requested by spider
+            if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
+                continue
+
+            self.stats.inc_value("wacz/start_request_count", spider=spider)
+
+            # do not filter to allow all occurrences to be handled
+            # since we don't yet get all information for the request, this can be necessary
+            yield record_transformer.request_for_record(
+                entry,
+                flags=["wacz_start_request"],
+                meta={"wacz_index_entry": entry},
+                dont_filter=True,
+            )
diff --git a/scrapy_webarchive/warc.py b/scrapy_webarchive/warc.py
index 74e2305..ca2e825 100644
--- a/scrapy_webarchive/warc.py
+++ b/scrapy_webarchive/warc.py
@@ -1,8 +1,8 @@
 import socket
-from typing import List, Tuple
 import uuid
 from datetime import datetime, timezone
 from io import BytesIO
+from typing import List, Tuple
 from urllib.parse import urlparse
 
 from scrapy import __version__ as scrapy_version
@@ -162,7 +162,7 @@ class WarcRecordTransformer:
 
     response_types = ResponseTypes()
 
-    def request_for_record(self, record: WARCRecord, **kwargs):
+    def request_for_record(self, record: dict, **kwargs):
         # TODO: locate request in WACZ and include all relevant things (like headers)
         return Request(url=record["url"], method=record.get("method", "GET"), **kwargs)
 
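
Usage sketch (not part of the diff): because WaczCrawlMiddleware implements process_start_requests, it is a spider middleware and would be registered under SPIDER_MIDDLEWARES, while the request/response handling left in downloadermiddlewares.py remains a downloader middleware. The snippet below uses only setting names that appear in this diff (WACZ_SOURCE_URL, WACZ_CRAWL, WACZ_TIMEOUT); the priority value and the WACZ location are illustrative placeholders, not values taken from this change.

    # settings.py sketch: register the relocated crawl middleware as a spider
    # middleware. The class path and setting names come from the diff above;
    # the priority (532) and the WACZ URL below are illustrative placeholders.
    SPIDER_MIDDLEWARES = {
        "scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
    }

    # One or more WACZ archives; the middleware splits this value on commas,
    # so several locations can be supplied in a single string.
    WACZ_SOURCE_URL = "s3://example-bucket/example-crawl.wacz"

    # When true, original start requests are replaced by requests rebuilt
    # from the WACZ index (see process_start_requests above).
    WACZ_CRAWL = True

    # Timeout forwarded to smart_open via transport_params when opening archives.
    WACZ_TIMEOUT = 60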