Move WACZ crawl logic to a separate middleware
Wesley van Lee committed Oct 7, 2024
1 parent b1fcbdc commit 3ecbcb9
Showing 3 changed files with 80 additions and 33 deletions.
32 changes: 1 addition & 31 deletions scrapy_webarchive/downloadermiddlewares.py
@@ -1,6 +1,5 @@
import re
from typing import Iterable, Union
from urllib.parse import urlparse
from typing import Union

from scrapy import signals
from scrapy.crawler import Crawler
@@ -56,35 +55,6 @@ def spider_opened(self, spider: Spider) -> None:
[open(u, "rb", transport_params=tp) for u in self.wacz_urls]
)

def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
if not self.crawl:
for request in start_requests:
yield request

if self.crawl:
# ignore original start requests, just yield all responses found
for entry in self.wacz.iter_index():
url = entry["url"]

# filter out off-site responses
if urlparse(url).hostname not in spider.allowed_domains:
continue

# only accept whitelisted responses if requested by spider
if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
continue

self.stats.inc_value("wacz/start_request_count", spider=spider)

# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"wacz_index_entry": entry},
dont_filter=True,
)

def process_request(self, request: Request, spider: Spider):
# ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if hasattr(spider, "archive_blacklist_regexp") and re.search(
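Both middlewares key off optional spider attributes: allowed_domains to drop off-site index entries, archive_regexp as a whitelist for start requests, and archive_blacklist_regexp (checked in process_request above) as a blacklist. A minimal sketch of a spider that opts into all three checks; the spider name, domain, and patterns are illustrative assumptions, not part of this commit:

from scrapy import Spider


class ArchiveSpider(Spider):
    # All names and patterns below are hypothetical examples.
    name = "archive_example"

    # The crawl middleware skips index entries whose hostname is not listed here.
    allowed_domains = ["example.com"]

    # Only archived URLs matching this pattern are turned into start requests.
    archive_regexp = r"https://example\.com/articles/"

    # Requests matching this pattern are dropped in process_request.
    archive_blacklist_regexp = r"/login|/logout"

    def parse(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}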
77 changes: 77 additions & 0 deletions scrapy_webarchive/middleware.py
@@ -0,0 +1,77 @@
import re
from typing import Iterable, Self, Union
from urllib.parse import urlparse

from scrapy import Request, Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
from smart_open import open

from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer


class WaczCrawlMiddleware:
def __init__(self, settings: Settings, stats: StatsCollector) -> None:
self.stats = stats
wacz_url = settings.get("WACZ_SOURCE_URL", None)

if not wacz_url:
raise NotConfigured

self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
self.crawl = settings.get("WACZ_CRAWL", False)
self.timeout = settings.getfloat("WACZ_TIMEOUT", 60)

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
assert crawler.stats
o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signals.spider_opened)
return o

def spider_opened(self, spider: Spider) -> None:
if not self.crawl:
return

tp = {"timeout": self.timeout}
self.wacz: Union[WaczFile, MultiWaczFile]

if len(self.wacz_urls) == 1:
spider.logger.info(f"[WACZDownloader] Opening WACZ {self.wacz_urls[0]}")
self.wacz = WaczFile(open(self.wacz_urls[0], "rb", transport_params=tp))
else:
spider.logger.info(f"[WACZDownloader] Opening WACZs {self.wacz_urls}")
self.wacz = MultiWaczFile(
[open(u, "rb", transport_params=tp) for u in self.wacz_urls]
)

def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
if not self.crawl:
for request in start_requests:
yield request

# ignore original start requests, just yield all responses found
for entry in self.wacz.iter_index():
url = entry["url"]

# filter out off-site responses
if hasattr(spider, 'allowed_domains') and urlparse(url).hostname not in spider.allowed_domains:
continue

# only accept whitelisted responses if requested by spider
if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
continue

self.stats.inc_value("wacz/start_request_count", spider=spider)

# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"wacz_index_entry": entry},
dont_filter=True,
)
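
Since process_start_requests is a spider-middleware hook, the new WaczCrawlMiddleware belongs under SPIDER_MIDDLEWARES rather than DOWNLOADER_MIDDLEWARES. A sketch of the settings it reads; the WACZ_SOURCE_URL, WACZ_CRAWL, and WACZ_TIMEOUT keys come straight from the constructor above, while the middleware priority and the example archive locations are assumptions:

# settings.py (sketch)

SPIDER_MIDDLEWARES = {
    # The priority value 532 is an arbitrary example.
    "scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
}

# One or more WACZ sources, comma-separated; smart_open also handles remote URIs.
WACZ_SOURCE_URL = "archives/crawl-1.wacz,s3://example-bucket/crawl-2.wacz"

# Replace the spider's own start requests with every matching entry in the WACZ index.
WACZ_CRAWL = True

# Transport timeout in seconds, passed to smart_open when opening the archive.
WACZ_TIMEOUT = 60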
4 changes: 2 additions & 2 deletions scrapy_webarchive/warc.py
@@ -1,8 +1,8 @@
import socket
from typing import List, Tuple
import uuid
from datetime import datetime, timezone
from io import BytesIO
from typing import List, Tuple
from urllib.parse import urlparse

from scrapy import __version__ as scrapy_version
@@ -162,7 +162,7 @@ class WarcRecordTransformer:

response_types = ResponseTypes()

def request_for_record(self, record: WARCRecord, **kwargs):
def request_for_record(self, record: dict, **kwargs):
# TODO: locate request in WACZ and include all relevant things (like headers)
return Request(url=record["url"], method=record.get("method", "GET"), **kwargs)

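The loosened annotation matches how the new middleware calls request_for_record: it passes the plain dict entries yielded by wacz.iter_index(), not parsed WARCRecord objects. A small usage sketch mirroring that call; the extra keys in the example entry (offset, length) are assumptions about the index format:

from scrapy_webarchive.warc import record_transformer

# Hypothetical index entry as yielded by wacz.iter_index().
entry = {"url": "https://example.com/page", "offset": 1024, "length": 2048}

# Build a Scrapy Request from the entry; the method defaults to GET when absent.
request = record_transformer.request_for_record(
    entry,
    flags=["wacz_start_request"],
    meta={"wacz_index_entry": entry},
    dont_filter=True,
)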
