From ebfd1e9804917f9b7277f58ad688128390957bdb Mon Sep 17 00:00:00 2001 From: Yomguithereal Date: Mon, 15 Apr 2024 17:37:14 +0200 Subject: [PATCH] Adding browser_emulation to Crawlers Fix #957 --- ftest/crawlers/echojs.py | 17 +++++++++++-- minet/crawl/crawler.py | 53 +++++++++++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/ftest/crawlers/echojs.py b/ftest/crawlers/echojs.py index d4fb42a61f..3575f23fac 100644 --- a/ftest/crawlers/echojs.py +++ b/ftest/crawlers/echojs.py @@ -7,9 +7,9 @@ def scrape(soup: WonderfulSoup) -> SpiderResult: next_links = soup.scrape("#newslist article > h2 > a[href]", "href") - title = soup.force_select_one("title").get_text() + title = soup.select_one("title") - return title, next_links + return title.get_text() if title is not None else None, next_links def spider(job: CrawlJob, response: Response) -> SpiderResult: @@ -60,3 +60,16 @@ def __init__(self, **kwargs): def factory(**crawler_kwargs): return EchoJSCrawler(**crawler_kwargs) + + +def emulation_factory(**crawler_kwargs): + async def init(context): + print(context) + + return Crawler( + EchoJSSpider(), + browser_emulation=True, + browser_kwargs={"adblock": True}, + browser_context_init=init, + **crawler_kwargs, + ) diff --git a/minet/crawl/crawler.py b/minet/crawl/crawler.py index 408399c17d..ced39699b4 100644 --- a/minet/crawl/crawler.py +++ b/minet/crawl/crawler.py @@ -18,9 +18,15 @@ Iterable, Iterator, Union, + Awaitable, + TYPE_CHECKING, ) from minet.types import ParamSpec +if TYPE_CHECKING: + from playwright.async_api import BrowserContext + from minet.browser import ThreadsafeBrowser + from os import makedirs from os.path import join from threading import Lock @@ -125,13 +131,16 @@ def __init__( self.max_depth = crawler.max_depth self.callback = callback - self.default_kwargs = { - "pool_manager": crawler.executor.pool_manager, - "max_redirects": max_redirects, - "stateful": stateful_redirects, - "spoof_ua": spoof_ua, - "cancel_event": crawler.executor.cancel_event, - } + self.default_kwargs = {} + + if self.crawler.browser is None: + self.default_kwargs = { + "pool_manager": crawler.executor.pool_manager, + "max_redirects": max_redirects, + "stateful": stateful_redirects, + "spoof_ua": spoof_ua, + "cancel_event": crawler.executor.cancel_event, + } if use_pycurl: del self.default_kwargs["pool_manager"] @@ -179,11 +188,16 @@ def __call__( try: retryer = getattr(self.local_context, "retryer", None) + request_fn = ( + request + if self.crawler.browser is None + else self.crawler.browser.request + ) if retryer is not None: - response = retryer(request, job.url, **kwargs) + response = retryer(request_fn, job.url, **kwargs) else: - response = request(job.url, **kwargs) + response = request_fn(job.url, **kwargs) # If end url is different from job we add the url to visited cache # NOTE: this is somewhat subject to race conditions but it should @@ -260,6 +274,8 @@ class Crawler(Generic[CrawlJobDataTypes, CrawlResultDataTypes]): enqueue_lock: Lock + browser: Optional["ThreadsafeBrowser"] + queue: CrawlerQueue persistent: bool @@ -302,11 +318,27 @@ def __init__( max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS, stateful_redirects: bool = False, spoof_ua: bool = False, + browser_emulation: bool = False, + browser_kwargs: Dict[str, Any] = {}, + browser_context_init: Optional[ + Callable[["BrowserContext"], Awaitable[None]] + ] = None, ): # Validation if resume and persistent_storage_path is None: raise TypeError("cannot resume a non-persistent crawler") + # Browser emulation? + self.browser = None + + if browser_emulation: + from minet.browser import ThreadsafeBrowser + + self.browser = ThreadsafeBrowser(**browser_kwargs) + + if browser_context_init is not None: + self.browser.run_in_default_context(browser_context_init) + # Utilities self.file_writer = ThreadSafeFileWriter(writer_root_directory, sqlar=sqlar) self.process_pool = None @@ -490,6 +522,9 @@ def stop(self): if self.url_cache: self.url_cache.close() + if self.browser is not None: + self.browser.stop() + def __enter__(self): self.start() return self