Adding browser_emulation to Crawlers
Fix #957
Yomguithereal committed Apr 15, 2024
1 parent 8e4ab2a commit ebfd1e9
Showing 2 changed files with 59 additions and 11 deletions.
17 changes: 15 additions & 2 deletions ftest/crawlers/echojs.py
@@ -7,9 +7,9 @@

def scrape(soup: WonderfulSoup) -> SpiderResult:
next_links = soup.scrape("#newslist article > h2 > a[href]", "href")
-    title = soup.force_select_one("title").get_text()
+    title = soup.select_one("title")

-    return title, next_links
+    return title.get_text() if title is not None else None, next_links


def spider(job: CrawlJob, response: Response) -> SpiderResult:
@@ -60,3 +60,16 @@ def __init__(self, **kwargs):

def factory(**crawler_kwargs):
    return EchoJSCrawler(**crawler_kwargs)
+
+
+def emulation_factory(**crawler_kwargs):
+    async def init(context):
+        print(context)
+
+    return Crawler(
+        EchoJSSpider(),
+        browser_emulation=True,
+        browser_kwargs={"adblock": True},
+        browser_context_init=init,
+        **crawler_kwargs,
+    )
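
The print(context) hook above is only a smoke test. Below is a hedged sketch of a more useful browser_context_init, reusing the Crawler and EchoJSSpider already defined or imported in this test file; the hook relies only on Playwright's documented BrowserContext.add_cookies, and the cookie name, value and domain are purely illustrative:

async def init_with_cookie(context):
    # browser_context_init receives Playwright's async BrowserContext, so the
    # hook must be a coroutine; it is run once against the default context
    # (via run_in_default_context) before crawling starts.
    await context.add_cookies(
        [{"name": "consent", "value": "1", "domain": "echojs.com", "path": "/"}]
    )


def emulation_factory_with_cookies(**crawler_kwargs):
    return Crawler(
        EchoJSSpider(),
        browser_emulation=True,
        browser_kwargs={"adblock": True},
        browser_context_init=init_with_cookie,
        **crawler_kwargs,
    )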
53 changes: 44 additions & 9 deletions minet/crawl/crawler.py
@@ -18,9 +18,15 @@
Iterable,
Iterator,
Union,
+    Awaitable,
+    TYPE_CHECKING,
)
from minet.types import ParamSpec

+if TYPE_CHECKING:
+    from playwright.async_api import BrowserContext
+    from minet.browser import ThreadsafeBrowser
+
from os import makedirs
from os.path import join
from threading import Lock
@@ -125,13 +131,16 @@ def __init__(
self.max_depth = crawler.max_depth
self.callback = callback

-        self.default_kwargs = {
-            "pool_manager": crawler.executor.pool_manager,
-            "max_redirects": max_redirects,
-            "stateful": stateful_redirects,
-            "spoof_ua": spoof_ua,
-            "cancel_event": crawler.executor.cancel_event,
-        }
+        self.default_kwargs = {}
+
+        if self.crawler.browser is None:
+            self.default_kwargs = {
+                "pool_manager": crawler.executor.pool_manager,
+                "max_redirects": max_redirects,
+                "stateful": stateful_redirects,
+                "spoof_ua": spoof_ua,
+                "cancel_event": crawler.executor.cancel_event,
+            }

if use_pycurl:
del self.default_kwargs["pool_manager"]
@@ -179,11 +188,16 @@ def __call__(

try:
retryer = getattr(self.local_context, "retryer", None)
+            request_fn = (
+                request
+                if self.crawler.browser is None
+                else self.crawler.browser.request
+            )

if retryer is not None:
-                response = retryer(request, job.url, **kwargs)
+                response = retryer(request_fn, job.url, **kwargs)
            else:
-                response = request(job.url, **kwargs)
+                response = request_fn(job.url, **kwargs)

# If end url is different from job we add the url to visited cache
# NOTE: this is somewhat subject to race conditions but it should
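
The request_fn indirection above keeps retrying agnostic of how the fetch is actually performed: the retryer wraps whichever callable was selected. A self-contained illustration of that dispatch pattern (all names below are illustrative, not minet's):

from typing import Callable, Optional


def fetch(url: str, retryer: Optional[Callable] = None, use_browser: bool = False) -> str:
    def http_request(u: str) -> str:
        return f"fetched {u} over the HTTP stack"

    def browser_request(u: str) -> str:
        return f"fetched {u} through an emulated browser"

    # Pick the fetching callable once, then apply the optional retry wrapper
    # uniformly, exactly as the diff does with request vs. crawler.browser.request.
    request_fn = browser_request if use_browser else http_request
    return retryer(request_fn, url) if retryer is not None else request_fn(url)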
@@ -260,6 +274,8 @@ class Crawler(Generic[CrawlJobDataTypes, CrawlResultDataTypes]):

enqueue_lock: Lock

+    browser: Optional["ThreadsafeBrowser"]
+
queue: CrawlerQueue
persistent: bool

@@ -302,11 +318,27 @@ def __init__(
max_redirects: int = DEFAULT_FETCH_MAX_REDIRECTS,
stateful_redirects: bool = False,
spoof_ua: bool = False,
+        browser_emulation: bool = False,
+        browser_kwargs: Dict[str, Any] = {},
+        browser_context_init: Optional[
+            Callable[["BrowserContext"], Awaitable[None]]
+        ] = None,
):
# Validation
if resume and persistent_storage_path is None:
raise TypeError("cannot resume a non-persistent crawler")

+        # Browser emulation?
+        self.browser = None
+
+        if browser_emulation:
+            from minet.browser import ThreadsafeBrowser
+
+            self.browser = ThreadsafeBrowser(**browser_kwargs)
+
+            if browser_context_init is not None:
+                self.browser.run_in_default_context(browser_context_init)
+
# Utilities
self.file_writer = ThreadSafeFileWriter(writer_root_directory, sqlar=sqlar)
self.process_pool = None
@@ -490,6 +522,9 @@ def stop(self):
if self.url_cache:
self.url_cache.close()

+        if self.browser is not None:
+            self.browser.stop()
+
def __enter__(self):
self.start()
return self
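
Taken together, the changes give a browser-emulated crawler the same lifecycle as an HTTP one. A hedged end-to-end sketch, reusing EchoJSSpider and the init hook from the ftest file above; only __enter__, start() and stop() are visible in this diff, so the idea that __exit__ delegates to stop() is an assumption here:

crawler = Crawler(
    EchoJSSpider(),
    browser_emulation=True,            # spawns a ThreadsafeBrowser in __init__
    browser_kwargs={"adblock": True},  # forwarded verbatim to ThreadsafeBrowser
    browser_context_init=init,         # awaited once in the default browser context
)

with crawler:
    # Requests are now routed through ThreadsafeBrowser.request instead of the
    # HTTP pool manager; leaving the block is assumed to stop the crawler,
    # which in turn stops the browser (see stop() above).
    ...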
