Fix: retrying context closing (#28)
* Base for retrying

* Request-Response in puppeteer terms.

* Closing contexts after retry

* Typos and readme

* fix: missing cntx_id

* Proper merging from other branch

* Readme and extra import

* Version update

* Defaults

* Errback and CloseContextRequest fix

* Typo

* log warning if close context response status is not 200

---------

Co-authored-by: Max Varlamov <[email protected]>
MatthewZMSU and mxsnq authored Jul 2, 2024
1 parent 9bd72a8 commit ca2ddd9
Showing 6 changed files with 93 additions and 25 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -102,6 +102,9 @@ By default, only cookies are sent.
 You would also like to send meta with your request. By default, you are not allowed to do this
 in order to sustain backward compatibility. You can change this behaviour by setting `PUPPETEER_INCLUDE_META` to True.
 
+Once your spider has finished crawling, the service middleware will close all contexts with
+`scrapypuppeteer.CloseContextRequest`. It accepts a list of all browser contexts to be closed.
+
 ## Automatic recaptcha solving
 
 Enable PuppeteerRecaptchaDownloaderMiddleware to automatically solve recaptcha during scraping. We do not recommend
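
For readers of the README paragraph above, a minimal sketch of issuing this request by hand from a spider. The middleware normally sends it automatically once the spider goes idle; everything below except `CloseContextRequest` itself is a hypothetical example, not part of this commit.

```python
# Minimal sketch: closing browser contexts manually from a spider.
# The service middleware normally does this itself on spider_idle;
# spider name and method are hypothetical placeholders.
import scrapy

from scrapypuppeteer import CloseContextRequest


class MySpider(scrapy.Spider):
    name = "example"

    def finish(self, used_context_ids):
        # used_context_ids would be gathered from earlier Puppeteer responses
        yield CloseContextRequest(contexts=list(used_context_ids))
```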
10 changes: 8 additions & 2 deletions scrapypuppeteer/__init__.py
@@ -1,2 +1,8 @@
-from .request import PuppeteerRequest
-from .response import *
+from .request import PuppeteerRequest, CloseContextRequest
+from .response import (
+    PuppeteerResponse,
+    PuppeteerHtmlResponse,
+    PuppeteerScreenshotResponse,
+    PuppeteerRecaptchaSolverResponse,
+    PuppeteerJsonResponse,
+)
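
With the star import replaced by an explicit export list, the package's public surface is now enumerable. Assuming this `__init__.py`, downstream code can pull both request types and the concrete response classes from the package root:

```python
# Assuming the __init__.py above, these imports all resolve from the
# package root:
from scrapypuppeteer import (
    CloseContextRequest,
    PuppeteerHtmlResponse,
    PuppeteerRequest,
)
```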
62 changes: 43 additions & 19 deletions scrapypuppeteer/middleware.py
@@ -4,10 +4,12 @@
 from typing import List, Union
 from urllib.parse import urlencode, urljoin
 
-from scrapy import Request, signals
+from scrapy import signals
 from scrapy.crawler import Crawler
-from scrapy.exceptions import IgnoreRequest, NotConfigured
-from scrapy.http import Headers, TextResponse
+from scrapy.exceptions import IgnoreRequest, NotConfigured, DontCloseSpider
+from scrapy.http import Headers, TextResponse, Response
+from scrapy.utils.log import failure_to_exc_info
+from twisted.python.failure import Failure
 
 from scrapypuppeteer.actions import (
     Click,
@@ -26,16 +28,16 @@
     PuppeteerRecaptchaSolverResponse,
     PuppeteerJsonResponse,
 )
-from scrapypuppeteer.request import ActionRequest, PuppeteerRequest
+from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest
 
 
 class PuppeteerServiceDownloaderMiddleware:
     """
     This downloader middleware converts PuppeteerRequest instances to
     Puppeteer service API requests and then converts its responses to
     PuppeteerResponse instances. Additionally, it tracks all browser contexts
-    that spider uses and performs cleanup request to service once spider
-    is closed.
+    that spider uses and performs cleanup request to service right before
+    spider is closed.
 
     Additionally, the middleware uses these meta-keys, do not use them, because their changing
     could possibly (almost probably) break determined behaviour:
@@ -49,7 +51,7 @@ class PuppeteerServiceDownloaderMiddleware:
     PUPPETEER_INCLUDE_HEADERS (bool|list[str])
         Determines which request headers will be sent to remote site by puppeteer service.
         Either True (all headers), False (no headers) or list of header names.
-        May be overriden per request.
+        May be overridden per request.
         By default, only cookies are sent.
 
     PUPPETEER_INCLUDE_META (bool)
@@ -92,14 +94,24 @@ def from_crawler(cls, crawler):
         include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False)
         middleware = cls(crawler, service_url, include_headers, include_meta)
         crawler.signals.connect(
-            middleware.close_used_contexts, signal=signals.spider_closed
+            middleware.close_used_contexts, signal=signals.spider_idle
         )
         return middleware
 
     def process_request(self, request, spider):
-        if not isinstance(request, PuppeteerRequest):
-            return
+        if isinstance(request, CloseContextRequest):
+            return self.process_close_context_request(request)
+
+        if isinstance(request, PuppeteerRequest):
+            return self.process_puppeteer_request(request)
+
+    def process_close_context_request(self, request: CloseContextRequest):
+        if not request.is_valid_url:
+            return request.replace(
+                url=urljoin(self.service_base_url, "/close_context"),
+            )
+
+    def process_puppeteer_request(self, request: PuppeteerRequest):
         action = request.action
         service_url = urljoin(self.service_base_url, action.endpoint)
         service_params = self._encode_service_params(request)
@@ -176,8 +188,6 @@ def process_response(self, request, response, spider):
             return response.replace(request=request)
 
         response_data = json.loads(response.text)
-        response_cls = self._get_response_class(puppeteer_request.action)
-
         if response.status != 200:
             reason = response_data.pop("error", f"undefined, status {response.status}")
             self.service_logger.warning(
@@ -188,6 +198,8 @@ def process_response(self, request, response, spider):
             self.used_contexts[id(spider)].add(context_id)
             return response
 
+        response_cls = self._get_response_class(puppeteer_request.action)
+
         return self._form_response(
             response_cls,
             response_data,
@@ -225,16 +237,28 @@ def _get_response_class(request_action):
         return PuppeteerJsonResponse
 
     def close_used_contexts(self, spider):
-        contexts = list(self.used_contexts[id(spider)])
+        contexts = list(self.used_contexts.pop(id(spider), set()))
         if contexts:
-            request = Request(
-                urljoin(self.service_base_url, "/close_context"),
-                method="POST",
-                headers=Headers({"Content-Type": "application/json"}),
+            request = CloseContextRequest(
+                contexts,
                 meta={"proxy": None},
-                body=json.dumps(contexts),
             )
-            return self.crawler.engine.downloader.fetch(request, None)
+
+            def handle_close_contexts_result(result):
+                if isinstance(result, Response):
+                    if result.status == 200:
+                        self.service_logger.debug(
+                            f"Successfully closed {len(request.contexts)} "
+                            f"contexts with request {result.request}"
+                        )
+                    else:
+                        self.service_logger.warning(f"Could not close contexts: {result.text}")
+                elif isinstance(result, Failure):
+                    self.service_logger.warning(
+                        f"Could not close contexts: {result.value}",
+                        exc_info=failure_to_exc_info(result),
+                    )
+
+            dfd = self.crawler.engine.download(request)
+            dfd.addBoth(handle_close_contexts_result)
+
+            raise DontCloseSpider()
 
 
 class PuppeteerRecaptchaDownloaderMiddleware:
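
The move from spider_closed to spider_idle is the core of the fix: the old code pushed the cleanup request straight into the downloader at shutdown, where it could not be retried, while the new code downloads through the engine while the spider is merely idle, attaches a both-callback for the Response/Failure outcome, and raises DontCloseSpider so the spider stays alive until the close request settles. A standalone sketch of that signal pattern, with hypothetical class and attribute names:

```python
# Standalone sketch of the spider_idle pattern used above: keep the spider
# alive while cleanup work is still in flight. Names here are hypothetical,
# not part of this commit.
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class IdleCleanup:
    def __init__(self):
        self.cleanup_started = False

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.on_idle, signal=signals.spider_idle)
        return ext

    def on_idle(self, spider):
        if not self.cleanup_started:
            self.cleanup_started = True
            # schedule cleanup requests here (e.g. via crawler.engine.download);
            # raising DontCloseSpider makes Scrapy wait and fire spider_idle again
            raise DontCloseSpider()
```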
38 changes: 37 additions & 1 deletion scrapypuppeteer/request.py
@@ -1,6 +1,7 @@
+import json
 from typing import Tuple, List, Union
 
-from scrapy.http import Request
+from scrapy.http import Request, Headers
 
 from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
 
@@ -94,3 +95,38 @@ def __init__(
         self.page_id = page_id
         self.close_page = close_page
         self.include_headers = include_headers
+
+
+class CloseContextRequest(Request):
+    """
+    This request is used to close the browser contexts.
+    The response for this request is a regular Scrapy HTMLResponse.
+    """
+
+    attributes: Tuple[str, ...] = Request.attributes + ("contexts",)
+
+    def __init__(self, contexts: List, **kwargs):
+        """
+        :param contexts: list of puppeteer contexts to close.
+        :param kwargs: arguments of scrapy.Request.
+        """
+        self.contexts = contexts
+        self.is_valid_url = False
+
+        if "url" in kwargs:
+            self.is_valid_url = True
+
+        url = kwargs.pop("url", "://")  # Incorrect url. To be replaced in middleware
+
+        kwargs["method"] = "POST"
+        kwargs["headers"] = Headers({"Content-Type": "application/json"})
+        kwargs["body"] = json.dumps(self.contexts)
+
+        super().__init__(url, **kwargs)
+
+    def __repr__(self):
+        return f"<CLOSE CONTEXT {self.url if self.is_valid_url else 'undefined url'}>"
+
+    def __str__(self):
+        return self.__repr__()
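
A quick illustration of the defaults this constructor enforces, assuming the class as defined above; the context IDs are hypothetical placeholders.

```python
# Illustration of CloseContextRequest defaults (context IDs are made up):
from scrapypuppeteer.request import CloseContextRequest

req = CloseContextRequest(["ctx-1", "ctx-2"])
assert req.method == "POST"
assert req.body == b'["ctx-1", "ctx-2"]'
assert not req.is_valid_url  # middleware swaps in <service>/close_context
print(req)                   # <CLOSE CONTEXT undefined url>
```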
3 changes: 1 addition & 2 deletions scrapypuppeteer/response.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Tuple, Union
 
 from scrapy.exceptions import ScrapyDeprecationWarning
@@ -6,8 +7,6 @@
 from scrapypuppeteer import PuppeteerRequest
 from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
 
-import warnings
-
 
 class PuppeteerResponse(TextResponse):
     attributes: Tuple[str, ...] = TextResponse.attributes + (
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.2.0",
+    version="0.3.0",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
     long_description=long_description,
     long_description_content_type="text/markdown",
