Json to Html converse (#30)
* to_html method

* Formatting

* Attribute fix

* Documentation

* Version update
MatthewZMSU authored Jul 4, 2024
1 parent ca2ddd9 commit 5b0ee56
Showing 4 changed files with 68 additions and 19 deletions.
43 changes: 30 additions & 13 deletions README.md
@@ -43,6 +43,16 @@
class MySpider(scrapy.Spider):
    ...
```

## Puppeteer responses

There is a parent `PuppeteerResponse` class from which all other response classes inherit.

Here is a list of them all:
- `PuppeteerHtmlResponse` - has `html` and `cookies` properties
- `PuppeteerScreenshotResponse` - has `screenshot` property
- `PuppeteerJsonResponse` - has a `data` property and a `to_html()` method, which tries to convert the response into a `PuppeteerHtmlResponse` (see the sketch after this list)
- `PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse)` - has `recaptcha_data` property
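
For example, a minimal sketch of converting a JSON response in a spider callback (the spider name and selector are illustrative; `to_html()` requires the JSON payload to carry `html` and `cookies` keys):

```python
import scrapy

class MyJsonSpider(scrapy.Spider):
    name = 'my_json_spider'  # illustrative

    def parse_json(self, response):  # receives a PuppeteerJsonResponse
        # to_html() needs response.data to be a dict
        # with 'html' and 'cookies' keys:
        if isinstance(response.data, dict) and {'html', 'cookies'} <= response.data.keys():
            html_response = response.to_html()
            # The usual Scrapy selector API is now available:
            yield {'title': html_response.css('title::text').get()}
```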

## Advanced usage

`PuppeteerRequest`'s first argument is a browser action.
@@ -67,32 +77,42 @@
Then use `response.follow` to continue interacting with the same tab:

```python
import scrapy
from scrapypuppeteer import PuppeteerRequest, PuppeteerHtmlResponse
from scrapypuppeteer.actions import Click

class MySpider(scrapy.Spider):
    ...
    def start_requests(self):
        yield PuppeteerRequest(
            'https://example.com',  # will be transformed into GoTo action
            close_page=False,
            callback=self.parse,
        )

    def parse(self, response: PuppeteerHtmlResponse):
        ...
        # parse and yield some items
        ...
        next_page_selector = 'button.next-page-or-smth'
        if response.css(next_page_selector):
            yield response.follow(
                Click(
                    next_page_selector,
                    wait_options={'selectorOrTimeout': 3000},  # wait 3 seconds
                ),
                close_page=False,
                callback=self.parse,
            )
```

On your first request, the service will create a new incognito browser context and a new page in it.
Their ids are returned in the response object as `context_id` and `page_id` attributes.
Following such a response with `response.follow` passes the context and page ids on to the next request.
You may also specify a request's context and page ids directly, as in the sketch below.
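
A minimal sketch of passing the ids along by hand, assuming `PuppeteerRequest` accepts `context_id` and `page_id` keyword arguments (the URL is illustrative):

```python
def parse(self, response):
    # Reuse the same browser context and tab explicitly
    # instead of relying on response.follow:
    yield PuppeteerRequest(
        'https://example.com/page/2',
        context_id=response.context_id,  # id of the incognito browser context
        page_id=response.page_id,  # id of the open page (tab)
        close_page=False,
        callback=self.parse,
    )
```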

When your spider finishes crawling, the service middleware will take care
of closing all used browser contexts with `scrapypuppeteer.CloseContextRequest`.
It accepts a list of all browser contexts to be closed.
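
Purely as an illustration (the middleware issues this request for you, so you normally do not construct it yourself), a sketch of creating such a request, assuming the list of context ids is its first positional argument:

```python
from scrapypuppeteer import CloseContextRequest

# ids of the incognito contexts used during the crawl (illustrative values):
used_contexts = ['<context-id-1>', '<context-id-2>']

request = CloseContextRequest(used_contexts)
```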

You may customize which of a `PuppeteerRequest`'s headers the service sends to the remote website,
either per request via the `include_headers` attribute or globally with the `PUPPETEER_INCLUDE_HEADERS` setting.
@@ -102,9 +102,6 @@
By default, only cookies are sent.
You might also like to send meta with your request. By default, this is not allowed,
in order to sustain backward compatibility. You can change this behaviour by setting `PUPPETEER_INCLUDE_META` to True,
as in the settings sketch below.
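
A hedged sketch of the corresponding project settings (the setting names come from the text above; the exact format of the header list is an assumption):

```python
# settings.py

# Which PuppeteerRequest headers the service forwards to the remote
# website (by default, only cookies are sent); the list format here
# is an assumption:
PUPPETEER_INCLUDE_HEADERS = ['Cookie', 'User-Agent']

# Allow sending meta with requests (disabled by default
# for backward compatibility):
PUPPETEER_INCLUDE_META = True
```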

## Automatic recaptcha solving

Enable PuppeteerRecaptchaDownloaderMiddleware to automatically solve recaptcha during scraping. We do not recommend
@@ -138,5 +155,5 @@
In this case RecaptchaMiddleware will just skip the request.
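
A sketch of enabling it in the project settings (the module path follows the repository layout in this commit; the priority value is an assumption):

```python
DOWNLOADER_MIDDLEWARES = {
    'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1041,
}
```
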
- [x] skeleton that could handle goto, click, scroll, and actions
- [ ] headers and cookies management
- [ ] proxy support for puppeteer
- [x] error handling for requests
- [ ] har support
16 changes: 11 additions & 5 deletions scrapypuppeteer/middleware.py
@@ -247,13 +247,19 @@
    def close_used_contexts(self, spider):
        def handle_close_contexts_result(result):
            if isinstance(result, Response):
                if result.status == 200:
                    self.service_logger.debug(
                        f"Successfully closed {len(request.contexts)} "
                        f"contexts with request {result.request}"
                    )
                else:
                    self.service_logger.warning(
                        f"Could not close contexts: {result.text}"
                    )
            elif isinstance(result, Failure):
                self.service_logger.warning(
                    f"Could not close contexts: {result.value}",
                    exc_info=failure_to_exc_info(result),
                )

        dfd = self.crawler.engine.download(request)
        dfd.addBoth(handle_close_contexts_result)
26 changes: 26 additions & 0 deletions scrapypuppeteer/response.py
@@ -121,6 +121,32 @@
    def __init__(self, url, puppeteer_request, context_id, page_id, data, **kwargs):
        self.data = data
        super().__init__(url, puppeteer_request, context_id, page_id, **kwargs)

    def to_html(self) -> PuppeteerHtmlResponse:
        """
        Tries to convert the PuppeteerJsonResponse into a PuppeteerHtmlResponse.
        For this, self.data must be a dict with an "html" key holding the page
        content as a string and a "cookies" key holding a list of cookies or None.
        If either key is missing, a KeyError is raised.
        """
        if not isinstance(self.data, dict):
            raise TypeError(
                "PuppeteerJsonResponse's .data property must be a dict "
                "to convert it to a PuppeteerHtmlResponse."
            )

        kwargs = dict()
        for attr in PuppeteerResponse.attributes:
            kwargs[attr] = getattr(self, attr)
        kwargs["html"] = self.data["html"]
        kwargs["body"] = kwargs["html"]
        kwargs["cookies"] = self.data["cookies"]
        kwargs["headers"].update({"Content-Type": ["text/html"]})
        kwargs["encoding"] = "utf-8"

        return PuppeteerHtmlResponse(**kwargs)


class PuppeteerRecaptchaSolverResponse(PuppeteerJsonResponse, PuppeteerHtmlResponse):
"""
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@

setup(
    name="scrapy-puppeteer-client",
    version="0.3.1",
    description="A library to use Puppeteer-managed browser in Scrapy spiders",
    long_description=long_description,
    long_description_content_type="text/markdown",
Expand Down
