Skip to content

Commit 7548cfc

Browse files
committed
Pull upstream changes from #37.
This adjusts the code after the said PR refactored the project structure. Reference: #37
1 parent 38a53c9 commit 7548cfc

File tree

9 files changed

+239
-228
lines changed

9 files changed

+239
-228
lines changed

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ web-poet
1818
:target: https://codecov.io/gh/scrapinghub/web-poet
1919
:alt: Coverage report
2020

21-
.. image:: https://readthedocs.org/projects/web-poet/badge/?version=latest
22-
:target: https://web-poet.readthedocs.io/en/latest/?badge=latest
21+
.. image:: https://readthedocs.org/projects/web-poet/badge/?version=stable
22+
:target: https://web-poet.readthedocs.io/en/stable/?badge=stable
2323
:alt: Documentation Status
2424

2525
``web-poet`` implements Page Object pattern for web scraping.

tests/test_requests.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33
import pytest
44
from web_poet.exceptions import RequestBackendError, HttpResponseError
55
from web_poet.page_inputs import (
6+
HttpClient,
67
HttpRequest,
78
HttpResponse,
89
HttpRequestBody,
910
HttpRequestHeaders
1011
)
11-
from web_poet.requests import (
12-
HttpClient,
13-
request_backend_var,
14-
)
12+
from web_poet.requests import request_backend_var
1513

1614

1715
@pytest.fixture
@@ -47,7 +45,7 @@ async def test_perform_request_from_httpclient(async_mock):
4745
async def test_http_client_single_requests(async_mock):
4846
client = HttpClient(async_mock)
4947

50-
with mock.patch("web_poet.requests.HttpRequest") as mock_request:
48+
with mock.patch("web_poet.page_inputs.client.HttpRequest") as mock_request:
5149
response = await client.request("url")
5250
response.url == "url"
5351

web_poet/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
from .pages import WebPage, ItemPage, ItemWebPage, Injectable
2-
from .requests import (
3-
request_backend_var,
4-
HttpClient,
5-
)
2+
from .requests import request_backend_var
63
from .page_inputs import (
74
Meta,
5+
HttpClient,
86
HttpRequest,
97
HttpResponse,
108
HttpRequestHeaders,

web_poet/exceptions/http.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
operations.
77
"""
88

9-
from web_poet.page_inputs import HttpResponse, HttpRequest
9+
from web_poet.page_inputs.http import HttpResponse, HttpRequest
1010

1111

1212
class HttpError(IOError):

web_poet/page_inputs/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from .meta import Meta
2+
from .client import HttpClient
3+
from .http import (
4+
HttpRequest,
5+
HttpResponse,
6+
HttpRequestHeaders,
7+
HttpResponseHeaders,
8+
HttpRequestBody,
9+
HttpResponseBody,
10+
)

web_poet/page_inputs/client.py

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
"""This module has full support for :mod:`asyncio`, which enables developers to
2+
perform asynchronous additional requests inside of Page Objects.
3+
4+
Note that the implementation to fully execute any :class:`~.HttpRequest` is not
5+
handled in this module. With that, the framework using **web-poet** must supply
6+
the implementation.
7+
8+
You can read more about this in the :ref:`advanced-downloader-impl` documentation.
9+
"""
10+
11+
import asyncio
12+
import logging
13+
from typing import Optional, Dict, List, Union, Callable
14+
15+
from web_poet.requests import request_backend_var, _perform_request
16+
from web_poet.page_inputs.http import (
17+
HttpRequest,
18+
HttpRequestHeaders,
19+
HttpRequestBody,
20+
HttpResponse,
21+
)
22+
from web_poet.exceptions import RequestBackendError, HttpResponseError
23+
from web_poet.utils import as_list
24+
25+
logger = logging.getLogger(__name__)
26+
27+
_StrMapping = Dict[str, str]
28+
_Headers = Union[_StrMapping, HttpRequestHeaders]
29+
_Body = Union[bytes, HttpRequestBody]
30+
_Status = Union[str, int]
31+
32+
33+
class HttpClient:
    """A convenient client to easily execute requests.

    By default, it uses the request implementation assigned in the
    ``web_poet.request_backend_var`` which is a :mod:`contextvars` instance to
    download the actual requests. However, it can easily be overridden by
    providing an optional ``request_downloader`` callable.

    Providing the request implementation by dependency injection would be a good
    alternative solution when you want to avoid setting up :mod:`contextvars`
    like ``web_poet.request_backend_var``.

    In any case, this doesn't contain any implementation about how to execute
    any requests fed into it. When setting that up, make sure that the downloader
    implementation returns a :class:`~.HttpResponse` instance.
    """

    def __init__(self, request_downloader: Optional[Callable] = None):
        # Fall back to the module-level default when no downloader is injected.
        self._request_downloader = request_downloader or _perform_request

    @staticmethod
    def _handle_status(
        response: HttpResponse,
        request: HttpRequest,
        *,
        allow_status: Optional[List[_Status]] = None,
    ) -> None:
        """Raise :class:`web_poet.exceptions.http.HttpResponseError` for
        responses whose status code is ``400`` or above, unless that status is
        allowed.

        A status is allowed when it is listed in ``allow_status`` (compared as
        strings, so ``404`` and ``"404"`` are equivalent) or when
        ``allow_status`` contains ``"*"``. Responses without a status
        (``None``) are never treated as errors.
        """
        # NOTE(review): ``as_list`` is presumably what turns ``None`` into an
        # empty list here; confirm against ``web_poet.utils``.
        allowed = [str(value) for value in as_list(allow_status)]
        allow_all_status = any(value.strip() == "*" for value in allowed)

        status = response.status
        if (
            allow_all_status
            or status is None  # allows serialized responses from tests
            or status < 400
            or str(status) in allowed
        ):
            return

        if 400 <= status < 500:
            kind = "Client"
        elif 500 <= status < 600:
            kind = "Server"
        else:
            # Non-standard codes (>= 600) used to leave ``kind`` unbound, which
            # raised UnboundLocalError instead of the intended exception.
            kind = "Unknown"

        msg = f"{status} {kind} Error for {response.url}"
        raise HttpResponseError(msg, request=request, response=response)

    async def request(
        self,
        url: str,
        *,
        method: str = "GET",
        headers: Optional[_Headers] = None,
        body: Optional[_Body] = None,
        allow_status: Optional[List[_Status]] = None,
    ) -> HttpResponse:
        """This is a shortcut for creating a :class:`~.HttpRequest` instance and
        executing that request.

        A :class:`~.HttpResponse` instance should then be returned for successful
        responses in the 1xx-3xx status code range. Otherwise, an exception of
        type :class:`web_poet.exceptions.http.HttpResponseError` will be raised.

        This behavior can be changed by suppressing the exceptions on select
        status codes using the ``allow_status`` param:

            * Passing status code values would not raise the exception when it
              occurs. This would return the response as-is.
            * Passing a "*" value would basically allow any status codes.

        .. warning::
            By convention, the request implementation supplied optionally to
            :class:`~.HttpClient` should return a :class:`~.HttpResponse` instance.
            However, the underlying implementation supplied might change that,
            depending on how the framework using **web-poet** implements it.
        """
        # Missing (or empty) headers/body are normalized to empty containers
        # before the request object is built.
        headers = headers or {}
        body = body or b""
        req = HttpRequest(url=url, method=method, headers=headers, body=body)
        return await self.execute(req, allow_status=allow_status)

    async def get(
        self,
        url: str,
        *,
        headers: Optional[_Headers] = None,
        allow_status: Optional[List[_Status]] = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``GET``
        request.
        """
        return await self.request(
            url=url,
            method="GET",
            headers=headers,
            allow_status=allow_status,
        )

    async def post(
        self,
        url: str,
        *,
        headers: Optional[_Headers] = None,
        body: Optional[_Body] = None,
        allow_status: Optional[List[_Status]] = None,
    ) -> HttpResponse:
        """Similar to :meth:`~.HttpClient.request` but performing a ``POST``
        request.
        """
        return await self.request(
            url=url,
            method="POST",
            headers=headers,
            body=body,
            allow_status=allow_status,
        )

    async def execute(
        self, request: HttpRequest, *, allow_status: Optional[List[_Status]] = None
    ) -> HttpResponse:
        """Accepts a single instance of :class:`~.HttpRequest` and executes it
        using the request implementation configured in the :class:`~.HttpClient`
        instance.

        A :class:`~.HttpResponse` instance should then be returned for successful
        responses in the 1xx-3xx status code range. Otherwise, an exception of
        type :class:`web_poet.exceptions.http.HttpResponseError` will be raised.

        This behavior can be changed by suppressing the exceptions on select
        status codes using the ``allow_status`` param:

            * Passing status code values would not raise the exception when it
              occurs. This would return the response as-is.
            * Passing a "*" value would basically allow any status codes.
        """
        response = await self._request_downloader(request)
        # Raises for 4xx and above unless the status is explicitly allowed.
        self._handle_status(response, request, allow_status=allow_status)
        return response

    async def batch_execute(
        self,
        *requests: HttpRequest,
        return_exceptions: bool = False,
        allow_status: Optional[List[_Status]] = None,
    ) -> List[Union[HttpResponse, Exception]]:
        """Similar to :meth:`~.HttpClient.execute` but accepts a collection of
        :class:`~.HttpRequest` instances that would be batch executed.

        The order of the :class:`~.HttpResponse` instances would correspond to
        the order of the :class:`~.HttpRequest` instances passed.

        If any of the :class:`~.HttpRequest` raises an exception upon execution,
        the exception is raised.

        To prevent this, the actual exception can be returned alongside any
        successful :class:`~.HttpResponse`. This enables salvaging any usable
        responses despite any possible failures. This can be done by setting
        ``True`` to the ``return_exceptions`` parameter.

        Like :meth:`~.HttpClient.execute`, :class:`web_poet.exceptions.http.HttpResponseError`
        will be raised for responses with status codes in the ``4xx-5xx`` range.
        The ``allow_status`` parameter could be used the same way here to prevent
        these exceptions from being raised.

        You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``.
        However, it would be returning :class:`web_poet.exceptions.http.HttpResponseError`
        instead of :class:`~.HttpResponse`.
        """
        coroutines = [self.execute(r, allow_status=allow_status) for r in requests]
        # ``asyncio.gather`` keeps the results in the same order as its inputs.
        responses = await asyncio.gather(
            *coroutines, return_exceptions=return_exceptions
        )
        return responses

web_poet/page_inputs.py renamed to web_poet/page_inputs/http.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
from web_poet.utils import memoizemethod_noargs
1515

1616
T_headers = TypeVar("T_headers", bound="HttpResponseHeaders")
17-
AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]]
17+
18+
_AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]]
1819

1920

2021
class HttpRequestBody(bytes):
@@ -99,7 +100,7 @@ class HttpResponseHeaders(_HttpHeaders):
99100

100101
@classmethod
101102
def from_bytes_dict(
102-
cls: Type[T_headers], arg: AnyStrDict, encoding: str = "utf-8"
103+
cls: Type[T_headers], arg: _AnyStrDict, encoding: str = "utf-8"
103104
) -> T_headers:
104105
"""An alternative constructor for instantiation where the header-value
105106
pairs could be in raw bytes form.
@@ -270,13 +271,3 @@ def _auto_detect_fun(self, body: bytes) -> Optional[str]:
270271
except UnicodeError:
271272
continue
272273
return resolve_encoding(enc)
273-
274-
275-
class Meta(dict):
276-
"""Container class that could contain any arbitrary data to be passed into
277-
a Page Object.
278-
279-
Note that this is simply a subclass of Python's ``dict``.
280-
"""
281-
282-
pass

web_poet/page_inputs/meta.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
class Meta(dict):
    """Holder for arbitrary data that should be handed over to a Page Object.

    It adds nothing on top of Python's built-in ``dict``; it is a plain
    subclass.
    """

0 commit comments

Comments
 (0)