From 34528b2d720f83ad745a69068e75f65f47ce9374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Nov 2022 20:53:04 +0100 Subject: [PATCH 01/17] Reference: cover only the preferred, shorter import paths --- docs/reference/adapter.rst | 3 +-- docs/reference/components.rst | 24 ++++++++---------------- docs/reference/items.rst | 17 ++++++----------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/docs/reference/adapter.rst b/docs/reference/adapter.rst index 9add1131..f23fc23b 100644 --- a/docs/reference/adapter.rst +++ b/docs/reference/adapter.rst @@ -2,5 +2,4 @@ Adapter ======= -.. class:: zyte_common_items.ZyteItemAdapter -.. autoclass:: zyte_common_items.adapter.ZyteItemAdapter \ No newline at end of file +.. autoclass:: zyte_common_items.ZyteItemAdapter \ No newline at end of file diff --git a/docs/reference/components.rst b/docs/reference/components.rst index 5e230a59..ef089bc4 100644 --- a/docs/reference/components.rst +++ b/docs/reference/components.rst @@ -7,34 +7,26 @@ Components These classes are used to map data within :ref:`items `, and are not tied to any specific item type. -.. class:: zyte_common_items.AdditionalProperty(**kwargs) -.. autoclass:: zyte_common_items.components.AdditionalProperty(**kwargs) +.. autoclass:: zyte_common_items.AdditionalProperty(**kwargs) :members: -.. class:: zyte_common_items.AggregateRating(**kwargs) -.. autoclass:: zyte_common_items.components.AggregateRating(**kwargs) +.. autoclass:: zyte_common_items.AggregateRating(**kwargs) :members: -.. class:: zyte_common_items.Brand(**kwargs) -.. autoclass:: zyte_common_items.components.Brand(**kwargs) +.. autoclass:: zyte_common_items.Brand(**kwargs) :members: -.. class:: zyte_common_items.Breadcrumb(**kwargs) -.. autoclass:: zyte_common_items.components.Breadcrumb(**kwargs) +.. autoclass:: zyte_common_items.Breadcrumb(**kwargs) :members: -.. class:: zyte_common_items.Gtin(**kwargs) -.. 
autoclass:: zyte_common_items.components.Gtin(**kwargs) +.. autoclass:: zyte_common_items.Gtin(**kwargs) :members: -.. class:: zyte_common_items.Image(**kwargs) -.. autoclass:: zyte_common_items.components.Image(**kwargs) +.. autoclass:: zyte_common_items.Image(**kwargs) :members: -.. class:: zyte_common_items.Link(**kwargs) -.. autoclass:: zyte_common_items.components.Link(**kwargs) +.. autoclass:: zyte_common_items.Link(**kwargs) :members: -.. class:: zyte_common_items.Metadata(**kwargs) -.. autoclass:: zyte_common_items.components.Metadata(**kwargs) +.. autoclass:: zyte_common_items.Metadata(**kwargs) :members: diff --git a/docs/reference/items.rst b/docs/reference/items.rst index c3f712dc..b4bdaeed 100644 --- a/docs/reference/items.rst +++ b/docs/reference/items.rst @@ -7,26 +7,22 @@ Items Product ======= -.. class:: zyte_common_items.Product(**kwargs) -.. autoclass:: zyte_common_items.items.Product(**kwargs) +.. autoclass:: zyte_common_items.Product(**kwargs) :members: :inherited-members: -.. class:: zyte_common_items.ProductVariant(**kwargs) -.. autoclass:: zyte_common_items.items.ProductVariant(**kwargs) +.. autoclass:: zyte_common_items.ProductVariant(**kwargs) :members: :inherited-members: Product List ============ -.. class:: zyte_common_items.ProductList(**kwargs) -.. autoclass:: zyte_common_items.items.ProductList(**kwargs) +.. autoclass:: zyte_common_items.ProductList(**kwargs) :members: :inherited-members: -.. class:: zyte_common_items.ProductFromList(**kwargs) -.. autoclass:: zyte_common_items.items.ProductFromList(**kwargs) +.. autoclass:: zyte_common_items.ProductFromList(**kwargs) :members: :inherited-members: @@ -34,10 +30,9 @@ Product List Custom items ============ -Subclass :class:`~zyte_common_items.base.Item` to create your own item classes. +Subclass :class:`~zyte_common_items.Item` to create your own item classes. -.. class:: zyte_common_items.Item(**kwargs) -.. autoclass:: zyte_common_items.base.Item(**kwargs) +.. 
autoclass:: zyte_common_items.Item(**kwargs) :members: .. attribute:: _unknown_fields_dict From 2d1cd89d1dc7ba16524df311a5d2391989f84bf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Nov 2022 21:05:01 +0100 Subject: [PATCH 02/17] Include the table of contents into the index page of the documentation --- docs/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 7f2fd69b..7d06d5cc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,6 @@ zyte-common-items |version| documentation :end-before: .. description ends .. toctree:: - :hidden: setup usage From 1463016478dcf40e54a55e2b9d81bd291cf5d1ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Nov 2022 21:05:17 +0100 Subject: [PATCH 03/17] Reword the intro to cover page objects --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 6b6d955b..68bd3b34 100644 --- a/README.rst +++ b/README.rst @@ -20,12 +20,12 @@ zyte-common-items .. description starts -``zyte-common-items`` is a Python 3.7+ library of item classes used by Zyte_ to -normalize different types of data extracted from websites. - -It can be used in custom data extraction code for normalization purposes, -maximizing opportunities for code reuse. +``zyte-common-items`` is a Python 3.7+ library of `page object`_ and item_ +classes for web data extraction that we use at Zyte_ to maximize opportunities +for code reuse. +.. _item: https://docs.scrapy.org/en/latest/topics/items.html +.. _page object: https://web-poet.readthedocs.io/en/stable/ .. _Zyte: https://www.zyte.com/ .. 
description ends From e91109722ace3172fd6bd4ae8f60698d4fa5e5b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 00:19:41 +0100 Subject: [PATCH 04/17] Add ProductPage and ProductListPage --- README.rst | 2 +- docs/conf.py | 2 + docs/index.rst | 20 ++++++++- docs/reference/index.rst | 1 + docs/reference/items.rst | 8 ++-- docs/reference/page-objects.rst | 43 +++++++++++++++++++ docs/{usage.rst => usage/items.rst} | 7 +++- docs/usage/page-objects.rst | 40 ++++++++++++++++++ tests/test_page_objects.py | 64 +++++++++++++++++++++++++++++ tox.ini | 4 +- zyte_common_items/__init__.py | 1 + zyte_common_items/base.py | 6 +-- zyte_common_items/page_objects.py | 31 ++++++++++++++ 13 files changed, 218 insertions(+), 11 deletions(-) create mode 100644 docs/reference/page-objects.rst rename docs/{usage.rst => usage/items.rst} (96%) create mode 100644 docs/usage/page-objects.rst create mode 100644 tests/test_page_objects.py create mode 100644 zyte_common_items/page_objects.py diff --git a/README.rst b/README.rst index 68bd3b34..a30c3c95 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ zyte-common-items .. description starts -``zyte-common-items`` is a Python 3.7+ library of `page object`_ and item_ +``zyte-common-items`` is a Python 3.7+ library of item_ and `page object`_ classes for web data extraction that we use at Zyte_ to maximize opportunities for code reuse. diff --git a/docs/conf.py b/docs/conf.py index 3c86a9da..91eaf1b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,6 +35,8 @@ def get_version_and_release(): autodoc_member_order = "groupwise" +intersphinx_disabled_reftypes = [] intersphinx_mapping = { "python": ("https://docs.python.org/3", None), + "web-poet": ("https://web-poet.readthedocs.io/en/stable", None), } diff --git a/docs/index.rst b/docs/index.rst index 7d06d5cc..a6766a8c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,9 +7,27 @@ zyte-common-items |version| documentation :end-before: .. 
description ends .. toctree:: + :caption: Getting started + :maxdepth: 1 setup - usage + +.. toctree:: + :caption: Usage + :maxdepth: 1 + + usage/items + usage/page-objects + +.. toctree:: + :caption: Reference + :maxdepth: 1 + reference/index changelog + +.. toctree:: + :caption: Contributing + :maxdepth: 1 + contributing diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 4a5fe362..a3535a05 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -5,5 +5,6 @@ Reference .. toctree:: items + page-objects components adapter diff --git a/docs/reference/items.rst b/docs/reference/items.rst index b4bdaeed..24555180 100644 --- a/docs/reference/items.rst +++ b/docs/reference/items.rst @@ -1,8 +1,8 @@ -.. _items: +.. _item-api: -===== -Items -===== +======== +Item API +======== Product ======= diff --git a/docs/reference/page-objects.rst b/docs/reference/page-objects.rst new file mode 100644 index 00000000..a5ac5c64 --- /dev/null +++ b/docs/reference/page-objects.rst @@ -0,0 +1,43 @@ +.. _page-object-api: + +=============== +Page object API +=============== + +Product +======= + +.. autoclass:: zyte_common_items.ProductPage(**kwargs) + :show-inheritance: + + +Product List +============ + +.. autoclass:: zyte_common_items.ProductListPage(**kwargs) + :show-inheritance: + + +Custom page objects +=================== + +Subclass :class:`~zyte_common_items.Page` to create your own page object +classes. + +.. autoclass:: zyte_common_items.Page(**kwargs) + :show-inheritance: + + .. data:: metadata + :type: zyte_common_items.Metadata + + Data extraction process metadata. + + :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current + date. + + :attr:`~zyte_common_items.Metadata.probability` is not set. + + .. data:: url + :type: web_poet.page_inputs.http.ResponseUrl + + Main URL from which the data has been extracted. 
diff --git a/docs/usage.rst b/docs/usage/items.rst similarity index 96% rename from docs/usage.rst rename to docs/usage/items.rst index 516acae5..a28deadc 100644 --- a/docs/usage.rst +++ b/docs/usage/items.rst @@ -1,7 +1,12 @@ +.. _items: + ===== -Usage +Items ===== +The :ref:`provided item classes <item-api>` can be used to map data extracted +from web pages, e.g. using :ref:`page objects <page-objects>`. + Creating items from dictionaries ================================ diff --git a/docs/usage/page-objects.rst b/docs/usage/page-objects.rst new file mode 100644 index 00000000..dfd1583c --- /dev/null +++ b/docs/usage/page-objects.rst @@ -0,0 +1,40 @@ +.. _page-objects: + +============ +Page objects +============ + +The :ref:`provided page object classes <page-object-api>` are good base classes +for custom page object classes that implement website-specific :doc:`page +objects `. + +They provide the following baseline: + +- They declare the :ref:`item class <item-api>` that they return, allowing for + their ``to_item`` method to automatically build an instance of it from + ``@field``-decorated methods. See :ref:`web-poet-fields`. + +- They provide a default implementation for their + :attr:`~zyte_common_items.Page.metadata` and + :attr:`~zyte_common_items.Page.url` fields. + +The following code shows a :class:`~zyte_common_items.ProductPage` subclass +whose ``to_item`` method returns an instance of +:class:`~zyte_common_items.Product` with +:attr:`~zyte_common_items.Product.metadata`, a +:attr:`~zyte_common_items.Product.name`, and a +:attr:`~zyte_common_items.Product.url`: + +.. 
code-block:: python + + import attrs + from web_poet import HttpResponse + from zyte_common_items import ProductPage + + @attrs.define + class CustomProductPage(ProductPage): + response: HttpResponse + + @field + def name(self): + return self.response.css("h1::text").get() diff --git a/tests/test_page_objects.py b/tests/test_page_objects.py new file mode 100644 index 00000000..dbe6bc01 --- /dev/null +++ b/tests/test_page_objects.py @@ -0,0 +1,64 @@ +from datetime import datetime + +import attrs +import pytest +from web_poet import field, HttpResponse, ResponseUrl + +from zyte_common_items import ProductPage, ProductListPage + + +@pytest.mark.parametrize( + "page_class", + ( + ProductPage, + ProductListPage, + ) +) +def test_default(page_class): + datetime_before = datetime.utcnow().replace(microsecond=0) + + page = page_class(url="https://example.com") + + assert page.metadata.probability is None + assert page.url == "https://example.com" + + page_datetime_string = page.metadata.dateDownloaded + assert page_datetime_string.endswith("Z") + page_datetime = datetime.fromisoformat(page_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= page_datetime <= datetime_after + + +@pytest.mark.asyncio +async def test_example(): + datetime_before = datetime.utcnow().replace(microsecond=0) + + @attrs.define + class BookPage(ProductPage): + response: HttpResponse + + @field + def name(self): + return self.response.css("h1::text").get() + + url = ResponseUrl("https://example.com/books/1") + html = b""" + + + +

Foo

+ + + """ + response = HttpResponse(url=url, body=html) + + item = await BookPage(url=url, response=response).to_item() + + assert item.url == str(url) + assert item.name == "Foo" + + item_datetime_string = item.metadata.dateDownloaded + assert item_datetime_string.endswith("Z") + item_datetime = datetime.fromisoformat(item_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= item_datetime <= datetime_after diff --git a/tox.ini b/tox.ini index 0b92512d..eb6b03cb 100644 --- a/tox.ini +++ b/tox.ini @@ -4,8 +4,10 @@ envlist = py37,py38,py39,py310,mypy [testenv] deps = pytest + pytest-asyncio pytest-cov - pytest-mypy-testing==0.0.11 + # https://github.com/davidfritzsche/pytest-mypy-testing/issues/35 + git+https://github.com/davidfritzsche/pytest-mypy-testing.git@031514ff6ecd5bdf4d11ff238c14d4801b5e47f3 setenv = PY_IGNORE_IMPORTMISMATCH=1 commands = diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index c958cc8c..41689e80 100644 --- a/zyte_common_items/__init__.py +++ b/zyte_common_items/__init__.py @@ -12,3 +12,4 @@ Metadata, ) from .items import Product, ProductFromList, ProductList, ProductVariant +from .page_objects import Page, ProductPage, ProductListPage diff --git a/zyte_common_items/base.py b/zyte_common_items/base.py index 313019a8..c2280103 100644 --- a/zyte_common_items/base.py +++ b/zyte_common_items/base.py @@ -6,19 +6,19 @@ from typing import get_args except ImportError: # Compliance with python 3.7 - from zyte_common_items.util import get_args + from .util import get_args try: from typing import get_origin except ImportError: # Compliance with python 3.7 - from zyte_common_items.util import get_origin + from .util import get_origin from typing import Dict, List, Optional, Union import attrs -from zyte_common_items.util import split_in_unknown_and_known_fields +from .util import split_in_unknown_and_known_fields def is_data_container(cls_or_obj): diff --git 
a/zyte_common_items/page_objects.py b/zyte_common_items/page_objects.py new file mode 100644 index 00000000..7ac476c3 --- /dev/null +++ b/zyte_common_items/page_objects.py @@ -0,0 +1,31 @@ +from datetime import datetime + +import attrs +from web_poet import field, ItemPage, ResponseUrl, Returns + +from .components import Metadata +from .items import Product, ProductList + + +@attrs.define +class Page(ItemPage): + _url: ResponseUrl + + @field + def metadata(self) -> Metadata: + """…""" + return Metadata( + dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", + ) + + @field + def url(self) -> ResponseUrl: + return self._url + + +class ProductPage(Page, Returns[Product]): + pass + + +class ProductListPage(Page, Returns[ProductList]): + pass From 1503a5f116f928f8605204efbfaccf150f6eb3b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 00:24:21 +0100 Subject: [PATCH 05/17] Run pre-commit --- tests/test_page_objects.py | 6 +++--- zyte_common_items/__init__.py | 2 +- zyte_common_items/page_objects.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_page_objects.py b/tests/test_page_objects.py index dbe6bc01..c89414bf 100644 --- a/tests/test_page_objects.py +++ b/tests/test_page_objects.py @@ -2,9 +2,9 @@ import attrs import pytest -from web_poet import field, HttpResponse, ResponseUrl +from web_poet import HttpResponse, ResponseUrl, field -from zyte_common_items import ProductPage, ProductListPage +from zyte_common_items import ProductListPage, ProductPage @pytest.mark.parametrize( @@ -12,7 +12,7 @@ ( ProductPage, ProductListPage, - ) + ), ) def test_default(page_class): datetime_before = datetime.utcnow().replace(microsecond=0) diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index 41689e80..b0d92e39 100644 --- a/zyte_common_items/__init__.py +++ b/zyte_common_items/__init__.py @@ -12,4 +12,4 @@ Metadata, ) from .items import Product, ProductFromList, 
ProductList, ProductVariant -from .page_objects import Page, ProductPage, ProductListPage +from .page_objects import Page, ProductListPage, ProductPage diff --git a/zyte_common_items/page_objects.py b/zyte_common_items/page_objects.py index 7ac476c3..ea60bbdc 100644 --- a/zyte_common_items/page_objects.py +++ b/zyte_common_items/page_objects.py @@ -1,7 +1,7 @@ from datetime import datetime import attrs -from web_poet import field, ItemPage, ResponseUrl, Returns +from web_poet import ItemPage, ResponseUrl, Returns, field from .components import Metadata from .items import Product, ProductList From 89e7b238d1c20edaeb38916b03ac6180f934e28e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 10:12:44 +0100 Subject: [PATCH 06/17] =?UTF-8?q?Fix=20typo:=20url=20=E2=86=92=20=5Furl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Lloyd Bernal --- tests/test_page_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_page_objects.py b/tests/test_page_objects.py index c89414bf..0a07f25a 100644 --- a/tests/test_page_objects.py +++ b/tests/test_page_objects.py @@ -52,7 +52,7 @@ def name(self): """ response = HttpResponse(url=url, body=html) - item = await BookPage(url=url, response=response).to_item() + item = await BookPage(_url=url, response=response).to_item() assert item.url == str(url) assert item.name == "Foo" From c63838f491aef929f0306842f06c913ee80e1ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 10:15:36 +0100 Subject: [PATCH 07/17] Remove leftover docstring --- docs/reference/page-objects.rst | 2 +- zyte_common_items/page_objects.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/reference/page-objects.rst b/docs/reference/page-objects.rst index a5ac5c64..c19eb25d 100644 --- a/docs/reference/page-objects.rst +++ b/docs/reference/page-objects.rst @@ -33,7 +33,7 @@ classes. 
Data extraction process metadata. :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current - date. + UTC date and time. :attr:`~zyte_common_items.Metadata.probability` is not set. diff --git a/zyte_common_items/page_objects.py b/zyte_common_items/page_objects.py index ea60bbdc..1ea32bf2 100644 --- a/zyte_common_items/page_objects.py +++ b/zyte_common_items/page_objects.py @@ -13,7 +13,6 @@ class Page(ItemPage): @field def metadata(self) -> Metadata: - """…""" return Metadata( dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", ) From 9c1b6c70dceb47db7fe7b54e0ad5c4a9d540e60b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 10:20:09 +0100 Subject: [PATCH 08/17] =?UTF-8?q?Revert=20"Fix=20typo:=20url=20=E2=86=92?= =?UTF-8?q?=20=5Furl"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 89e7b238d1c20edaeb38916b03ac6180f934e28e. --- tests/test_page_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_page_objects.py b/tests/test_page_objects.py index 0a07f25a..c89414bf 100644 --- a/tests/test_page_objects.py +++ b/tests/test_page_objects.py @@ -52,7 +52,7 @@ def name(self): """ response = HttpResponse(url=url, body=html) - item = await BookPage(_url=url, response=response).to_item() + item = await BookPage(url=url, response=response).to_item() assert item.url == str(url) assert item.name == "Foo" From 139b32d14ecd8ba4021086a02c9a2d0d0657f04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 2 Nov 2022 20:47:59 +0100 Subject: [PATCH 09/17] Set metadata.probability to 1.0 --- docs/reference/page-objects.rst | 2 +- tests/test_page_objects.py | 3 ++- zyte_common_items/page_objects.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/reference/page-objects.rst b/docs/reference/page-objects.rst index c19eb25d..389204f9 100644 --- a/docs/reference/page-objects.rst +++ 
b/docs/reference/page-objects.rst @@ -35,7 +35,7 @@ classes. :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current UTC date and time. - :attr:`~zyte_common_items.Metadata.probability` is not set. + :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. .. data:: url :type: web_poet.page_inputs.http.ResponseUrl diff --git a/tests/test_page_objects.py b/tests/test_page_objects.py index c89414bf..5c37c515 100644 --- a/tests/test_page_objects.py +++ b/tests/test_page_objects.py @@ -19,7 +19,7 @@ def test_default(page_class): page = page_class(url="https://example.com") - assert page.metadata.probability is None + assert page.metadata.probability == 1.0 assert page.url == "https://example.com" page_datetime_string = page.metadata.dateDownloaded @@ -56,6 +56,7 @@ def name(self): assert item.url == str(url) assert item.name == "Foo" + assert item.metadata.probability == 1.0 item_datetime_string = item.metadata.dateDownloaded assert item_datetime_string.endswith("Z") diff --git a/zyte_common_items/page_objects.py b/zyte_common_items/page_objects.py index 1ea32bf2..ce7bf280 100644 --- a/zyte_common_items/page_objects.py +++ b/zyte_common_items/page_objects.py @@ -15,6 +15,7 @@ class Page(ItemPage): def metadata(self) -> Metadata: return Metadata( dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", + probability=1.0, ) @field From e42bb8f8cac8b01bf6f58c73bf0831dcc051fbec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 09:02:44 +0100 Subject: [PATCH 10/17] =?UTF-8?q?page=5Fobjects=20=E2=86=92=20pages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_common_items/__init__.py | 2 +- zyte_common_items/{page_objects.py => pages.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename zyte_common_items/{page_objects.py => pages.py} (100%) diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index b0d92e39..11cc0a65 
100644 --- a/zyte_common_items/__init__.py +++ b/zyte_common_items/__init__.py @@ -12,4 +12,4 @@ Metadata, ) from .items import Product, ProductFromList, ProductList, ProductVariant -from .page_objects import Page, ProductListPage, ProductPage +from .pages import Page, ProductListPage, ProductPage diff --git a/zyte_common_items/page_objects.py b/zyte_common_items/pages.py similarity index 100% rename from zyte_common_items/page_objects.py rename to zyte_common_items/pages.py From b19b833014c4b578b5dda4e9a481aa4e860f87f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 09:15:27 +0100 Subject: [PATCH 11/17] Freeze the version of mypy also in regular tox environments --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index eb6b03cb..07340838 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,7 @@ deps = pytest-cov # https://github.com/davidfritzsche/pytest-mypy-testing/issues/35 git+https://github.com/davidfritzsche/pytest-mypy-testing.git@031514ff6ecd5bdf4d11ff238c14d4801b5e47f3 + mypy==0.971 setenv = PY_IGNORE_IMPORTMISMATCH=1 commands = From a14a9bdc809c5263b6ff95e70696c3e20dbba3ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 09:54:24 +0100 Subject: [PATCH 12/17] Add WebPage-based page object classes --- docs/index.rst | 2 +- docs/reference/index.rst | 2 +- docs/reference/page-objects.rst | 43 ----------- docs/reference/pages.rst | 76 +++++++++++++++++++ docs/usage/{page-objects.rst => pages.rst} | 2 - tests/{test_page_objects.py => test_pages.py} | 50 ++++++++++-- zyte_common_items/__init__.py | 9 ++- zyte_common_items/pages.py | 30 +++++++- 8 files changed, 156 insertions(+), 58 deletions(-) delete mode 100644 docs/reference/page-objects.rst create mode 100644 docs/reference/pages.rst rename docs/usage/{page-objects.rst => pages.rst} (94%) rename tests/{test_page_objects.py => test_pages.py} (56%) diff --git a/docs/index.rst b/docs/index.rst index 
a6766a8c..79cda06f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,7 +17,7 @@ zyte-common-items |version| documentation :maxdepth: 1 usage/items - usage/page-objects + usage/pages .. toctree:: :caption: Reference diff --git a/docs/reference/index.rst b/docs/reference/index.rst index a3535a05..482b7e91 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -5,6 +5,6 @@ Reference .. toctree:: items - page-objects + pages components adapter diff --git a/docs/reference/page-objects.rst b/docs/reference/page-objects.rst deleted file mode 100644 index 389204f9..00000000 --- a/docs/reference/page-objects.rst +++ /dev/null @@ -1,43 +0,0 @@ -.. _page-object-api: - -=============== -Page object API -=============== - -Product -======= - -.. autoclass:: zyte_common_items.ProductPage(**kwargs) - :show-inheritance: - - -Product List -============ - -.. autoclass:: zyte_common_items.ProductListPage(**kwargs) - :show-inheritance: - - -Custom page objects -=================== - -Subclass :class:`~zyte_common_items.Page` to create your own page object -classes. - -.. autoclass:: zyte_common_items.Page(**kwargs) - :show-inheritance: - - .. data:: metadata - :type: zyte_common_items.Metadata - - Data extraction process metadata. - - :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current - UTC date and time. - - :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. - - .. data:: url - :type: web_poet.page_inputs.http.ResponseUrl - - Main URL from which the data has been extracted. diff --git a/docs/reference/pages.rst b/docs/reference/pages.rst new file mode 100644 index 00000000..d2cdccf4 --- /dev/null +++ b/docs/reference/pages.rst @@ -0,0 +1,76 @@ +.. _page-object-api: + +=============== +Page object API +=============== + +Product +======= + +.. autoclass:: zyte_common_items.BaseProductPage(**kwargs) + :show-inheritance: + +.. 
autoclass:: zyte_common_items.ProductPage(**kwargs) + :show-inheritance: + + +Product List +============ + +.. autoclass:: zyte_common_items.BaseProductListPage(**kwargs) + :show-inheritance: + +.. autoclass:: zyte_common_items.ProductListPage(**kwargs) + :show-inheritance: + + +Custom page objects +=================== + +Subclass :class:`~zyte_common_items.Page` to create your own page object +classes that rely on :class:`~web_poet.page_inputs.http.HttpResponse`. + +If you do not want :class:`~web_poet.page_inputs.http.HttpResponse` as input, you can +inherit from :class:`~zyte_common_items.BasePage` instead. + +.. autoclass:: zyte_common_items.BasePage(**kwargs) + :show-inheritance: + + Base class for page object classes that have + :class:`~web_poet.page_inputs.http.ResponseUrl` as a dependency. + + .. data:: metadata + :type: zyte_common_items.Metadata + + Data extraction process metadata. + + :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current + UTC date and time. + + :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. + + .. data:: url + :type: str + + Main URL from which the data has been extracted. + +.. autoclass:: zyte_common_items.Page(**kwargs) + :show-inheritance: + + Base class for page object classes that have + :class:`~web_poet.page_inputs.http.HttpResponse` as a dependency. + + .. data:: metadata + :type: zyte_common_items.Metadata + + Data extraction process metadata. + + :attr:`~zyte_common_items.Metadata.dateDownloaded` is set to the current + UTC date and time. + + :attr:`~zyte_common_items.Metadata.probability` is set to ``1.0``. + + .. data:: url + :type: str + + Main URL from which the data has been extracted. 
diff --git a/docs/usage/page-objects.rst b/docs/usage/pages.rst similarity index 94% rename from docs/usage/page-objects.rst rename to docs/usage/pages.rst index dfd1583c..8b0885a7 100644 --- a/docs/usage/page-objects.rst +++ b/docs/usage/pages.rst @@ -28,12 +28,10 @@ whose ``to_item`` method returns an instance of .. code-block:: python import attrs - from web_poet import HttpResponse from zyte_common_items import ProductPage @attrs.define class CustomProductPage(ProductPage): - response: HttpResponse @field def name(self): diff --git a/tests/test_page_objects.py b/tests/test_pages.py similarity index 56% rename from tests/test_page_objects.py rename to tests/test_pages.py index 5c37c515..1fa16d17 100644 --- a/tests/test_page_objects.py +++ b/tests/test_pages.py @@ -4,7 +4,35 @@ import pytest from web_poet import HttpResponse, ResponseUrl, field -from zyte_common_items import ProductListPage, ProductPage +from zyte_common_items import ( + BaseProductListPage, + BaseProductPage, + ProductListPage, + ProductPage, +) + + +@pytest.mark.parametrize( + "page_class", + ( + BaseProductPage, + BaseProductListPage, + ), +) +def test_base_pages_default(page_class): + datetime_before = datetime.utcnow().replace(microsecond=0) + + page = page_class(url=ResponseUrl("https://example.com")) + + assert page.metadata.probability == 1.0 + assert page.url == "https://example.com" + assert isinstance(page.url, str) + + page_datetime_string = page.metadata.dateDownloaded + assert page_datetime_string.endswith("Z") + page_datetime = datetime.fromisoformat(page_datetime_string[:-1]) + datetime_after = datetime.utcnow().replace(microsecond=0) + assert datetime_before <= page_datetime <= datetime_after @pytest.mark.parametrize( @@ -14,13 +42,25 @@ ProductListPage, ), ) -def test_default(page_class): +def test_pages_default(page_class): datetime_before = datetime.utcnow().replace(microsecond=0) - page = page_class(url="https://example.com") + url = ResponseUrl("https://example.com") + html 
= b""" + + + +

Foo

+ + + """ + response = HttpResponse(url=url, body=html) + + page = page_class(response=response) assert page.metadata.probability == 1.0 assert page.url == "https://example.com" + assert isinstance(page.url, str) page_datetime_string = page.metadata.dateDownloaded assert page_datetime_string.endswith("Z") @@ -35,8 +75,6 @@ async def test_example(): @attrs.define class BookPage(ProductPage): - response: HttpResponse - @field def name(self): return self.response.css("h1::text").get() @@ -52,7 +90,7 @@ def name(self): """ response = HttpResponse(url=url, body=html) - item = await BookPage(url=url, response=response).to_item() + item = await BookPage(response=response).to_item() assert item.url == str(url) assert item.name == "Foo" diff --git a/zyte_common_items/__init__.py b/zyte_common_items/__init__.py index 11cc0a65..c389a534 100644 --- a/zyte_common_items/__init__.py +++ b/zyte_common_items/__init__.py @@ -12,4 +12,11 @@ Metadata, ) from .items import Product, ProductFromList, ProductList, ProductVariant -from .pages import Page, ProductListPage, ProductPage +from .pages import ( + BasePage, + BaseProductListPage, + BaseProductPage, + Page, + ProductListPage, + ProductPage, +) diff --git a/zyte_common_items/pages.py b/zyte_common_items/pages.py index ce7bf280..e37982e2 100644 --- a/zyte_common_items/pages.py +++ b/zyte_common_items/pages.py @@ -1,14 +1,14 @@ from datetime import datetime import attrs -from web_poet import ItemPage, ResponseUrl, Returns, field +from web_poet import ItemPage, ResponseUrl, Returns, WebPage, field from .components import Metadata from .items import Product, ProductList @attrs.define -class Page(ItemPage): +class BasePage(ItemPage): _url: ResponseUrl @field @@ -19,8 +19,30 @@ def metadata(self) -> Metadata: ) @field - def url(self) -> ResponseUrl: - return self._url + def url(self) -> str: + return str(self._url) + + +class BaseProductPage(BasePage, Returns[Product]): + pass + + +class BaseProductListPage(BasePage, 
Returns[ProductList]): + pass + + +@attrs.define +class Page(WebPage): + @field + def metadata(self) -> Metadata: + return Metadata( + dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", + probability=1.0, + ) + + @field + def url(self) -> str: + return str(self.response.url) class ProductPage(Page, Returns[Product]): From c357334ff4d2cbc4898c52f7d639416c853f8de4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 11:58:55 +0100 Subject: [PATCH 13/17] =?UTF-8?q?.response.css=20=E2=86=92=20.css?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/usage/pages.rst | 2 +- tests/test_pages.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage/pages.rst b/docs/usage/pages.rst index 8b0885a7..c9a43b2b 100644 --- a/docs/usage/pages.rst +++ b/docs/usage/pages.rst @@ -35,4 +35,4 @@ whose ``to_item`` method returns an instance of @field def name(self): - return self.response.css("h1::text").get() + return self.css("h1::text").get() diff --git a/tests/test_pages.py b/tests/test_pages.py index 1fa16d17..d9163ae9 100644 --- a/tests/test_pages.py +++ b/tests/test_pages.py @@ -77,7 +77,7 @@ async def test_example(): class BookPage(ProductPage): @field def name(self): - return self.response.css("h1::text").get() + return self.css("h1::text").get() url = ResponseUrl("https://example.com/books/1") html = b""" From ee364f263bcea849e957881983abf9fdd06acf52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 9 Nov 2022 12:02:03 +0100 Subject: [PATCH 14/17] tox.ini: run docs by default --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 07340838..aa62e981 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,mypy +envlist = py37,py38,py39,py310,mypy,docs [testenv] deps = From 50144c391c8d0f7fbd243eaac5b897e68dce42fc Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 22 Nov 2022 06:50:32 +0100 Subject: [PATCH 15/17] Remove unneeded @attrs.define from the docs. Co-authored-by: Kevin Lloyd Bernal --- docs/usage/pages.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/usage/pages.rst b/docs/usage/pages.rst index c9a43b2b..749f4638 100644 --- a/docs/usage/pages.rst +++ b/docs/usage/pages.rst @@ -30,7 +30,6 @@ whose ``to_item`` method returns an instance of import attrs from zyte_common_items import ProductPage - @attrs.define class CustomProductPage(ProductPage): @field From 9adfb7461b7d4689184a60712d205dca6cd15bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Sun, 27 Nov 2022 11:58:50 +0100 Subject: [PATCH 16/17] Use a mixin to avoid code repetition --- setup.py | 3 ++- zyte_common_items/pages.py | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index c35236d9..34e0404f 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ install_requires=[ "attrs>=21.3.0", "itemadapter>=0.2.0", - "web-poet>=0.5.0", + # https://github.com/scrapinghub/web-poet/pull/109 + "web-poet @ git+https://github.com/Gallaecio/web-poet.git@mixin-support", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/zyte_common_items/pages.py b/zyte_common_items/pages.py index e37982e2..6db8b9c2 100644 --- a/zyte_common_items/pages.py +++ b/zyte_common_items/pages.py @@ -7,9 +7,9 @@ from .items import Product, ProductList -@attrs.define -class BasePage(ItemPage): - _url: ResponseUrl +class _BaseMixin: + def _get_response_url(self): + raise NotImplementedError @field def metadata(self) -> Metadata: @@ -20,7 +20,15 @@ def metadata(self) -> Metadata: @field def url(self) -> str: - return str(self._url) + return str(self._get_response_url()) + + +@attrs.define +class BasePage(_BaseMixin, ItemPage): + _url: ResponseUrl + + def _get_response_url(self): + return self._url class BaseProductPage(BasePage, 
Returns[Product]): @@ -32,17 +40,9 @@ class BaseProductListPage(BasePage, Returns[ProductList]): @attrs.define -class Page(WebPage): - @field - def metadata(self) -> Metadata: - return Metadata( - dateDownloaded=f"{datetime.utcnow().isoformat(timespec='seconds')}Z", - probability=1.0, - ) - - @field - def url(self) -> str: - return str(self.response.url) +class Page(_BaseMixin, WebPage): + def _get_response_url(self): + return self.response.url class ProductPage(Page, Returns[Product]): From cbfda26a1e83d822352fd2a2be8f09907998de48 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 24 Jan 2023 19:23:57 +0800 Subject: [PATCH 17/17] use web_poet.Unset --- CHANGELOG.rst | 6 ++ docs/usage/items.rst | 1 + setup.py | 3 +- tests/test_adapter.py | 3 +- tests/test_components.py | 6 +- tests/test_items.py | 9 ++- tests/test_mypy.py | 16 ++-- zyte_common_items/base.py | 8 +- zyte_common_items/components.py | 25 +++--- zyte_common_items/items.py | 131 ++++++++++++++++---------------- zyte_common_items/util.py | 5 +- 11 files changed, 116 insertions(+), 97 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1ad4fc80..9ee7a40f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,12 @@ Changelog ========= +TBD +=== + +* Use the ``web_poet.Unset`` sentinel value to represent fields which haven't been + assigned any value. This distinguishes them from fields whose value is ``None``.
+ 0.2.0 (2022-09-22) ================== diff --git a/docs/usage/items.rst b/docs/usage/items.rst index a28deadc..1298df40 100644 --- a/docs/usage/items.rst +++ b/docs/usage/items.rst @@ -36,6 +36,7 @@ nested data, such as :class:`~zyte_common_items.components.Image` and >>> product.mainImage Image(url='https://example.com/image.png') >>> product.canonicalUrl +Unset >>> product.gtin [Gtin(type='gtin13', value='9504000059446')] diff --git a/setup.py b/setup.py index 34e0404f..5f7ffa9e 100644 --- a/setup.py +++ b/setup.py @@ -25,8 +25,7 @@ install_requires=[ "attrs>=21.3.0", "itemadapter>=0.2.0", - # https://github.com/scrapinghub/web-poet/pull/109 - "web-poet @ git+https://github.com/Gallaecio/web-poet.git@mixin-support", + "web-poet @ git+https://git@github.com/scrapinghub/web-poet@feat-unset#egg=web-poet", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_adapter.py b/tests/test_adapter.py index c251603a..37573850 100644 --- a/tests/test_adapter.py +++ b/tests/test_adapter.py @@ -9,6 +9,7 @@ import attrs import pytest from itemadapter import ItemAdapter +from web_poet import Unset from zyte_common_items import Item, Product, ZyteItemAdapter @@ -202,7 +203,7 @@ def test_known_field_get_missing(): product = Product(url=url) with configured_adapter(): adapter = ItemAdapter(product) - assert adapter["canonicalUrl"] is None + assert adapter["canonicalUrl"] is Unset def test_known_field_set(): diff --git a/tests/test_components.py b/tests/test_components.py index 62582f36..5d4fce24 100644 --- a/tests/test_components.py +++ b/tests/test_components.py @@ -1,3 +1,5 @@ +from web_poet import Unset + from zyte_common_items import AggregateRating, Breadcrumb, Link, Metadata @@ -19,5 +21,5 @@ def test_link_optional_fields(): def test_metadata_default_values(): metadata = Metadata() - assert metadata.dateDownloaded is None - assert metadata.probability is None + assert metadata.dateDownloaded is Unset + assert metadata.probability is Unset diff --git 
a/tests/test_items.py b/tests/test_items.py index fe00ecf3..7262e7b1 100644 --- a/tests/test_items.py +++ b/tests/test_items.py @@ -1,6 +1,7 @@ from copy import copy import pytest +from web_poet import Unset from zyte_common_items import ( AdditionalProperty, @@ -119,7 +120,7 @@ def test_product_min_fields(): for field in list(_PRODUCT_ALL_KWARGS): if field in _PRODUCT_MIN_KWARGS: continue - assert getattr(product, field) is None + assert getattr(product, field) is Unset def test_product_missing_fields(): @@ -141,7 +142,7 @@ def test_product_list_min_fields(): for field in list(_PRODUCT_LIST_MIN_KWARGS): if field in _PRODUCT_LIST_MIN_KWARGS: continue - assert getattr(product_list, field) is None + assert getattr(product_list, field) is Unset def test_product_list_missing_fields(): @@ -161,7 +162,7 @@ def test_product_from_list_all_fields(): def test_product_from_list_min_fields(): product_from_list = ProductFromList() for field in list(_PRODUCT_FROM_LIST_ALL_KWARGS): - assert getattr(product_from_list, field) is None + assert getattr(product_from_list, field) is Unset def test_product_variant_all_fields(): @@ -173,4 +174,4 @@ def test_product_variant_all_fields(): def test_product_variant_min_fields(): product_variant = ProductVariant() for field in list(_PRODUCT_VARIANT_ALL_KWARGS): - assert getattr(product_variant, field) is None + assert getattr(product_variant, field) is Unset diff --git a/tests/test_mypy.py b/tests/test_mypy.py index 8e572bdd..af03bc7e 100644 --- a/tests/test_mypy.py +++ b/tests/test_mypy.py @@ -187,28 +187,28 @@ def test_assignment_product_variant(): @pytest.mark.mypy_testing def test_instantiation_image(): with pytest.raises(ValueError): - Image(url=123) # E: Argument "url" to "Image" has incompatible type "int"; expected "Union[str, _Url]" + Image(url=123) # E: Argument "url" to "Image" has incompatible type "int"; expected "Union[str, _Url, UnsetType]" @pytest.mark.mypy_testing def test_instantiation_breadcrumb(): with 
pytest.raises(ValueError): - Breadcrumb(url=123) # E: Argument "url" to "Breadcrumb" has incompatible type "int"; expected "Union[str, _Url, None]" + Breadcrumb(url=123) # E: Argument "url" to "Breadcrumb" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" @pytest.mark.mypy_testing def test_instantiation_link(): with pytest.raises(ValueError): - Link(url=123) # E: Argument "url" to "Link" has incompatible type "int"; expected "Union[str, _Url, None]" + Link(url=123) # E: Argument "url" to "Link" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" @pytest.mark.mypy_testing def test_instantiation_product_list(): with pytest.raises(ValueError): - ProductList(url=123) # E: Argument "url" to "ProductList" has incompatible type "int"; expected "Union[str, _Url]" + ProductList(url=123) # E: Argument "url" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, UnsetType]" with pytest.raises(ValueError): ProductList( - url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, None]" + url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductList" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) @@ -216,7 +216,7 @@ def test_instantiation_product_list(): def test_instantiation_product_from_list(): with pytest.raises(ValueError): ProductFromList( - url=123 # E: Argument "url" to "ProductFromList" has incompatible type "int"; expected "Union[str, _Url, None]" + url=123 # E: Argument "url" to "ProductFromList" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) @@ -224,9 +224,9 @@ def test_instantiation_product_from_list(): def test_instantiation_product_variant(): with pytest.raises(ValueError): ProductVariant( - url=123 # E: Argument "url" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, None]" + url=123 # E: Argument 
"url" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) with pytest.raises(ValueError): ProductVariant( - url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, None]" + url="https://www.example.com", canonicalUrl=123 # E: Argument "canonicalUrl" to "ProductVariant" has incompatible type "int"; expected "Union[str, _Url, UnsetType, None]" ) diff --git a/zyte_common_items/base.py b/zyte_common_items/base.py index c2280103..ef74b6d7 100644 --- a/zyte_common_items/base.py +++ b/zyte_common_items/base.py @@ -17,6 +17,7 @@ from typing import Dict, List, Optional, Union import attrs +from web_poet import UnsetType from .util import split_in_unknown_and_known_fields @@ -88,9 +89,12 @@ def _apply_field_types_to_sub_fields(cls, item: Dict): origin = get_origin(type_annotation) if origin == Union: field_classes = get_args(type_annotation) - if len(field_classes) != 2 or not isinstance(None, field_classes[1]): + if len(field_classes) != 2 and not {UnsetType, type(None)} <= set( + field_classes + ): raise ValueError( - "Field should only be annotated with one type (or optional)." + "Field should only be annotated with one type " + "(or either None or Unset)." ) type_annotation = field_classes[0] origin = get_origin(type_annotation) diff --git a/zyte_common_items/components.py b/zyte_common_items/components.py index 1ec481f2..b4c5af55 100644 --- a/zyte_common_items/components.py +++ b/zyte_common_items/components.py @@ -1,8 +1,9 @@ """Classes for data nested within items.""" -from typing import Optional +from typing import Union import attrs +from web_poet import Unset, UnsetType from zyte_common_items.base import Item from zyte_common_items.util import url_to_str @@ -34,13 +35,13 @@ class AggregateRating(Item): """ #: Maximum value of the rating system. 
- bestRating: Optional[float] = None + bestRating: Union[float, None, UnsetType] = Unset #: Average value of all ratings. - ratingValue: Optional[float] = None + ratingValue: Union[float, None, UnsetType] = Unset #: Review count. - reviewCount: Optional[int] = None + reviewCount: Union[int, None, UnsetType] = Unset @attrs.define @@ -65,11 +66,11 @@ class Breadcrumb(Item): """ #: Displayed name. - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Target URL. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -119,11 +120,11 @@ class Link(Item): """A link from a webpage to another webpage.""" #: Displayed text. - text: Optional[str] = None + text: Union[str, None, UnsetType] = Unset #: Target URL. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -136,7 +137,7 @@ class Metadata(Item): #: Date and time when the product data was downloaded, in UTC timezone and #: the following format: ``YYYY-MM-DDThh:mm:ssZ``. - dateDownloaded: Optional[str] = None + dateDownloaded: Union[str, None, UnsetType] = Unset #: The probability (0 for 0%, 1 for 100%) that the webpage features the #: requested data type. @@ -147,4 +148,4 @@ class Metadata(Item): #: webpage features a job listing instead of a product, the value should be #: `0`. When there is no complete certainty, the value could be anything in #: between (e.g. `0.96`). 
- probability: Optional[float] = None + probability: Union[float, None, UnsetType] = Unset diff --git a/zyte_common_items/items.py b/zyte_common_items/items.py index 37a8ad9e..f7f45f9a 100644 --- a/zyte_common_items/items.py +++ b/zyte_common_items/items.py @@ -1,6 +1,7 @@ -from typing import List, Optional +from typing import List, Union import attrs +from web_poet import Unset, UnsetType from zyte_common_items.base import Item from zyte_common_items.components import ( @@ -33,18 +34,18 @@ class ProductVariant(Item): #: extracted. #: #: See also ``features``. - additionalProperties: Optional[List[AdditionalProperty]] = None + additionalProperties: Union[List[AdditionalProperty], None, UnsetType] = Unset #: Availability status. #: #: The value is expected to be one of: ``"InStock"``, ``"OutOfStock"``. - availability: Optional[str] = None + availability: Union[str, None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. - canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Color. @@ -52,20 +53,20 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"white"``). #: #: See also ``size``, ``style``. - color: Optional[str] = None + color: Union[str, None, UnsetType] = Unset #: Price currency `ISO 4217`_ alphabetic code (e.g. ``"USD"``). #: #: See also ``currencyRaw``. #: #: .. _ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``.
- currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: List of standardized GTIN_ product identifiers associated with the #: product, which are unique for the product across different sellers. @@ -73,17 +74,17 @@ class ProductVariant(Item): #: See also: ``mpn``, ``productId``, ``sku``. #: #: .. _GTIN: https://en.wikipedia.org/wiki/Global_Trade_Item_Number - gtin: Optional[List[Gtin]] = None + gtin: Union[List[Gtin], None, UnsetType] = Unset #: All product images. #: #: The main image (see ``mainImage``) should be first in the list. #: #: Images only displayed as part of the product description are excluded. - images: Optional[List[Image]] = None + images: Union[List[Image], None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: `Manufacturer part number (MPN)`_. #: @@ -92,10 +93,10 @@ class ProductVariant(Item): #: See also: ``gtin``, ``productId``, ``sku``. #: #: .. _Manufacturer part number (MPN): https://en.wikipedia.org/wiki/Part_number - mpn: Optional[str] = None + mpn: Union[str, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -105,7 +106,7 @@ class ProductVariant(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset #: Product identifier, unique within an e-commerce website. #: @@ -113,7 +114,7 @@ class ProductVariant(Item): #: even a URL. #: #: See also: ``gtin``, ``mpn``, ``sku``. - productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. 
@@ -125,7 +126,7 @@ class ProductVariant(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Size or dimensions. #: @@ -134,7 +135,7 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"XL"``). #: #: See also ``color``, ``style``. - size: Optional[str] = None + size: Union[str, None, UnsetType] = Unset #: `Stock keeping unit (SKU)`_ identifier, i.e. a merchant-specific product #: identifier. @@ -142,7 +143,7 @@ class ProductVariant(Item): #: See also: ``gtin``, ``mpn``, ``productId``. #: #: .. _Stock keeping unit (SKU): https://en.wikipedia.org/wiki/Stock_keeping_unit - sku: Optional[str] = None + sku: Union[str, None, UnsetType] = Unset #: Style. #: @@ -151,13 +152,13 @@ class ProductVariant(Item): #: It is extracted as displayed (e.g. ``"polka dots"``). #: #: See also ``color``, ``size``. - style: Optional[str] = None + style: Union[str, None, UnsetType] = Unset #: Main URL from which the product variant data could be extracted. #: #: See also ``canonicalUrl``. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -179,29 +180,29 @@ class Product(Item): #: extracted. #: #: See also ``features``. - additionalProperties: Optional[List[AdditionalProperty]] = None + additionalProperties: Union[List[AdditionalProperty], None, UnsetType] = Unset #: Aggregate data about reviews and ratings. - aggregateRating: Optional[AggregateRating] = None + aggregateRating: Union[AggregateRating, None, UnsetType] = Unset #: Availability status. #: #: The value is expected to be one of: ``"InStock"``, ``"OutOfStock"``. - availability: Optional[str] = None + availability: Union[str, None, UnsetType] = Unset #: Brand. 
- brand: Optional[Brand] = None + brand: Union[Brand, None, UnsetType] = Unset #: Webpage `breadcrumb trail`_. #: #: .. _Breadcrumb trail: https://en.wikipedia.org/wiki/Breadcrumb_navigation - breadcrumbs: Optional[List[Breadcrumb]] = None + breadcrumbs: Union[List[Breadcrumb], None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. - canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Color. @@ -209,20 +210,20 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"white"``). #: #: See also ``size``, ``style``. - color: Optional[str] = None + color: Union[str, None, UnsetType] = Unset #: Price currency `ISO 4217`_ alphabetic code (e.g. ``"USD"``). #: #: See also ``currencyRaw``. #: #: .. _ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``. - currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: Plain-text description. #: @@ -243,7 +244,7 @@ class Product(Item): #: - There should be no whitespace at the beginning or end. #: #: See also ``descriptionHtml``. - description: Optional[str] = None + description: Union[str, None, UnsetType] = Unset #: HTML description. #: @@ -253,14 +254,14 @@ class Product(Item): #: normalization specification`_ for details. #: #: .. _HTML normalization specification: https://docs.zyte.com/automatic-extraction/article.html#format-of-articlebodyhtml-field - descriptionHtml: Optional[str] = None + descriptionHtml: Union[str, None, UnsetType] = Unset #: List of features. #: #: They are usually listed as bullet points in product webpages. 
#: #: See also ``additionalProperties``. - features: Optional[List[str]] = None + features: Union[List[str], None, UnsetType] = Unset #: List of standardized GTIN_ product identifiers associated with the #: product, which are unique for the product across different sellers. @@ -268,20 +269,20 @@ class Product(Item): #: See also: ``mpn``, ``productId``, ``sku``. #: #: .. _GTIN: https://en.wikipedia.org/wiki/Global_Trade_Item_Number - gtin: Optional[List[Gtin]] = None + gtin: Union[List[Gtin], None, UnsetType] = Unset #: All product images. #: #: The main image (see ``mainImage``) should be first in the list. #: #: Images only displayed as part of the product description are excluded. - images: Optional[List[Image]] = None + images: Union[List[Image], None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: Data extraction process metadata. - metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: `Manufacturer part number (MPN)`_. #: @@ -290,10 +291,10 @@ class Product(Item): #: See also: ``gtin``, ``productId``, ``sku``. #: #: .. _Manufacturer part number (MPN): https://en.wikipedia.org/wiki/Part_number - mpn: Optional[str] = None + mpn: Union[str, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -303,7 +304,7 @@ class Product(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset # Redefined to extend the documentation. #: Product identifier, unique within an e-commerce website. @@ -312,7 +313,7 @@ class Product(Item): #: even a URL. #: #: See also: ``gtin``, ``mpn``, ``sku``. 
- productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. @@ -324,7 +325,7 @@ class Product(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Size or dimensions. #: @@ -333,7 +334,7 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"XL"``). #: #: See also ``color``, ``style``. - size: Optional[str] = None + size: Union[str, None, UnsetType] = Unset #: `Stock keeping unit (SKU)`_ identifier, i.e. a merchant-specific product #: identifier. @@ -341,7 +342,7 @@ class Product(Item): #: See also: ``gtin``, ``mpn``, ``productId``. #: #: .. _Stock keeping unit (SKU): https://en.wikipedia.org/wiki/Stock_keeping_unit - sku: Optional[str] = None + sku: Union[str, None, UnsetType] = Unset #: Style. #: @@ -350,7 +351,7 @@ class Product(Item): #: It is extracted as displayed (e.g. ``"polka dots"``). #: #: See also ``color``, ``size``. - style: Optional[str] = None + style: Union[str, None, UnsetType] = Unset #: Main URL from which the data has been extracted. #: @@ -382,7 +383,7 @@ class Product(Item): #: #: Product variant details may not include those that require multiple #: additional requests (e.g. 1 or more requests per variant). - variants: Optional[List[ProductVariant]] = None + variants: Union[List[ProductVariant], None, UnsetType] = Unset @attrs.define(slots=True, kw_only=True) @@ -398,22 +399,22 @@ class ProductFromList(Item): #: See also ``currencyRaw``. #: #: .. _ISO 4217: https://en.wikipedia.org/wiki/ISO_4217 - currency: Optional[str] = None + currency: Union[str, None, UnsetType] = Unset #: Price currency as it appears on the webpage (no post-processing), e.g. #: ``"$"``. #: #: See also ``currency``. 
- currencyRaw: Optional[str] = None + currencyRaw: Union[str, None, UnsetType] = Unset #: Main product image. - mainImage: Optional[Image] = None + mainImage: Union[Image, None, UnsetType] = Unset #: Data extraction process metadata. - metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: Name as it appears on the webpage (no post-processing). - name: Optional[str] = None + name: Union[str, None, UnsetType] = Unset #: Price at which the product is being offered. #: @@ -423,13 +424,13 @@ class ProductFromList(Item): #: #: If ``regularPrice`` is not ``None``, ``price`` should always be lower #: than ``regularPrice``. - price: Optional[str] = None + price: Union[str, None, UnsetType] = Unset #: Product identifier, unique within an e-commerce website. #: #: It may come in the form of an SKU or any other identifier, a hash, or #: even a URL. - productId: Optional[str] = None + productId: Union[str, None, UnsetType] = Unset #: Price at which the product was being offered in the past, and which is #: presented as a reference next to the current price. @@ -441,11 +442,11 @@ class ProductFromList(Item): #: #: If ``regularPrice`` is not ``None``, it should always be higher than #: ``price``. - regularPrice: Optional[str] = None + regularPrice: Union[str, None, UnsetType] = Unset #: Main URL from which the product data could be extracted. - url: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + url: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) @@ -462,13 +463,13 @@ class ProductList(Item): #: Webpage `breadcrumb trail`_. #: #: .. _Breadcrumb trail: https://en.wikipedia.org/wiki/Breadcrumb_navigation - breadcrumbs: Optional[List[Breadcrumb]] = None + breadcrumbs: Union[List[Breadcrumb], None, UnsetType] = Unset #: Canonical form of the URL, as indicated by the website. #: #: See also ``url``. 
- canonicalUrl: Optional[str] = attrs.field( - default=None, converter=attrs.converters.optional(url_to_str), kw_only=True + canonicalUrl: Union[str, None, UnsetType] = attrs.field( + default=Unset, converter=attrs.converters.optional(url_to_str), kw_only=True ) #: Name of the product listing as it appears on the webpage @@ -476,10 +477,10 @@ class ProductList(Item): #: #: For example, if the webpage is one of the pages of the Robots category, #: ``categoryName`` is ``'Robots'``. - categoryName: Optional[str] = None + categoryName: Union[str, None, UnsetType] = Unset #: Data extraction process metadata. - metadata: Optional[Metadata] = None + metadata: Union[Metadata, None, UnsetType] = Unset #: Number of the current page. #: @@ -487,10 +488,10 @@ class ProductList(Item): #: #: It must be 1-based. For example, if the first page of a listing is #: numbered as 0 on the website, it should be extracted as `1` nonetheless. - pageNumber: Optional[int] = None + pageNumber: Union[int, None, UnsetType] = Unset #: Link to the next page. - paginationNext: Optional[Link] = None + paginationNext: Union[Link, None, UnsetType] = Unset #: List of products. #: @@ -501,7 +502,7 @@ class ProductList(Item): #: The order of the products reflects their position on the rendered page. #: Product order is top-to-bottom, and left-to-right or right-to-left #: depending on the webpage locale. - products: Optional[List[ProductFromList]] = None + products: Union[List[ProductFromList], None, UnsetType] = Unset #: Main URL from which the data has been extracted. 
#: diff --git a/zyte_common_items/util.py b/zyte_common_items/util.py index 6b088a96..743fd8d1 100644 --- a/zyte_common_items/util.py +++ b/zyte_common_items/util.py @@ -2,6 +2,7 @@ from weakref import WeakKeyDictionary import attrs +from web_poet import UnsetType from web_poet.page_inputs.url import _Url # Caches the attribute names for attr.s classes @@ -59,7 +60,9 @@ def get_origin(tp) -> Tuple: return getattr(tp, "__origin__", ()) -def url_to_str(url: Union[str, _Url]) -> str: +def url_to_str(url: Union[str, _Url, UnsetType]) -> Union[str, UnsetType]: + if isinstance(url, UnsetType): + return url if not isinstance(url, (str, _Url)): raise ValueError( f"{url!r} is neither a string nor an instance of RequestURL or ResponseURL."