diff --git a/invenio_vocabularies/contrib/common/ror/datastreams.py b/invenio_vocabularies/contrib/common/ror/datastreams.py index 534f8a73..703ac78a 100644 --- a/invenio_vocabularies/contrib/common/ror/datastreams.py +++ b/invenio_vocabularies/contrib/common/ror/datastreams.py @@ -10,8 +10,8 @@ """ROR-related Datastreams Readers/Writers/Transformers module.""" import io -from datetime import datetime +import arrow import requests from idutils import normalize_ror @@ -33,6 +33,26 @@ def _iter(self, fp, *args, **kwargs): "RORHTTPReader downloads one file and therefore does not iterate through items" ) + def _get_last_dump_date(self, linksets): + """Get the last dump date.""" + for linkset in linksets: + metadata_formats = linkset.get("describedby", []) + for format_link in metadata_formats: + if format_link.get("type") == "application/ld+json": + json_ld_response = requests.get( + format_link["href"], + headers={"Accept": format_link["type"]}, + ) + json_ld_response.raise_for_status() + json_ld_data = json_ld_response.json() + + last_dump_date = arrow.get(json_ld_data["dateCreated"]) + return last_dump_date + else: + raise ReaderError( + "Couldn't find JSON-LD in publisher's linkset to determine last dump date." + ) + def read(self, item=None, *args, **kwargs): """Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it.""" if item: @@ -54,39 +74,21 @@ def read(self, item=None, *args, **kwargs): headers={"Accept": "application/linkset+json"}, ) linkset_response.raise_for_status() + linksets = linkset_response.json()["linkset"] if self._since: - for link in linkset_response.json()["linkset"]: - if "type" in link and link["type"] == "application/ld+json": - json_ld_reponse = requests.get( - link["anchor"], headers={"Accept": link["type"]} - ) - json_ld_reponse.raise_for_status() - - # TODO Update to use dateCreated once the field is added to InvenioRDM. 
(https://github.com/inveniosoftware/invenio-rdm-records/issues/1777) - last_dump_date = json_ld_reponse.json()["datePublished"] - if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat( - self._since - ): - return - break - else: - raise ReaderError("Couldn't find json-ld in publisher's linkset.") - - # Extract the Landing page Link Set Object located as the first (index 0) item. - landing_page_linkset = linkset_response.json()["linkset"][0] - - # Extract the URL of the only ZIP file linked to the record. - landing_page_zip_items = [ - item - for item in landing_page_linkset["item"] - if item["type"] == "application/zip" - ] - if len(landing_page_zip_items) != 1: - raise ReaderError( - f"Expected 1 ZIP item but got {len(landing_page_zip_items)}" - ) - file_url = landing_page_zip_items[0]["href"] + last_dump_date = self._get_last_dump_date(linksets) + if last_dump_date < arrow.get(self._since): + return + + for linkset in linksets: + items = linkset.get("item", []) + zip_files = [item for item in items if item["type"] == "application/zip"] + if len(zip_files) == 1: + file_url = zip_files[0]["href"] + break + if len(zip_files) > 1: + raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}") # Download the ZIP file and fully load the response bytes content in memory. # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`). 
diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py index df711d3e..99691a16 100644 --- a/invenio_vocabularies/contrib/subjects/schema.py +++ b/invenio_vocabularies/contrib/subjects/schema.py @@ -10,7 +10,6 @@ """Subjects schema.""" - from invenio_i18n import get_locale from marshmallow import fields, pre_load from marshmallow_utils.fields import SanitizedUnicode diff --git a/tests/contrib/common/ror/test_ror_datastreams.py b/tests/contrib/common/ror/test_ror_datastreams.py index aa0634a0..d146598f 100644 --- a/tests/contrib/common/ror/test_ror_datastreams.py +++ b/tests/contrib/common/ror/test_ror_datastreams.py @@ -22,28 +22,32 @@ API_JSON_RESPONSE_CONTENT = { "linkset": [ { - "anchor": "https://example.com/records/11186879", + "anchor": "https://zenodo.org/records/11186879", + "describedby": [ + { + "href": "https://zenodo.org/api/records/11186879", + "type": "application/ld+json", + }, + ], "item": [ { "href": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip", "type": "application/zip", } ], - "type": "application/zip", + "type": [{"href": "https://schema.org/Dataset"}], }, { - "anchor": "https://example.com/api/records/11186879", - "describes": [ - {"href": "https://example.com/records/11186879", "type": "text/html"} + "anchor": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip", + "collection": [ + {"href": "https://zenodo.org/records/11186879", "type": "text/html"} ], - "type": "application/dcat+xml", }, { - "anchor": "https://example.com/records/12729557", + "anchor": "https://zenodo.org/api/records/11186879", "describes": [ - {"href": "https://example.com/12729557", "type": "text/html"} + {"href": "https://zenodo.org/records/11186879", "type": "text/html"} ], - "type": "application/ld+json", }, ] } @@ -98,15 +102,7 @@ API_JSON_RESPONSE_CONTENT_LD_JSON = { "name": "ROR Data", - "datePublished": "2024-07-11", - "dateModified": 
"2024-07-11T22:29:25.727626+00:00", - "distribution": [ - { - "@type": "DataDownload", - "contentUrl": "https://example.com/records/12729557/files/v1.49-2024-07-11-ror-data.zip/content", - "encodingFormat": "application/zip", - } - ], + "dateCreated": "2024-07-11T22:29:25.727626+00:00", } DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file" @@ -168,16 +164,8 @@ def side_effect(url, headers=None, allow_redirects=False): return MockResponse(API_JSON_RESPONSE_CONTENT) -@pytest.fixture(scope="function") -def download_file_bytes_content(): - return DOWNLOAD_FILE_BYTES_CONTENT - - -@patch( - "requests.get", - side_effect=side_effect, -) -def test_ror_http_reader(_, download_file_bytes_content): +@patch("requests.get", side_effect=side_effect) +def test_ror_http_reader(_): reader = RORHTTPReader() results = [] for entry in reader.read(): @@ -185,14 +173,11 @@ def test_ror_http_reader(_, download_file_bytes_content): assert len(results) == 1 assert isinstance(results[0], io.BytesIO) - assert results[0].read() == download_file_bytes_content + assert results[0].read() == DOWNLOAD_FILE_BYTES_CONTENT -@patch( - "requests.get", - side_effect=side_effect, -) -def test_ror_http_reader_since_before_publish(_, download_file_bytes_content): +@patch("requests.get", side_effect=side_effect) +def test_ror_http_reader_since_before_publish(_): reader = RORHTTPReader(since="2024-07-10") results = [] for entry in reader.read(): @@ -201,11 +186,8 @@ def test_ror_http_reader_since_before_publish(_, download_file_bytes_content): assert len(results) == 1 -@patch( - "requests.get", - side_effect=side_effect, -) -def test_ror_http_reader_since_after_publish(_, download_file_bytes_content): +@patch("requests.get", side_effect=side_effect) +def test_ror_http_reader_since_after_publish(_): reader = RORHTTPReader(since="2024-07-12") results = [] for entry in reader.read(): @@ -214,18 +196,6 @@ def test_ror_http_reader_since_after_publish(_, download_file_bytes_content): assert len(results) == 0 
-@patch( - "requests.get", - side_effect=lambda url, headers=None, allow_redirects=False: MockResponse( - API_JSON_RESPONSE_CONTENT, remove_links=True - ), -) -def test_ror_http_reader_wrong_number_zip_items_error(_): - reader = RORHTTPReader() - with pytest.raises(ReaderError): - next(reader.read()) - - @patch( "requests.get", side_effect=lambda url, headers=None, allow_redirects=False: MockResponse( @@ -245,7 +215,7 @@ def test_ror_http_reader_wrong_number_zip_items_error(_): ), ) def test_ror_http_reader_no_json_ld(_): - reader = RORHTTPReader(since="12-07-2024") + reader = RORHTTPReader(since="2024-07-12") with pytest.raises(ReaderError): next(reader.read()) diff --git a/tests/contrib/subjects/test_subjects_datastreams.py b/tests/contrib/subjects/test_subjects_datastreams.py index f73736f8..14328ec4 100644 --- a/tests/contrib/subjects/test_subjects_datastreams.py +++ b/tests/contrib/subjects/test_subjects_datastreams.py @@ -7,6 +7,7 @@ # details. """Subject datastream tests.""" + from copy import deepcopy import pytest