Skip to content

Commit

Permalink
ror: fix dump modified date comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Aug 21, 2024
1 parent 109fdb4 commit 1d85bcf
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 85 deletions.
66 changes: 34 additions & 32 deletions invenio_vocabularies/contrib/common/ror/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
"""ROR-related Datastreams Readers/Writers/Transformers module."""

import io
from datetime import datetime

import arrow
import requests
from idutils import normalize_ror

Expand All @@ -33,6 +33,26 @@ def _iter(self, fp, *args, **kwargs):
"RORHTTPReader downloads one file and therefore does not iterate through items"
)

def _get_last_dump_date(self, linksets):
    """Return the creation date of the latest ROR data dump.

    Scans the record's linksets for a "describedby" link of type
    ``application/ld+json``, fetches that JSON-LD metadata document, and
    parses its ``dateCreated`` field.

    :param linksets: list of Link Set Objects (dicts) as found under the
        ``"linkset"`` key of an ``application/linkset+json`` response.
    :returns: the dump creation date as an :class:`arrow.Arrow` instance.
    :raises ReaderError: if no JSON-LD metadata link is present in any of
        the linksets.
    """
    for linkset in linksets:
        for format_link in linkset.get("describedby", []):
            if format_link.get("type") != "application/ld+json":
                continue
            # Fetch the JSON-LD metadata of the record; the Accept header
            # echoes the advertised media type of the link.
            json_ld_response = requests.get(
                format_link["href"],
                headers={"Accept": format_link["type"]},
            )
            json_ld_response.raise_for_status()
            json_ld_data = json_ld_response.json()

            return arrow.get(json_ld_data["dateCreated"])
    # Fell through every linkset without finding a JSON-LD link.
    raise ReaderError(
        "Couldn't find JSON-LD in publisher's linkset to determine last dump date."
    )

def read(self, item=None, *args, **kwargs):
"""Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
if item:
Expand All @@ -54,39 +74,21 @@ def read(self, item=None, *args, **kwargs):
headers={"Accept": "application/linkset+json"},
)
linkset_response.raise_for_status()
linksets = linkset_response.json()["linkset"]

if self._since:
for link in linkset_response.json()["linkset"]:
if "type" in link and link["type"] == "application/ld+json":
json_ld_reponse = requests.get(
link["anchor"], headers={"Accept": link["type"]}
)
json_ld_reponse.raise_for_status()

# TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
last_dump_date = json_ld_reponse.json()["datePublished"]
if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
self._since
):
return
break
else:
raise ReaderError("Couldn't find json-ld in publisher's linkset.")

# Extract the Landing page Link Set Object located as the first (index 0) item.
landing_page_linkset = linkset_response.json()["linkset"][0]

# Extract the URL of the only ZIP file linked to the record.
landing_page_zip_items = [
item
for item in landing_page_linkset["item"]
if item["type"] == "application/zip"
]
if len(landing_page_zip_items) != 1:
raise ReaderError(
f"Expected 1 ZIP item but got {len(landing_page_zip_items)}"
)
file_url = landing_page_zip_items[0]["href"]
last_dump_date = self._get_last_dump_date(linksets)
if last_dump_date < arrow.get(self._since):
return

for linkset in linksets:
items = linkset.get("item", [])
zip_files = [item for item in items if item["type"] == "application/zip"]
if len(zip_files) == 1:
file_url = zip_files[0]["href"]
break
if len(zip_files) > 1:
raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")

# Download the ZIP file and fully load the response bytes content in memory.
# The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
Expand Down
1 change: 0 additions & 1 deletion invenio_vocabularies/contrib/subjects/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

"""Subjects schema."""


from invenio_i18n import get_locale
from marshmallow import fields, pre_load
from marshmallow_utils.fields import SanitizedUnicode
Expand Down
74 changes: 22 additions & 52 deletions tests/contrib/common/ror/test_ror_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,32 @@
API_JSON_RESPONSE_CONTENT = {
"linkset": [
{
"anchor": "https://example.com/records/11186879",
"anchor": "https://zenodo.org/records/11186879",
"describedby": [
{
"href": "https://zenodo.org/api/records/11186879",
"type": "application/ld+json",
},
],
"item": [
{
"href": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
"type": "application/zip",
}
],
"type": "application/zip",
"type": [{"href": "https://schema.org/Dataset"}],
},
{
"anchor": "https://example.com/api/records/11186879",
"describes": [
{"href": "https://example.com/records/11186879", "type": "text/html"}
"anchor": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
"collection": [
{"href": "https://zenodo.org/records/11186879", "type": "text/html"}
],
"type": "application/dcat+xml",
},
{
"anchor": "https://example.com/records/12729557",
"anchor": "https://zenodo.org/api/records/11186879",
"describes": [
{"href": "https://example.com/12729557", "type": "text/html"}
{"href": "https://zenodo.org/records/11186879", "type": "text/html"}
],
"type": "application/ld+json",
},
]
}
Expand Down Expand Up @@ -98,15 +102,7 @@

API_JSON_RESPONSE_CONTENT_LD_JSON = {
"name": "ROR Data",
"datePublished": "2024-07-11",
"dateModified": "2024-07-11T22:29:25.727626+00:00",
"distribution": [
{
"@type": "DataDownload",
"contentUrl": "https://example.com/records/12729557/files/v1.49-2024-07-11-ror-data.zip/content",
"encodingFormat": "application/zip",
}
],
"dateCreated": "2024-07-11T22:29:25.727626+00:00",
}

DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file"
Expand Down Expand Up @@ -168,31 +164,20 @@ def side_effect(url, headers=None, allow_redirects=False):
return MockResponse(API_JSON_RESPONSE_CONTENT)


@pytest.fixture(scope="function")
def download_file_bytes_content():
return DOWNLOAD_FILE_BYTES_CONTENT


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader(_):
reader = RORHTTPReader()
results = []
for entry in reader.read():
results.append(entry)

assert len(results) == 1
assert isinstance(results[0], io.BytesIO)
assert results[0].read() == download_file_bytes_content
assert results[0].read() == DOWNLOAD_FILE_BYTES_CONTENT


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader_since_before_publish(_):
reader = RORHTTPReader(since="2024-07-10")
results = []
for entry in reader.read():
Expand All @@ -201,11 +186,8 @@ def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
assert len(results) == 1


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader_since_after_publish(_):
reader = RORHTTPReader(since="2024-07-12")
results = []
for entry in reader.read():
Expand All @@ -214,18 +196,6 @@ def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
assert len(results) == 0


@patch(
"requests.get",
side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
API_JSON_RESPONSE_CONTENT, remove_links=True
),
)
def test_ror_http_reader_wrong_number_zip_items_error(_):
reader = RORHTTPReader()
with pytest.raises(ReaderError):
next(reader.read())


@patch(
"requests.get",
side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
Expand All @@ -245,7 +215,7 @@ def test_ror_http_reader_wrong_number_zip_items_error(_):
),
)
def test_ror_http_reader_no_json_ld(_):
reader = RORHTTPReader(since="12-07-2024")
reader = RORHTTPReader(since="2024-07-12")
with pytest.raises(ReaderError):
next(reader.read())

Expand Down
1 change: 1 addition & 0 deletions tests/contrib/subjects/test_subjects_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# details.

"""Subject datastream tests."""

from copy import deepcopy

import pytest
Expand Down

0 comments on commit 1d85bcf

Please sign in to comment.