Skip to content

Commit

Permalink
ror: fix dump modified date comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Aug 21, 2024
1 parent 109fdb4 commit 1d85bcf
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 85 deletions.
66 changes: 34 additions & 32 deletions invenio_vocabularies/contrib/common/ror/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
"""ROR-related Datastreams Readers/Writers/Transformers module."""

import io
from datetime import datetime

import arrow
import requests
from idutils import normalize_ror

Expand All @@ -33,6 +33,26 @@ def _iter(self, fp, *args, **kwargs):
"RORHTTPReader downloads one file and therefore does not iterate through items"
)

def _get_last_dump_date(self, linksets):
    """Return the creation date of the latest ROR data dump.

    Scans the record's linksets for a "describedby" link of type
    ``application/ld+json``, fetches that JSON-LD metadata document, and
    parses its ``dateCreated`` field.

    :param linksets: list of Link Set Objects (dicts) as found under the
        ``"linkset"`` key of an ``application/linkset+json`` response.
    :returns: the dump creation date as an :class:`arrow.Arrow` instance.
    :raises ReaderError: if no JSON-LD metadata link is present in any of
        the linksets.
    """
    for linkset in linksets:
        for format_link in linkset.get("describedby", []):
            if format_link.get("type") != "application/ld+json":
                continue
            # Fetch the JSON-LD metadata of the record; the Accept header
            # echoes the advertised media type of the link.
            json_ld_response = requests.get(
                format_link["href"],
                headers={"Accept": format_link["type"]},
            )
            json_ld_response.raise_for_status()
            json_ld_data = json_ld_response.json()

            return arrow.get(json_ld_data["dateCreated"])
    # Fell through every linkset without finding a JSON-LD link.
    raise ReaderError(
        "Couldn't find JSON-LD in publisher's linkset to determine last dump date."
    )

def read(self, item=None, *args, **kwargs):
"""Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
if item:
Expand All @@ -54,39 +74,21 @@ def read(self, item=None, *args, **kwargs):
headers={"Accept": "application/linkset+json"},
)
linkset_response.raise_for_status()
linksets = linkset_response.json()["linkset"]

if self._since:
for link in linkset_response.json()["linkset"]:
if "type" in link and link["type"] == "application/ld+json":
json_ld_reponse = requests.get(
link["anchor"], headers={"Accept": link["type"]}
)
json_ld_reponse.raise_for_status()

# TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
last_dump_date = json_ld_reponse.json()["datePublished"]
if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
self._since
):
return
break
else:
raise ReaderError("Couldn't find json-ld in publisher's linkset.")

# Extract the Landing page Link Set Object located as the first (index 0) item.
landing_page_linkset = linkset_response.json()["linkset"][0]

# Extract the URL of the only ZIP file linked to the record.
landing_page_zip_items = [
item
for item in landing_page_linkset["item"]
if item["type"] == "application/zip"
]
if len(landing_page_zip_items) != 1:
raise ReaderError(
f"Expected 1 ZIP item but got {len(landing_page_zip_items)}"
)
file_url = landing_page_zip_items[0]["href"]
last_dump_date = self._get_last_dump_date(linksets)
if last_dump_date < arrow.get(self._since):
return

for linkset in linksets:
items = linkset.get("item", [])
zip_files = [item for item in items if item["type"] == "application/zip"]
if len(zip_files) == 1:
file_url = zip_files[0]["href"]
break
if len(zip_files) > 1:
raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")

# Download the ZIP file and fully load the response bytes content in memory.
# The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`).
Expand Down
1 change: 0 additions & 1 deletion invenio_vocabularies/contrib/subjects/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

"""Subjects schema."""


from invenio_i18n import get_locale
from marshmallow import fields, pre_load
from marshmallow_utils.fields import SanitizedUnicode
Expand Down
74 changes: 22 additions & 52 deletions tests/contrib/common/ror/test_ror_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,32 @@
API_JSON_RESPONSE_CONTENT = {
"linkset": [
{
"anchor": "https://example.com/records/11186879",
"anchor": "https://zenodo.org/records/11186879",
"describedby": [
{
"href": "https://zenodo.org/api/records/11186879",
"type": "application/ld+json",
},
],
"item": [
{
"href": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
"type": "application/zip",
}
],
"type": "application/zip",
"type": [{"href": "https://schema.org/Dataset"}],
},
{
"anchor": "https://example.com/api/records/11186879",
"describes": [
{"href": "https://example.com/records/11186879", "type": "text/html"}
"anchor": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
"collection": [
{"href": "https://zenodo.org/records/11186879", "type": "text/html"}
],
"type": "application/dcat+xml",
},
{
"anchor": "https://example.com/records/12729557",
"anchor": "https://zenodo.org/api/records/11186879",
"describes": [
{"href": "https://example.com/12729557", "type": "text/html"}
{"href": "https://zenodo.org/records/11186879", "type": "text/html"}
],
"type": "application/ld+json",
},
]
}
Expand Down Expand Up @@ -98,15 +102,7 @@

API_JSON_RESPONSE_CONTENT_LD_JSON = {
"name": "ROR Data",
"datePublished": "2024-07-11",
"dateModified": "2024-07-11T22:29:25.727626+00:00",
"distribution": [
{
"@type": "DataDownload",
"contentUrl": "https://example.com/records/12729557/files/v1.49-2024-07-11-ror-data.zip/content",
"encodingFormat": "application/zip",
}
],
"dateCreated": "2024-07-11T22:29:25.727626+00:00",
}

DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file"
Expand Down Expand Up @@ -168,31 +164,20 @@ def side_effect(url, headers=None, allow_redirects=False):
return MockResponse(API_JSON_RESPONSE_CONTENT)


@pytest.fixture(scope="function")
def download_file_bytes_content():
return DOWNLOAD_FILE_BYTES_CONTENT


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader(_):
reader = RORHTTPReader()
results = []
for entry in reader.read():
results.append(entry)

assert len(results) == 1
assert isinstance(results[0], io.BytesIO)
assert results[0].read() == download_file_bytes_content
assert results[0].read() == DOWNLOAD_FILE_BYTES_CONTENT


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader_since_before_publish(_):
reader = RORHTTPReader(since="2024-07-10")
results = []
for entry in reader.read():
Expand All @@ -201,11 +186,8 @@ def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
assert len(results) == 1


@patch(
"requests.get",
side_effect=side_effect,
)
def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
@patch("requests.get", side_effect=side_effect)
def test_ror_http_reader_since_after_publish(_):
reader = RORHTTPReader(since="2024-07-12")
results = []
for entry in reader.read():
Expand All @@ -214,18 +196,6 @@ def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
assert len(results) == 0


@patch(
"requests.get",
side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
API_JSON_RESPONSE_CONTENT, remove_links=True
),
)
def test_ror_http_reader_wrong_number_zip_items_error(_):
reader = RORHTTPReader()
with pytest.raises(ReaderError):
next(reader.read())


@patch(
"requests.get",
side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
Expand All @@ -245,7 +215,7 @@ def test_ror_http_reader_wrong_number_zip_items_error(_):
),
)
def test_ror_http_reader_no_json_ld(_):
reader = RORHTTPReader(since="12-07-2024")
reader = RORHTTPReader(since="2024-07-12")
with pytest.raises(ReaderError):
next(reader.read())

Expand Down
1 change: 1 addition & 0 deletions tests/contrib/subjects/test_subjects_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# details.

"""Subject datastream tests."""

from copy import deepcopy

import pytest
Expand Down

0 comments on commit 1d85bcf

Please sign in to comment.