ror: fix dump modified date comparison #388

Merged · 1 commit · Aug 21, 2024
66 changes: 34 additions & 32 deletions invenio_vocabularies/contrib/common/ror/datastreams.py
@@ -10,8 +10,8 @@
 """ROR-related Datastreams Readers/Writers/Transformers module."""

 import io
-from datetime import datetime

+import arrow
 import requests
 from idutils import normalize_ror

@@ -33,6 +33,26 @@ def _iter(self, fp, *args, **kwargs):
             "RORHTTPReader downloads one file and therefore does not iterate through items"
         )

+    def _get_last_dump_date(self, linksets):
+        """Get the last dump date."""
+        for linkset in linksets:
+            metadata_formats = linkset.get("describedby", [])
+            for format_link in metadata_formats:
+                if format_link.get("type") == "application/ld+json":
+                    json_ld_response = requests.get(
+                        format_link["href"],
+                        headers={"Accept": format_link["type"]},
+                    )
+                    json_ld_response.raise_for_status()
+                    json_ld_data = json_ld_response.json()
+
+                    last_dump_date = arrow.get(json_ld_data["dateCreated"])
+                    return last_dump_date
+        else:
+            raise ReaderError(
+                "Couldn't find JSON-LD in publisher's linkset to determine last dump date."
+            )
+
     def read(self, item=None, *args, **kwargs):
         """Reads the latest ROR data dump ZIP file from Zenodo and yields an in-memory binary stream of it."""
         if item:
@@ -54,39 +74,21 @@
             headers={"Accept": "application/linkset+json"},
         )
         linkset_response.raise_for_status()
+        linksets = linkset_response.json()["linkset"]

         if self._since:
-            for link in linkset_response.json()["linkset"]:
-                if "type" in link and link["type"] == "application/ld+json":
-                    json_ld_reponse = requests.get(
-                        link["anchor"], headers={"Accept": link["type"]}
-                    )
-                    json_ld_reponse.raise_for_status()
-
-                    # TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
-                    last_dump_date = json_ld_reponse.json()["datePublished"]
-                    if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
-                        self._since
-                    ):
-                        return
-                    break
-            else:
-                raise ReaderError("Couldn't find json-ld in publisher's linkset.")
-
-        # Extract the Landing page Link Set Object located as the first (index 0) item.
-        landing_page_linkset = linkset_response.json()["linkset"][0]
-
-        # Extract the URL of the only ZIP file linked to the record.
-        landing_page_zip_items = [
-            item
-            for item in landing_page_linkset["item"]
-            if item["type"] == "application/zip"
-        ]
-        if len(landing_page_zip_items) != 1:
-            raise ReaderError(
-                f"Expected 1 ZIP item but got {len(landing_page_zip_items)}"
-            )
-        file_url = landing_page_zip_items[0]["href"]
+            last_dump_date = self._get_last_dump_date(linksets)
+            if last_dump_date < arrow.get(self._since):
+                return

+        for linkset in linksets:
+            items = linkset.get("item", [])
+            zip_files = [item for item in items if item["type"] == "application/zip"]
+            if len(zip_files) == 1:
+                file_url = zip_files[0]["href"]
+                break
+            if len(zip_files) > 1:
+                raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")

         # Download the ZIP file and fully load the response bytes content in memory.
         # The bytes content are then wrapped by a BytesIO to be a file-like object (as required by `zipfile.ZipFile`).
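Why the switch from `datetime.fromisoformat` to `arrow` matters here: the JSON-LD `dateCreated` is a full offset-aware timestamp, while a `since` value may be a bare date, which `datetime.fromisoformat` parses as offset-naive; comparing the two raises TypeError. `arrow.get` parses both to timezone-aware instants (assuming UTC for date-only input). A minimal sketch of the difference, using sample values from the test fixtures below rather than live data:

from datetime import datetime

import arrow

dump_date = "2024-07-11T22:29:25.727626+00:00"  # offset-aware timestamp from the JSON-LD
since = "2024-07-12"  # date-only string; datetime parses it as offset-naive

try:
    datetime.fromisoformat(dump_date) < datetime.fromisoformat(since)
except TypeError as exc:
    print(exc)  # can't compare offset-naive and offset-aware datetimes

# arrow treats the date-only string as UTC midnight, so the comparison is well-defined:
print(arrow.get(dump_date) < arrow.get(since))  # True: the dump predates this `since`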
1 change: 0 additions & 1 deletion invenio_vocabularies/contrib/subjects/schema.py
@@ -10,7 +10,6 @@

 """Subjects schema."""

-
 from invenio_i18n import get_locale
 from marshmallow import fields, pre_load
 from marshmallow_utils.fields import SanitizedUnicode
74 changes: 22 additions & 52 deletions tests/contrib/common/ror/test_ror_datastreams.py
@@ -22,28 +22,32 @@
 API_JSON_RESPONSE_CONTENT = {
     "linkset": [
         {
-            "anchor": "https://example.com/records/11186879",
+            "anchor": "https://zenodo.org/records/11186879",
+            "describedby": [
+                {
+                    "href": "https://zenodo.org/api/records/11186879",
+                    "type": "application/ld+json",
+                },
+            ],
             "item": [
                 {
                     "href": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
                     "type": "application/zip",
                 }
             ],
-            "type": "application/zip",
+            "type": [{"href": "https://schema.org/Dataset"}],
         },
         {
-            "anchor": "https://example.com/api/records/11186879",
-            "describes": [
-                {"href": "https://example.com/records/11186879", "type": "text/html"}
+            "anchor": "https://example.com/records/11186879/files/v1.46.1-2024-05-13-ror-data.zip",
+            "collection": [
+                {"href": "https://zenodo.org/records/11186879", "type": "text/html"}
             ],
-            "type": "application/dcat+xml",
         },
         {
-            "anchor": "https://example.com/records/12729557",
+            "anchor": "https://zenodo.org/api/records/11186879",
             "describes": [
-                {"href": "https://example.com/12729557", "type": "text/html"}
+                {"href": "https://zenodo.org/records/11186879", "type": "text/html"}
             ],
             "type": "application/ld+json",
         },
     ]
 }
@@ -98,15 +102,7 @@

 API_JSON_RESPONSE_CONTENT_LD_JSON = {
     "name": "ROR Data",
-    "datePublished": "2024-07-11",
-    "dateModified": "2024-07-11T22:29:25.727626+00:00",
-    "distribution": [
-        {
-            "@type": "DataDownload",
-            "contentUrl": "https://example.com/records/12729557/files/v1.49-2024-07-11-ror-data.zip/content",
-            "encodingFormat": "application/zip",
-        }
-    ],
+    "dateCreated": "2024-07-11T22:29:25.727626+00:00",
 }

 DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file"
@@ -168,31 +164,20 @@ def side_effect(url, headers=None, allow_redirects=False):
         return MockResponse(API_JSON_RESPONSE_CONTENT)


-@pytest.fixture(scope="function")
-def download_file_bytes_content():
-    return DOWNLOAD_FILE_BYTES_CONTENT
-
-
-@patch(
-    "requests.get",
-    side_effect=side_effect,
-)
-def test_ror_http_reader(_, download_file_bytes_content):
+@patch("requests.get", side_effect=side_effect)
+def test_ror_http_reader(_):
     reader = RORHTTPReader()
     results = []
     for entry in reader.read():
         results.append(entry)

     assert len(results) == 1
     assert isinstance(results[0], io.BytesIO)
-    assert results[0].read() == download_file_bytes_content
+    assert results[0].read() == DOWNLOAD_FILE_BYTES_CONTENT


-@patch(
-    "requests.get",
-    side_effect=side_effect,
-)
-def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
+@patch("requests.get", side_effect=side_effect)
+def test_ror_http_reader_since_before_publish(_):
     reader = RORHTTPReader(since="2024-07-10")
     results = []
     for entry in reader.read():
@@ -201,11 +186,8 @@ def test_ror_http_reader_since_before_publish(_, download_file_bytes_content):
     assert len(results) == 1


-@patch(
-    "requests.get",
-    side_effect=side_effect,
-)
-def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
+@patch("requests.get", side_effect=side_effect)
+def test_ror_http_reader_since_after_publish(_):
     reader = RORHTTPReader(since="2024-07-12")
     results = []
     for entry in reader.read():
@@ -214,18 +196,6 @@ def test_ror_http_reader_since_after_publish(_, download_file_bytes_content):
     assert len(results) == 0


-@patch(
-    "requests.get",
-    side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
-        API_JSON_RESPONSE_CONTENT, remove_links=True
-    ),
-)
-def test_ror_http_reader_wrong_number_zip_items_error(_):
-    reader = RORHTTPReader()
-    with pytest.raises(ReaderError):
-        next(reader.read())
-
-
 @patch(
     "requests.get",
     side_effect=lambda url, headers=None, allow_redirects=False: MockResponse(
@@ -245,7 +215,7 @@ def test_ror_http_reader_wrong_number_zip_items_error(_):
     ),
 )
 def test_ror_http_reader_no_json_ld(_):
-    reader = RORHTTPReader(since="12-07-2024")
+    reader = RORHTTPReader(since="2024-07-12")
     with pytest.raises(ReaderError):
         next(reader.read())
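For orientation, a small sketch (not part of the diff) of the date logic the two `since` tests exercise, with the `dateCreated` value taken from the fixture above:

import arrow

# dateCreated from API_JSON_RESPONSE_CONTENT_LD_JSON above
dump = arrow.get("2024-07-11T22:29:25.727626+00:00")

# since="2024-07-10": the dump is newer than `since`, so the reader downloads it (1 result).
assert not (dump < arrow.get("2024-07-10"))

# since="2024-07-12": the dump predates `since`, so read() returns early (0 results).
assert dump < arrow.get("2024-07-12")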
1 change: 1 addition & 0 deletions tests/contrib/subjects/test_subjects_datastreams.py
@@ -7,6 +7,7 @@
 # details.

 """Subject datastream tests."""
+
 from copy import deepcopy

 import pytest