Skip to content

Commit

Permalink
ror: fix dump modified date comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Aug 18, 2024
1 parent 81d5405 commit 8f3c0d0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 29 deletions.
53 changes: 25 additions & 28 deletions invenio_vocabularies/contrib/common/ror/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,37 +56,34 @@ def read(self, item=None, *args, **kwargs):
linkset_response.raise_for_status()

if self._since:
for link in linkset_response.json()["linkset"]:
if "type" in link and link["type"] == "application/ld+json":
json_ld_reponse = requests.get(
link["anchor"], headers={"Accept": link["type"]}
)
json_ld_reponse.raise_for_status()

# TODO Update to use dateCreated once the field is added to InvenioRDM. (https://github.com/inveniosoftware/invenio-rdm-records/issues/1777)
last_dump_date = json_ld_reponse.json()["datePublished"]
if datetime.fromisoformat(last_dump_date) < datetime.fromisoformat(
self._since
):
return
break
for linkset in linkset_response.json()["linkset"]:
metadata_formats = linkset.get("describedBy", [])
for format_link in metadata_formats:
if format_link.get("type") == "application/ld+json":
json_ld_reponse = requests.get(
format_link["anchor"],
headers={"Accept": format_link["type"]},
)
json_ld_reponse.raise_for_status()

last_dump_date = datetime.fromisoformat(
json_ld_reponse.json()["dateCreated"]
)
if last_dump_date < datetime.fromisoformat(self._since):
return
break
else:
raise ReaderError("Couldn't find json-ld in publisher's linkset.")
raise ReaderError("Couldn't find JSON-LD in publisher's linkset.")

# Extract the Landing page Link Set Object located as the first (index 0) item.
landing_page_linkset = linkset_response.json()["linkset"][0]

# Extract the URL of the only ZIP file linked to the record.
landing_page_zip_items = [
item
for item in landing_page_linkset["item"]
if item["type"] == "application/zip"
]
if len(landing_page_zip_items) != 1:
raise ReaderError(
f"Expected 1 ZIP item but got {len(landing_page_zip_items)}"
)
file_url = landing_page_zip_items[0]["href"]
for linkset in linkset_response.json()["linkset"]:
items = linkset.get("item", [])
zip_files = [item for item in items if item["type"] == "application/zip"]
if len(zip_files) == 1:
file_url = zip_files[0]["href"]
break
if len(zip_files) > 1:
raise ReaderError(f"Expected 1 ZIP item but got {len(zip_files)}")

# Download the ZIP file and fully load the response bytes content in memory.
# The bytes content is then wrapped in a BytesIO to provide a file-like object (as required by `zipfile.ZipFile`).
Expand Down
2 changes: 1 addition & 1 deletion tests/contrib/common/ror/test_ror_datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
API_JSON_RESPONSE_CONTENT_LD_JSON = {
"name": "ROR Data",
"datePublished": "2024-07-11",
"dateModified": "2024-07-11T22:29:25.727626+00:00",
"dateCreated": "2024-07-11T22:29:25.727626+00:00",
"distribution": [
{
"@type": "DataDownload",
Expand Down

0 comments on commit 8f3c0d0

Please sign in to comment.