From 399e45bf9d1eabb5fdbcf75e025855d5fa061f70 Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 27 Aug 2024 15:57:22 +0200 Subject: [PATCH 01/11] cli: make the update command work for writers without args --- invenio_vocabularies/cli.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/invenio_vocabularies/cli.py b/invenio_vocabularies/cli.py index d50a9d13..aded3193 100644 --- a/invenio_vocabularies/cli.py +++ b/invenio_vocabularies/cli.py @@ -101,9 +101,14 @@ def update(vocabulary, filepath=None, origin=None): for w_conf in config["writers"]: if w_conf["type"] == "async": - w_conf["args"]["writer"]["args"]["update"] = True + w_conf_update = w_conf["args"]["writer"] else: - w_conf["args"]["update"] = True + w_conf_update = w_conf + + if "args" in w_conf_update: + w_conf_update["args"]["update"] = True + else: + w_conf_update["args"] = {"update": True} success, errored, filtered = _process_vocab(config) From 14dfe4582f78053cd1763eb302230d00368597b4 Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 27 Aug 2024 16:07:14 +0200 Subject: [PATCH 02/11] datastreams: writers: add option to not insert --- invenio_vocabularies/datastreams/writers.py | 40 ++++++++++++++------- tests/datastreams/test_writers.py | 22 ++++++++++++ 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py index 84084599..f898c1ad 100644 --- a/invenio_vocabularies/datastreams/writers.py +++ b/invenio_vocabularies/datastreams/writers.py @@ -13,10 +13,11 @@ import yaml from invenio_access.permissions import system_identity -from invenio_pidstore.errors import PIDAlreadyExists +from invenio_pidstore.errors import PIDAlreadyExists, PIDDoesNotExistError from invenio_records.systemfields.relations.errors import InvalidRelationValue from invenio_records_resources.proxies import current_service_registry from marshmallow import ValidationError +from sqlalchemy.exc import NoResultFound from .datastreams import StreamEntry from .errors import WriterError @@ -55,12 +56,15 @@ def write_many(self, stream_entries, *args, **kwargs): class ServiceWriter(BaseWriter): """Writes the entries to an RDM instance using a Service object.""" - def __init__(self, service_or_name, *args, identity=None, update=False, **kwargs): + def __init__( + self, service_or_name, *args, identity=None, insert=True, update=False, **kwargs + ): """Constructor. :param service_or_name: a service instance or a key of the service registry. :param identity: access identity. + :param insert: if True it will insert records which do not exist. :param update: if True it will update records if they exist. 
""" if isinstance(service_or_name, str): @@ -68,6 +72,7 @@ def __init__(self, service_or_name, *args, identity=None, update=False, **kwargs self._service = service_or_name self._identity = identity or system_identity + self._insert = insert self._update = update super().__init__(*args, **kwargs) @@ -79,20 +84,31 @@ def _entry_id(self, entry): def _resolve(self, id_): return self._service.read(self._identity, id_) + def _do_update(self, entry): + vocab_id = self._entry_id(entry) + current = self._resolve(vocab_id) + updated = dict(current.to_dict(), **entry) + return StreamEntry(self._service.update(self._identity, vocab_id, updated)) + def write(self, stream_entry, *args, **kwargs): """Writes the input entry using a given service.""" entry = stream_entry.entry try: - try: - return StreamEntry(self._service.create(self._identity, entry)) - except PIDAlreadyExists: - if not self._update: - raise WriterError([f"Vocabulary entry already exists: {entry}"]) - vocab_id = self._entry_id(entry) - current = self._resolve(vocab_id) - updated = dict(current.to_dict(), **entry) - return StreamEntry( - self._service.update(self._identity, vocab_id, updated) + if self._insert: + try: + return StreamEntry(self._service.create(self._identity, entry)) + except PIDAlreadyExists: + if not self._update: + raise WriterError([f"Vocabulary entry already exists: {entry}"]) + return self._do_update(entry) + elif self._update: + try: + return self._do_update(entry) + except (NoResultFound, PIDDoesNotExistError): + raise WriterError([f"Vocabulary entry does not exist: {entry}"]) + else: + raise WriterError( + ["Writer wrongly configured to not insert and to not update"] ) except ValidationError as err: diff --git a/tests/datastreams/test_writers.py b/tests/datastreams/test_writers.py index 04a1b1eb..8456b22f 100644 --- a/tests/datastreams/test_writers.py +++ b/tests/datastreams/test_writers.py @@ -76,6 +76,28 @@ def test_service_writer_update_non_existing(lang_type, lang_data, service, ident assert dict(record, **updated_lang) == record +def test_writer_wrong_config_no_insert_no_update( + lang_type, lang_data, service, identity +): + writer = ServiceWriter(service, identity=identity, insert=False, update=False) + + with pytest.raises(WriterError) as err: + writer.write(stream_entry=StreamEntry(lang_data)) + + expected_error = ["Writer wrongly configured to not insert and to not update"] + assert expected_error in err.value.args + + +def test_writer_no_insert(lang_type, lang_data, service, identity): + writer = ServiceWriter(service, identity=identity, insert=False, update=True) + + with pytest.raises(WriterError) as err: + writer.write(stream_entry=StreamEntry(lang_data)) + + expected_error = [f"Vocabulary entry does not exist: {lang_data}"] + assert expected_error in err.value.args + + ## # YAML Writer ## From a5294fd8b3e3d2199bc4c10d780dce4562b2cfdd Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 27 Aug 2024 16:13:48 +0200 Subject: [PATCH 03/11] datastreams: move OpenAIREProjectHTTPReader to generic OpenAIREHTTPReader --- .../contrib/awards/datastreams.py | 64 +-------- .../contrib/common/openaire/__init__.py | 9 ++ .../contrib/common/openaire/datastreams.py | 84 +++++++++++ .../contrib/awards/test_awards_datastreams.py | 125 +--------------- .../openaire/test_openaire_datastreams.py | 136 ++++++++++++++++++ 5 files changed, 232 insertions(+), 186 deletions(-) create mode 100644 invenio_vocabularies/contrib/common/openaire/__init__.py create mode 100644 
invenio_vocabularies/contrib/common/openaire/datastreams.py create mode 100644 tests/contrib/common/openaire/test_openaire_datastreams.py diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index f4109bf0..22547f39 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -8,73 +8,15 @@ """Awards datastreams, transformers, writers and readers.""" -import io - -import requests from invenio_access.permissions import system_identity from invenio_i18n import lazy_gettext as _ -from ...datastreams.errors import ReaderError, TransformerError -from ...datastreams.readers import BaseReader +from ...datastreams.errors import TransformerError from ...datastreams.transformers import BaseTransformer from ...datastreams.writers import ServiceWriter from .config import awards_ec_ror_id, awards_openaire_funders_mapping -class OpenAIREProjectHTTPReader(BaseReader): - """OpenAIRE Project HTTP Reader returning an in-memory binary stream of the latest OpenAIRE Graph Dataset project tar file.""" - - def _iter(self, fp, *args, **kwargs): - raise NotImplementedError( - "OpenAIREProjectHTTPReader downloads one file and therefore does not iterate through items" - ) - - def read(self, item=None, *args, **kwargs): - """Reads the latest OpenAIRE Graph Dataset project tar file from Zenodo and yields an in-memory binary stream of it.""" - if item: - raise NotImplementedError( - "OpenAIREProjectHTTPReader does not support being chained after another reader" - ) - - if self._origin == "full": - # OpenAIRE Graph Dataset - api_url = "https://zenodo.org/api/records/3516917" - elif self._origin == "diff": - # OpenAIRE Graph dataset: new collected projects - api_url = "https://zenodo.org/api/records/6419021" - else: - raise ReaderError("The --origin option should be either 'full' or 'diff'") - - # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the OpenAIRE Graph Dataset. - # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint - headers = {"Accept": "application/linkset+json"} - api_resp = requests.get(api_url, headers=headers) - api_resp.raise_for_status() - - # Extract the Landing page Link Set Object located as the first (index 0) item. - landing_page_linkset = api_resp.json()["linkset"][0] - - # Extract the URL of the only project tar file linked to the record. - landing_page_project_tar_items = [ - item - for item in landing_page_linkset["item"] - if item["type"] == "application/x-tar" - and item["href"].endswith("/project.tar") - ] - if len(landing_page_project_tar_items) != 1: - raise ReaderError( - f"Expected 1 project tar item but got {len(landing_page_project_tar_items)}" - ) - file_url = landing_page_project_tar_items[0]["href"] - - # Download the project tar file and fully load the response bytes content in memory. - # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `tarfile.open`). - # Using directly `file_resp.raw` is not possible since `tarfile.open` requires the file-like object to be seekable. 
- file_resp = requests.get(file_url) - file_resp.raise_for_status() - yield io.BytesIO(file_resp.content) - - class AwardsServiceWriter(ServiceWriter): """Funders service writer.""" @@ -172,9 +114,7 @@ def apply(self, stream_entry, **kwargs): return stream_entry -VOCABULARIES_DATASTREAM_READERS = { - "openaire-project-http": OpenAIREProjectHTTPReader, -} +VOCABULARIES_DATASTREAM_READERS = {} VOCABULARIES_DATASTREAM_TRANSFORMERS = { "openaire-award": OpenAIREProjectTransformer, diff --git a/invenio_vocabularies/contrib/common/openaire/__init__.py b/invenio_vocabularies/contrib/common/openaire/__init__.py new file mode 100644 index 00000000..a1cf1934 --- /dev/null +++ b/invenio_vocabularies/contrib/common/openaire/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""OpenAIRE-related module.""" diff --git a/invenio_vocabularies/contrib/common/openaire/datastreams.py b/invenio_vocabularies/contrib/common/openaire/datastreams.py new file mode 100644 index 00000000..95dba01c --- /dev/null +++ b/invenio_vocabularies/contrib/common/openaire/datastreams.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""OpenAIRE-related Datastreams Readers/Writers/Transformers module.""" + +import io + +import requests + +from invenio_vocabularies.datastreams.errors import ReaderError +from invenio_vocabularies.datastreams.readers import BaseReader + + +class OpenAIREHTTPReader(BaseReader): + """OpenAIRE HTTP Reader returning an in-memory binary stream of the latest OpenAIRE Graph Dataset tar file of a given type.""" + + def __init__(self, origin=None, mode="r", tar_href=None, *args, **kwargs): + """Constructor.""" + self.tar_href = tar_href + super().__init__(origin, mode, *args, **kwargs) + + def _iter(self, fp, *args, **kwargs): + raise NotImplementedError( + "OpenAIREHTTPReader downloads one file and therefore does not iterate through items" + ) + + def read(self, item=None, *args, **kwargs): + """Reads the latest OpenAIRE Graph Dataset tar file of a given type from Zenodo and yields an in-memory binary stream of it.""" + if item: + raise NotImplementedError( + "OpenAIREHTTPReader does not support being chained after another reader" + ) + + if self._origin == "full": + # OpenAIRE Graph Dataset + api_url = "https://zenodo.org/api/records/3516917" + elif self._origin == "diff": + # OpenAIRE Graph dataset: new collected projects + api_url = "https://zenodo.org/api/records/6419021" + else: + raise ReaderError("The --origin option should be either 'full' or 'diff'") + + # Call the signposting `linkset+json` endpoint for the Concept DOI (i.e. latest version) of the OpenAIRE Graph Dataset. + # See: https://github.com/inveniosoftware/rfcs/blob/master/rfcs/rdm-0071-signposting.md#provide-an-applicationlinksetjson-endpoint + headers = {"Accept": "application/linkset+json"} + api_resp = requests.get(api_url, headers=headers) + api_resp.raise_for_status() + + # Extract the Landing page Link Set Object located as the first (index 0) item. + landing_page_linkset = api_resp.json()["linkset"][0] + + # Extract the URL of the only tar file matching `tar_href` linked to the record. 
+ landing_page_matching_tar_items = [ + item + for item in landing_page_linkset["item"] + if item["type"] == "application/x-tar" + and item["href"].endswith(self.tar_href) + ] + if len(landing_page_matching_tar_items) != 1: + raise ReaderError( + f"Expected 1 tar item matching {self.tar_href} but got {len(landing_page_matching_tar_items)}" + ) + file_url = landing_page_matching_tar_items[0]["href"] + + # Download the matching tar file and fully load the response bytes content in memory. + # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `tarfile.open`). + # Using directly `file_resp.raw` is not possible since `tarfile.open` requires the file-like object to be seekable. + file_resp = requests.get(file_url) + file_resp.raise_for_status() + yield io.BytesIO(file_resp.content) + + +VOCABULARIES_DATASTREAM_READERS = { + "openaire-http": OpenAIREHTTPReader, +} + +VOCABULARIES_DATASTREAM_TRANSFORMERS = {} + +VOCABULARIES_DATASTREAM_WRITERS = {} diff --git a/tests/contrib/awards/test_awards_datastreams.py b/tests/contrib/awards/test_awards_datastreams.py index b6555de9..0a43188c 100644 --- a/tests/contrib/awards/test_awards_datastreams.py +++ b/tests/contrib/awards/test_awards_datastreams.py @@ -8,9 +8,7 @@ """Awards datastreams tests.""" -import io from copy import deepcopy -from unittest.mock import patch import pytest from invenio_access.permissions import system_identity @@ -18,11 +16,10 @@ from invenio_vocabularies.contrib.awards.api import Award from invenio_vocabularies.contrib.awards.datastreams import ( AwardsServiceWriter, - OpenAIREProjectHTTPReader, OpenAIREProjectTransformer, ) from invenio_vocabularies.datastreams import StreamEntry -from invenio_vocabularies.datastreams.errors import ReaderError, WriterError +from invenio_vocabularies.datastreams.errors import WriterError @pytest.fixture(scope="function") @@ -115,126 +112,6 @@ def expected_from_award_json_ec(): } -API_JSON_RESPONSE_CONTENT = { - "linkset": [ - { - "anchor": "https://example.com/records/10488385", - "item": [ - { - "href": "https://example.com/records/10488385/files/organization.tar", - "type": "application/x-tar", - }, - { - "href": "https://example.com/records/10488385/files/project.tar", - "type": "application/x-tar", - }, - ], - }, - { - "anchor": "https://example.com/api/records/10488385", - "describes": [ - {"href": "https://example.com/records/10488385", "type": "text/html"} - ], - "type": "application/dcat+xml", - }, - ] -} - -API_JSON_RESPONSE_CONTENT_WRONG_NUMBER_PROJECT_TAR_ITEMS_ERROR = { - "linkset": [ - { - "anchor": "https://example.com/records/10488385", - "item": [ - { - "href": "https://example.com/records/10488385/files/organization.tar", - "type": "application/x-tar", - }, - { - "href": "https://example.com/records/10488385/files/project.tar", - "type": "application/x-tar", - }, - { - "href": "https://example.com/another/project.tar", - "type": "application/x-tar", - }, - ], - }, - { - "anchor": "https://example.com/api/records/10488385", - "describes": [ - {"href": "https://example.com/records/10488385", "type": "text/html"} - ], - "type": "application/dcat+xml", - }, - ] -} - -DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file" - - -class MockResponse: - content = DOWNLOAD_FILE_BYTES_CONTENT - - def __init__(self, api_json_response_content): - self.api_json_response_content = api_json_response_content - - def json(self, **kwargs): - return self.api_json_response_content - - def raise_for_status(self): - pass - - -@pytest.fixture(scope="function") -def 
download_file_bytes_content(): - return DOWNLOAD_FILE_BYTES_CONTENT - - -@patch( - "requests.get", - side_effect=lambda url, headers=None: MockResponse(API_JSON_RESPONSE_CONTENT), -) -def test_openaire_project_http_reader(_, download_file_bytes_content): - reader = OpenAIREProjectHTTPReader(origin="full") - results = [] - for entry in reader.read(): - results.append(entry) - - assert len(results) == 1 - assert isinstance(results[0], io.BytesIO) - assert results[0].read() == download_file_bytes_content - - -@patch( - "requests.get", - side_effect=lambda url, headers=None: MockResponse( - API_JSON_RESPONSE_CONTENT_WRONG_NUMBER_PROJECT_TAR_ITEMS_ERROR - ), -) -def test_openaire_project_http_reader_wrong_number_tar_items_error(_): - reader = OpenAIREProjectHTTPReader(origin="full") - with pytest.raises(ReaderError): - next(reader.read()) - - -def test_openaire_project_http_reader_unsupported_origin_option(): - reader = OpenAIREProjectHTTPReader(origin="unsupported_origin_option") - with pytest.raises(ReaderError): - next(reader.read()) - - -def test_openaire_project_http_reader_item_not_implemented(): - reader = OpenAIREProjectHTTPReader() - with pytest.raises(NotImplementedError): - next(reader.read("A fake item")) - - -def test_openaire_project_http_reader_iter_not_implemented(): - reader = OpenAIREProjectHTTPReader() - with pytest.raises(NotImplementedError): - reader._iter("A fake file pointer") - - def test_awards_transformer(app, dict_award_entry, expected_from_award_json): transformer = OpenAIREProjectTransformer() assert expected_from_award_json == transformer.apply(dict_award_entry).entry diff --git a/tests/contrib/common/openaire/test_openaire_datastreams.py b/tests/contrib/common/openaire/test_openaire_datastreams.py new file mode 100644 index 00000000..372b1dc6 --- /dev/null +++ b/tests/contrib/common/openaire/test_openaire_datastreams.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. 
+ +"""OpenAIRE-related Datastreams Readers/Writers/Transformers tests.""" + +import io +from unittest.mock import patch + +import pytest + +from invenio_vocabularies.contrib.common.openaire.datastreams import OpenAIREHTTPReader +from invenio_vocabularies.datastreams.errors import ReaderError + +API_JSON_RESPONSE_CONTENT = { + "linkset": [ + { + "anchor": "https://example.com/records/10488385", + "item": [ + { + "href": "https://example.com/records/10488385/files/organization.tar", + "type": "application/x-tar", + }, + { + "href": "https://example.com/records/10488385/files/project.tar", + "type": "application/x-tar", + }, + ], + }, + { + "anchor": "https://example.com/api/records/10488385", + "describes": [ + {"href": "https://example.com/records/10488385", "type": "text/html"} + ], + "type": "application/dcat+xml", + }, + ] +} + +API_JSON_RESPONSE_CONTENT_WRONG_NUMBER_PROJECT_TAR_ITEMS_ERROR = { + "linkset": [ + { + "anchor": "https://example.com/records/10488385", + "item": [ + { + "href": "https://example.com/records/10488385/files/organization.tar", + "type": "application/x-tar", + }, + { + "href": "https://example.com/records/10488385/files/project.tar", + "type": "application/x-tar", + }, + { + "href": "https://example.com/another/project.tar", + "type": "application/x-tar", + }, + ], + }, + { + "anchor": "https://example.com/api/records/10488385", + "describes": [ + {"href": "https://example.com/records/10488385", "type": "text/html"} + ], + "type": "application/dcat+xml", + }, + ] +} + +DOWNLOAD_FILE_BYTES_CONTENT = b"The content of the file" + + +class MockResponse: + content = DOWNLOAD_FILE_BYTES_CONTENT + + def __init__(self, api_json_response_content): + self.api_json_response_content = api_json_response_content + + def json(self, **kwargs): + return self.api_json_response_content + + def raise_for_status(self): + pass + + +@pytest.fixture(scope="function") +def download_file_bytes_content(): + return DOWNLOAD_FILE_BYTES_CONTENT + + +@patch( + "requests.get", + side_effect=lambda url, headers=None: MockResponse(API_JSON_RESPONSE_CONTENT), +) +def test_openaire_http_reader(_, download_file_bytes_content): + reader = OpenAIREHTTPReader(origin="full", tar_href="/project.tar") + results = [] + for entry in reader.read(): + results.append(entry) + + assert len(results) == 1 + assert isinstance(results[0], io.BytesIO) + assert results[0].read() == download_file_bytes_content + + +@patch( + "requests.get", + side_effect=lambda url, headers=None: MockResponse( + API_JSON_RESPONSE_CONTENT_WRONG_NUMBER_PROJECT_TAR_ITEMS_ERROR + ), +) +def test_openaire_http_reader_wrong_number_tar_items_error(_): + reader = OpenAIREHTTPReader(origin="full", tar_href="/project.tar") + with pytest.raises(ReaderError): + next(reader.read()) + + +def test_openaire_http_reader_unsupported_origin_option(): + reader = OpenAIREHTTPReader(origin="unsupported_origin_option") + with pytest.raises(ReaderError): + next(reader.read()) + + +def test_openaire_http_reader_item_not_implemented(): + reader = OpenAIREHTTPReader() + with pytest.raises(NotImplementedError): + next(reader.read("A fake item")) + + +def test_openaire_http_reader_iter_not_implemented(): + reader = OpenAIREHTTPReader() + with pytest.raises(NotImplementedError): + reader._iter("A fake file pointer") From 72ca0272b4887b8f16c898525ea4a09745fd8eb7 Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 27 Aug 2024 16:16:42 +0200 Subject: [PATCH 04/11] datastreams: affiliations: OpenAIRE transformer and writer adding PIC identifier --- 
invenio_vocabularies/config.py | 9 ++ .../contrib/affiliations/datastreams.py | 95 ++++++++++++- invenio_vocabularies/datastreams/writers.py | 18 ++- invenio_vocabularies/factories.py | 15 ++ tests/contrib/affiliations/conftest.py | 10 ++ .../test_affiliations_datastreams.py | 133 ++++++++++++++++++ 6 files changed, 278 insertions(+), 2 deletions(-) diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py index f92880c6..e749d947 100644 --- a/invenio_vocabularies/config.py +++ b/invenio_vocabularies/config.py @@ -43,8 +43,17 @@ } """"Generic identifier schemes, usable by other vocabularies.""" + +def is_pic(val): + """Test if argument is a Participant Identification Code (PIC).""" + if len(val) != 9: + return False + return val.isdigit() + + VOCABULARIES_AFFILIATION_SCHEMES = { **VOCABULARIES_IDENTIFIER_SCHEMES, + "pic": {"label": _("PIC"), "validator": is_pic}, } """Affiliations allowed identifier schemes.""" diff --git a/invenio_vocabularies/contrib/affiliations/datastreams.py b/invenio_vocabularies/contrib/affiliations/datastreams.py index 381bce2b..cffe6855 100644 --- a/invenio_vocabularies/contrib/affiliations/datastreams.py +++ b/invenio_vocabularies/contrib/affiliations/datastreams.py @@ -10,8 +10,9 @@ """Affiliations datastreams, transformers, writers and readers.""" from flask import current_app -from invenio_i18n import lazy_gettext as _ +from ...datastreams.errors import WriterError +from ...datastreams.transformers import BaseTransformer from ...datastreams.writers import ServiceWriter from ..common.ror.datastreams import RORTransformer @@ -46,16 +47,77 @@ def __init__( ) +class OpenAIREOrganizationTransformer(BaseTransformer): + """OpenAIRE Organization Transformer.""" + + def apply(self, stream_entry, **kwargs): + """Applies the transformation to the stream entry.""" + record = stream_entry.entry + + organization = {"openaire_id": record["id"]} + + for pid in record["pid"]: + if pid["scheme"] == "ROR": + organization["id"] = pid["value"].removeprefix("https://ror.org/") + elif pid["scheme"] == "PIC": + organization["identifiers"] = [ + { + "scheme": "pic", + "identifier": pid["value"], + } + ] + + stream_entry.entry = organization + return stream_entry + + +class OpenAIREAffiliationsServiceWriter(ServiceWriter): + """OpenAIRE Affiliations service writer.""" + + def __init__(self, *args, **kwargs): + """Constructor.""" + service_or_name = kwargs.pop("service_or_name", "affiliations") + # Here we only update and we do not insert, since OpenAIRE data is used to augment existing affiliations + # (with PIC identifiers) and is not used to create new affiliations. 
+ super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs) + + def _entry_id(self, entry): + """Get the id from an entry.""" + return entry["id"] + + def write(self, stream_entry, *args, **kwargs): + """Writes the input entry using a given service.""" + entry = stream_entry.entry + + if not entry["openaire_id"].startswith("openorgs____::"): + raise WriterError([f"Not valid OpenAIRE OpenOrgs id for: {entry}"]) + del entry["openaire_id"] + + if "id" not in entry: + raise WriterError([f"No id for: {entry}"]) + + if "identifiers" not in entry: + raise WriterError([f"No alternative identifiers for: {entry}"]) + + return super().write(stream_entry, *args, **kwargs) + + def write_many(self, stream_entries, *args, **kwargs): + """Writes the input entries using a given service.""" + return super().write_many(stream_entries, *args, **kwargs) + + VOCABULARIES_DATASTREAM_READERS = {} """Affiliations datastream readers.""" VOCABULARIES_DATASTREAM_WRITERS = { "affiliations-service": AffiliationsServiceWriter, + "openaire-affiliations-service": OpenAIREAffiliationsServiceWriter, } """Affiliations datastream writers.""" VOCABULARIES_DATASTREAM_TRANSFORMERS = { "ror-affiliations": AffiliationsRORTransformer, + "openaire-organization": OpenAIREOrganizationTransformer, } """Affiliations datastream transformers.""" @@ -90,3 +152,34 @@ def __init__( An origin is required for the reader. """ + +DATASTREAM_CONFIG_OPENAIRE = { + "readers": [ + {"type": "openaire-http", "args": {"tar_href": "/organization.tar"}}, + { + "type": "tar", + "args": { + "regex": "\\.json.gz$", + "mode": "r", + }, + }, + {"type": "gzip"}, + {"type": "jsonl"}, + ], + "transformers": [ + { + "type": "openaire-organization", + }, + ], + "writers": [ + { + "type": "async", + "args": { + "writer": { + "type": "openaire-affiliations-service", + } + }, + } + ], +} +"""Alternative Data Stream configuration for OpenAIRE Affiliations.""" diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py index f898c1ad..090fc7d8 100644 --- a/invenio_vocabularies/datastreams/writers.py +++ b/invenio_vocabularies/datastreams/writers.py @@ -87,7 +87,23 @@ def _resolve(self, id_): def _do_update(self, entry): vocab_id = self._entry_id(entry) current = self._resolve(vocab_id) - updated = dict(current.to_dict(), **entry) + + # Merge the `current` dictionary with new data in the `entry` dictionary + # by appending to lists at the top level instead of overwriting the list. + updated = current.to_dict() + for key, value in entry.items(): + if ( + key in updated + and isinstance(updated[key], list) + and isinstance(value, list) + ): + for value_item in value: + # TODO: If an identifier was wrong and is then corrected, this will cause duplicated entries. 
+ if value_item not in updated[key]: + updated[key].append(value_item) + else: + updated[key] = value + return StreamEntry(self._service.update(self._identity, vocab_id, updated)) def write(self, stream_entry, *args, **kwargs): diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py index 11135b1a..7d64d974 100644 --- a/invenio_vocabularies/factories.py +++ b/invenio_vocabularies/factories.py @@ -16,6 +16,9 @@ from .contrib.affiliations.datastreams import ( DATASTREAM_CONFIG as affiliations_ds_config, ) +from .contrib.affiliations.datastreams import ( + DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config, +) from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config @@ -95,6 +98,17 @@ def get_service(self): raise NotImplementedError("Service not implemented for Affiliations") +class AffiliationsOpenAIREVocabularyConfig(VocabularyConfig): + """OpenAIRE Affiliations Vocabulary Config.""" + + config = affiliations_openaire_ds_config + vocabulary_name = "affiliations:openaire" + + def get_service(self): + """Get the service for the vocabulary.""" + raise NotImplementedError("Service not implemented for OpenAIRE Affiliations") + + def get_vocabulary_config(vocabulary): """Factory function to get the appropriate Vocabulary Config.""" vocab_config = { @@ -102,6 +116,7 @@ def get_vocabulary_config(vocabulary): "funders": FundersVocabularyConfig, "awards": AwardsVocabularyConfig, "affiliations": AffiliationsVocabularyConfig, + "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig, "subjects": SubjectsVocabularyConfig, } return vocab_config.get(vocabulary, VocabularyConfig)() diff --git a/tests/contrib/affiliations/conftest.py b/tests/contrib/affiliations/conftest.py index c428e297..0c0d6b2b 100644 --- a/tests/contrib/affiliations/conftest.py +++ b/tests/contrib/affiliations/conftest.py @@ -33,6 +33,16 @@ def affiliation_full_data(): } +@pytest.fixture(scope="function") +def openaire_affiliation_full_data(): + """Full OpenAIRE affiliation data.""" + return { + "openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c", + "id": "01ggx4157", + "identifiers": [{"identifier": "999988133", "scheme": "pic"}], + } + + @pytest.fixture(scope="module") def service(): """Affiliations service object.""" diff --git a/tests/contrib/affiliations/test_affiliations_datastreams.py b/tests/contrib/affiliations/test_affiliations_datastreams.py index 243b540d..22c49e16 100644 --- a/tests/contrib/affiliations/test_affiliations_datastreams.py +++ b/tests/contrib/affiliations/test_affiliations_datastreams.py @@ -18,6 +18,8 @@ from invenio_vocabularies.contrib.affiliations.config import affiliation_schemes from invenio_vocabularies.contrib.affiliations.datastreams import ( AffiliationsServiceWriter, + OpenAIREAffiliationsServiceWriter, + OpenAIREOrganizationTransformer, ) from invenio_vocabularies.contrib.common.ror.datastreams import RORTransformer from invenio_vocabularies.datastreams import StreamEntry @@ -118,3 +120,134 @@ def test_affiliations_service_writer_update_non_existing( # not-ideal cleanup affiliation_rec._record.delete(force=True) + + +@pytest.fixture() +def dict_openaire_organization_entry(): + """An example entry from OpenAIRE organization Data Dump.""" + return StreamEntry( + { + "alternativeNames": [ + "European Organization for Nuclear Research", + "Organisation européenne pour la 
recherche nucléaire", + "CERN", + ], + "country": {"code": "CH", "label": "Switzerland"}, + "id": "openorgs____::47efb6602225236c0b207761a8b3a21c", + "legalName": "European Organization for Nuclear Research", + "legalShortName": "CERN", + "pid": [ + {"scheme": "mag_id", "value": "67311998"}, + {"scheme": "ISNI", "value": "000000012156142X"}, + {"scheme": "Wikidata", "value": "Q42944"}, + {"scheme": "PIC", "value": "999988133"}, + {"scheme": "ROR", "value": "https://ror.org/01ggx4157"}, + {"scheme": "OrgReg", "value": "INT1011"}, + {"scheme": "ISNI", "value": "000000012156142X"}, + {"scheme": "FundRef", "value": "100012470"}, + {"scheme": "GRID", "value": "grid.9132.9"}, + {"scheme": "OrgRef", "value": "37351"}, + ], + "websiteUrl": "http://home.web.cern.ch/", + } + ) + + +@pytest.fixture(scope="module") +def expected_from_openaire_json(): + return { + "openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c", + "id": "01ggx4157", + "identifiers": [{"identifier": "999988133", "scheme": "pic"}], + } + + +def test_openaire_organization_transformer( + app, dict_openaire_organization_entry, expected_from_openaire_json +): + transformer = OpenAIREOrganizationTransformer() + assert ( + expected_from_openaire_json + == transformer.apply(dict_openaire_organization_entry).entry + ) + + +def test_openaire_affiliations_service_writer( + app, search_clear, affiliation_full_data, openaire_affiliation_full_data, service +): + # create vocabulary with original service writer + orig_writer = AffiliationsServiceWriter() + orig_affiliation_rec = orig_writer.write(StreamEntry(affiliation_full_data)) + orig_affiliation_dict = orig_affiliation_rec.entry.to_dict() + Affiliation.index.refresh() # refresh index to make changes live + + # update vocabulary and check changes vocabulary with OpenAIRE service writer + writer = OpenAIREAffiliationsServiceWriter(update=True) + _ = writer.write(StreamEntry(openaire_affiliation_full_data)) + Affiliation.index.refresh() # refresh index to make changes live + affiliation_rec = service.read(system_identity, orig_affiliation_rec.entry.id) + affiliation_dict = affiliation_rec.to_dict() + + assert _.entry.id == orig_affiliation_rec.entry.id + + # updating fields changing from one update to the other + orig_affiliation_dict["revision_id"] = affiliation_dict["revision_id"] + orig_affiliation_dict["updated"] = affiliation_dict["updated"] + # Adding the extra identifier coming from OpenAIRE + orig_affiliation_dict["identifiers"].extend( + openaire_affiliation_full_data["identifiers"] + ) + + assert dict(orig_affiliation_dict) == affiliation_dict + + # not-ideal cleanup + affiliation_rec._record.delete(force=True) + + +def test_openaire_affiliations_service_writer_non_openorgs( + app, openaire_affiliation_full_data +): + writer = OpenAIREAffiliationsServiceWriter() + + updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) + updated_openaire_affiliation["openaire_id"] = ( + "pending_org_::627931d047132a4e20dbc4a882eb9a35" + ) + + with pytest.raises(WriterError) as err: + writer.write(StreamEntry(updated_openaire_affiliation)) + + expected_error = [ + f"Not valid OpenAIRE OpenOrgs id for: {updated_openaire_affiliation}" + ] + assert expected_error in err.value.args + + +def test_openaire_affiliations_service_writer_no_id( + app, openaire_affiliation_full_data +): + writer = OpenAIREAffiliationsServiceWriter() + + updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) + del updated_openaire_affiliation["id"] + + with pytest.raises(WriterError) 
as err: + writer.write(StreamEntry(updated_openaire_affiliation)) + + expected_error = [f"No id for: {updated_openaire_affiliation}"] + assert expected_error in err.value.args + + +def test_openaire_affiliations_service_writer_no_alternative_identifiers( + app, openaire_affiliation_full_data +): + writer = OpenAIREAffiliationsServiceWriter() + + updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) + del updated_openaire_affiliation["identifiers"] + + with pytest.raises(WriterError) as err: + writer.write(StreamEntry(updated_openaire_affiliation)) + + expected_error = [f"No alternative identifiers for: {updated_openaire_affiliation}"] + assert expected_error in err.value.args From 23e63979d9251a36dc4c7b5ef3fe974c29527f39 Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 27 Aug 2024 17:03:20 +0200 Subject: [PATCH 05/11] awards: added CORDIS datastreams --- .../contrib/awards/datastreams.py | 119 +++++++++++++++++- .../jsonschemas/awards/award-v1.0.0.json | 42 +++++++ .../mappings/os-v1/awards/award-v1.0.0.json | 26 ++++ .../mappings/os-v2/awards/award-v1.0.0.json | 26 ++++ .../mappings/v7/awards/award-v1.0.0.json | 26 ++++ invenio_vocabularies/contrib/awards/schema.py | 16 ++- .../contrib/awards/serializer.py | 1 + .../contrib/names/datastreams.py | 14 ++- invenio_vocabularies/datastreams/readers.py | 19 ++- .../datastreams/transformers.py | 19 ++- invenio_vocabularies/datastreams/writers.py | 4 + invenio_vocabularies/factories.py | 27 +++- 12 files changed, 326 insertions(+), 13 deletions(-) diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index 22547f39..e9f90126 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -114,18 +114,135 @@ def apply(self, stream_entry, **kwargs): return stream_entry -VOCABULARIES_DATASTREAM_READERS = {} +class CORDISProjectHTTPReader(BaseReader): + """CORDIS Project HTTP Reader returning an in-memory binary stream of the latest CORDIS Horizon Europe project zip file.""" + + def _iter(self, fp, *args, **kwargs): + raise NotImplementedError( + "CORDISProjectHTTPReader downloads one file and therefore does not iterate through items" + ) + + def read(self, item=None, *args, **kwargs): + """Reads the latest CORDIS Horizon Europe project zip file and yields an in-memory binary stream of it.""" + if item: + raise NotImplementedError( + "CORDISProjectHTTPReader does not support being chained after another reader" + ) + + file_url = "https://cordis.europa.eu/data/cordis-HORIZONprojects-xml.zip" + + # Download the ZIP file and fully load the response bytes content in memory. + # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`). + # Using directly `file_resp.raw` is not possible since `zipfile.ZipFile` requires the file-like object to be seekable. + file_resp = requests.get(file_url) + file_resp.raise_for_status() + yield io.BytesIO(file_resp.content) + + +class CORDISProjectTransformer(BaseTransformer): + """Transforms a CORDIS project record into an award record.""" + + def apply(self, stream_entry, **kwargs): + """Applies the transformation to the stream entry.""" + record = stream_entry.entry + award = {} + + # Here `id` is the project ID, which will be used to attach the update to the existing project. 
+ award["id"] = f"00k4n6c32::{record['id']}" + + award["subjects"] = [ + { + "scheme": "EuroSciVoc", + # TODO: Here lowercase while title cased in the subjects vocabulary. + "subject": category["title"], + } + for category in record["relations"]["categories"]["category"] + if category["@classification"] == "euroSciVoc" + ] + + organizations = record["relations"]["associations"]["organization"] + # Projects with a single organization are not wrapped in a list, + # so we do this here to be able to iterate over it. + organizations = organizations if isinstance(organizations, list) else [organizations] + award["organizations"] = [ + { + # TODO: Here the legal name is uppercase. + "organization": organization["legalname"], + "scheme": "pic", + "id": organization["id"], + } + for organization in organizations + ] + + stream_entry.entry = award + return stream_entry + + +class CORDISAwardsServiceWriter(ServiceWriter): + """CORDIS Awards service writer.""" + + def __init__(self, *args, **kwargs): + """Constructor.""" + service_or_name = kwargs.pop("service_or_name", "awards") + # Here we only update and we do not insert, since CORDIS data is used to augment existing awards + # (with subjects and organizations information) and is not used to create new awards. + super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs) + + def _entry_id(self, entry): + """Get the id from an entry.""" + return entry["id"] + + +VOCABULARIES_DATASTREAM_READERS = { + "cordis-project-http": CORDISProjectHTTPReader, +} VOCABULARIES_DATASTREAM_TRANSFORMERS = { "openaire-award": OpenAIREProjectTransformer, + "cordis-award": CORDISProjectTransformer, } """ORCiD Data Streams transformers.""" VOCABULARIES_DATASTREAM_WRITERS = { "awards-service": AwardsServiceWriter, + "cordis-awards-service": CORDISAwardsServiceWriter, } """ORCiD Data Streams transformers.""" +DATASTREAM_CONFIG_CORDIS = { + "readers": [ + # {"type": "cordis-project-http"}, + # { + # "type": "zip", + # "args": { + # "regex": "\\.xml$", + # "mode": "r", + # }, + # }, + { + "type": "xml", + "args": { + "root_element": "project", + }, + }, + ], + "transformers": [ + {"type": "cordis-award"}, + ], + "writers": [ + { + "type": "cordis-awards-service", + "args": { + "identity": system_identity, + }, + } + ], +} +"""Data Stream configuration. + +An origin is required for the reader. 
+""" + DATASTREAM_CONFIG = { "readers": [ { diff --git a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json index bdc2cfa1..8c89e487 100644 --- a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json @@ -42,6 +42,48 @@ }, "program": { "type": "string" + }, + "subjects": { + "description": "Award's subjects.", + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "properties": { + "scheme": { + "description": "Identifier of the subject scheme.", + "$ref": "local://definitions-v1.0.0.json#/identifier" + }, + "subject": { + "description": "Human readable label.", + "type": "string" + } + } + }, + "uniqueItems": true + }, + "organizations": { + "description": "Award's organizations.", + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "properties": { + "scheme": { + "description": "Identifier of the organization scheme.", + "$ref": "local://definitions-v1.0.0.json#/identifier" + }, + "id": { + "description": "Identifier of the organization for the given scheme.", + "$ref": "local://definitions-v1.0.0.json#/identifier" + }, + "organization": { + "description": "Human readable label.", + "type": "string" + } + } + }, + "uniqueItems": true } } } diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json index 90ec8049..86657803 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json @@ -64,6 +64,32 @@ "program": { "type": "keyword" }, + "subjects": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, + "subject": { + "type": "keyword" + } + } + }, + "organizations": { + "properties": { + "scheme": { + "type": "keyword" + }, + "id": { + "type": "keyword" + }, + "organization": { + "type": "keyword" + } + } + }, "funder": { "type": "object", "properties": { diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json index 90ec8049..86657803 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json @@ -64,6 +64,32 @@ "program": { "type": "keyword" }, + "subjects": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, + "subject": { + "type": "keyword" + } + } + }, + "organizations": { + "properties": { + "scheme": { + "type": "keyword" + }, + "id": { + "type": "keyword" + }, + "organization": { + "type": "keyword" + } + } + }, "funder": { "type": "object", "properties": { diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json index 90ec8049..86657803 100644 --- a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json @@ -64,6 +64,32 @@ "program": { "type": "keyword" }, + "subjects": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, + "subject": { + "type": 
"keyword" + } + } + }, + "organizations": { + "properties": { + "scheme": { + "type": "keyword" + }, + "id": { + "type": "keyword" + }, + "organization": { + "type": "keyword" + } + } + }, "funder": { "type": "object", "properties": { diff --git a/invenio_vocabularies/contrib/awards/schema.py b/invenio_vocabularies/contrib/awards/schema.py index 487bafaf..f15453f5 100644 --- a/invenio_vocabularies/contrib/awards/schema.py +++ b/invenio_vocabularies/contrib/awards/schema.py @@ -15,15 +15,25 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode from marshmallow_utils.schemas import IdentifierSchema +from ..subjects.schema import SubjectRelationSchema from ...services.schema import ( BaseVocabularySchema, ModePIDFieldVocabularyMixin, - i18n_strings, + i18n_strings, ContribVocabularyRelationSchema, ) from ..funders.schema import FunderRelationSchema from .config import award_schemes +class AwardOrganizationRelationSchema(ContribVocabularyRelationSchema): + """Schema to define an organization relation in an award.""" + + ftf_name = "organization" + parent_field_name = "organizations" + organization = SanitizedUnicode() + scheme = SanitizedUnicode() + + class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin): """Award schema.""" @@ -46,6 +56,10 @@ class AwardSchema(BaseVocabularySchema, ModePIDFieldVocabularyMixin): program = SanitizedUnicode() + subjects = fields.List(fields.Nested(SubjectRelationSchema)) + + organizations = fields.List(fields.Nested(AwardOrganizationRelationSchema)) + id = SanitizedUnicode( validate=validate.Length(min=1, error=_("PID cannot be blank.")) ) diff --git a/invenio_vocabularies/contrib/awards/serializer.py b/invenio_vocabularies/contrib/awards/serializer.py index 32aa1188..55e8a195 100644 --- a/invenio_vocabularies/contrib/awards/serializer.py +++ b/invenio_vocabularies/contrib/awards/serializer.py @@ -37,4 +37,5 @@ class AwardL10NItemSchema(Schema): acronym = fields.String(dump_only=True) program = fields.String(dump_only=True) funder = fields.Nested(FunderRelationSchema, dump_only=True) + # TODO: Add subjects and organizations here? identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True) diff --git a/invenio_vocabularies/contrib/names/datastreams.py b/invenio_vocabularies/contrib/names/datastreams.py index d682526b..9929d173 100644 --- a/invenio_vocabularies/contrib/names/datastreams.py +++ b/invenio_vocabularies/contrib/names/datastreams.py @@ -243,7 +243,12 @@ def _entry_id(self, entry): "regex": "\\.xml$", }, }, - {"type": "xml"}, + { + "type": "xml", + "args": { + "root_element": "record", + }, + }, ], "transformers": [{"type": "orcid"}], "writers": [ @@ -266,7 +271,12 @@ def _entry_id(self, entry): { "type": "orcid-data-sync", }, - {"type": "xml"}, + { + "type": "xml", + "args": { + "root_element": "record", + }, + }, ], "transformers": [{"type": "orcid"}], "writers": [ diff --git a/invenio_vocabularies/datastreams/readers.py b/invenio_vocabularies/datastreams/readers.py index 4f017eb0..b559cf58 100644 --- a/invenio_vocabularies/datastreams/readers.py +++ b/invenio_vocabularies/datastreams/readers.py @@ -224,19 +224,30 @@ def _iter(self, fp, *args, **kwargs): class XMLReader(BaseReader): """XML reader.""" + def __init__(self, root_element=None, *args, **kwargs): + """Constructor.""" + # TODO: How to make root_element mandatory? 
+ self.root_element = root_element + super().__init__(*args, **kwargs) + + def _iter(self, fp, *args, **kwargs): """Read and parse an XML file to dict.""" # NOTE: We parse HTML, to skip XML validation and strip XML namespaces record = None try: xml_tree = fromstring(fp) - record = etree_to_dict(xml_tree).get("record") + xml_dict = etree_to_dict(xml_tree) except Exception as e: xml_tree = html_parse(fp).getroot() - record = etree_to_dict(xml_tree)["html"]["body"].get("record") + xml_dict = etree_to_dict(xml_tree)["html"]["body"] - if not record: - raise ReaderError(f"Record not found in XML entry.") + if self.root_element: + record = xml_dict.get(self.root_element) + if not record: + raise ReaderError(f"Root element '{self.root_element}' not found in XML entry.") + else: + record = xml_dict yield record diff --git a/invenio_vocabularies/datastreams/transformers.py b/invenio_vocabularies/datastreams/transformers.py index d4274a68..cb495f47 100644 --- a/invenio_vocabularies/datastreams/transformers.py +++ b/invenio_vocabularies/datastreams/transformers.py @@ -32,6 +32,13 @@ def apply(self, stream_entry, *args, **kwargs): class XMLTransformer(BaseTransformer): """XML transformer.""" + def __init__( + self, root_element=None, *args, **kwargs + ): + """Initializes the transformer.""" + self.root_element = root_element + super().__init__(*args, **kwargs) + @classmethod def _xml_to_etree(cls, xml): """Converts XML to a lxml etree.""" @@ -43,10 +50,14 @@ def apply(self, stream_entry, **kwargs): Requires the root element to be named "record". """ xml_tree = self._xml_to_etree(stream_entry.entry) - record = etree_to_dict(xml_tree)["html"]["body"].get("record") - - if not record: - raise TransformerError(f"Record not found in XML entry.") + xml_dict = etree_to_dict(xml_tree)["html"]["body"] + + if self.root_element: + record = xml_dict.get(self.root_element) + if not record: + raise TransformerError(f"Root element '{self.root_element}' not found in XML entry.") + else: + record = xml_dict stream_entry.entry = record return stream_entry diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py index 090fc7d8..060fcfd5 100644 --- a/invenio_vocabularies/datastreams/writers.py +++ b/invenio_vocabularies/datastreams/writers.py @@ -88,6 +88,10 @@ def _do_update(self, entry): vocab_id = self._entry_id(entry) current = self._resolve(vocab_id) + # updated = dict(current.to_dict(), **entry) + # TODO: Try to use _record instead of to_dict() + # updated = dict(current._record, **entry) + # Merge the `current` dictionary with new data in the `entry` dictionary # by appending to lists at the top level instead of overwriting the list. 
updated = current.to_dict() diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py index 7d64d974..aa73b142 100644 --- a/invenio_vocabularies/factories.py +++ b/invenio_vocabularies/factories.py @@ -19,10 +19,11 @@ from .contrib.affiliations.datastreams import ( DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config, ) -from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config +from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config, DATASTREAM_CONFIG_CORDIS as awards_cordis_ds_config from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config +from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config class VocabularyConfig: @@ -87,6 +88,17 @@ def get_service(self): raise NotImplementedError("Service not implemented for Awards") +class AwardsCordisVocabularyConfig(VocabularyConfig): + """Awards Vocabulary Config.""" + + config = awards_cordis_ds_config + vocabulary_name = "awards:cordis" + + def get_service(self): + """Get the service for the vocabulary.""" + raise NotImplementedError("Service not implemented for CORDIS Awards") + + class AffiliationsVocabularyConfig(VocabularyConfig): """Affiliations Vocabulary Config.""" @@ -109,14 +121,27 @@ def get_service(self): raise NotImplementedError("Service not implemented for OpenAIRE Affiliations") +class ProjectsVocabularyConfig(VocabularyConfig): # TODO: Delete this config + """Projects Vocabulary Config.""" + + config = projects_ds_config + vocabulary_name = "projects" + + def get_service(self): + """Get the service for the vocabulary.""" + raise NotImplementedError("Service not implemented for Projects") + + def get_vocabulary_config(vocabulary): """Factory function to get the appropriate Vocabulary Config.""" vocab_config = { "names": NamesVocabularyConfig, "funders": FundersVocabularyConfig, "awards": AwardsVocabularyConfig, + "awards:cordis": AwardsCordisVocabularyConfig, "affiliations": AffiliationsVocabularyConfig, "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig, + "projects": ProjectsVocabularyConfig, "subjects": SubjectsVocabularyConfig, } return vocab_config.get(vocabulary, VocabularyConfig)() From 4a173fc67b354b952221c212c6035f3e5bd8c682 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Mon, 16 Sep 2024 11:09:29 +0200 Subject: [PATCH 06/11] awards: added subjects info in cordis --- .../contrib/awards/datastreams.py | 30 ++++++++++--------- .../jsonschemas/awards/award-v1.0.0.json | 18 +++-------- .../mappings/os-v1/awards/award-v1.0.0.json | 10 ++----- .../mappings/os-v2/awards/award-v1.0.0.json | 10 ++----- .../mappings/v7/awards/award-v1.0.0.json | 10 ++----- invenio_vocabularies/contrib/awards/schema.py | 5 ++-- invenio_vocabularies/datastreams/readers.py | 5 ++-- .../datastreams/transformers.py | 8 ++--- invenio_vocabularies/factories.py | 24 ++++++++------- 9 files changed, 50 insertions(+), 70 deletions(-) diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index e9f90126..84fc0f56 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -8,10 +8,14 @@ """Awards datastreams, transformers, writers and readers.""" +import io + +import requests from invenio_access.permissions import system_identity from 
invenio_i18n import lazy_gettext as _ from ...datastreams.errors import TransformerError +from ...datastreams.readers import BaseReader from ...datastreams.transformers import BaseTransformer from ...datastreams.writers import ServiceWriter from .config import awards_ec_ror_id, awards_openaire_funders_mapping @@ -151,11 +155,7 @@ def apply(self, stream_entry, **kwargs): award["id"] = f"00k4n6c32::{record['id']}" award["subjects"] = [ - { - "scheme": "EuroSciVoc", - # TODO: Here lowercase while title cased in the subjects vocabulary. - "subject": category["title"], - } + {"id": f"euroscivoc:{category['code'].split('/')[-1]}"} for category in record["relations"]["categories"]["category"] if category["@classification"] == "euroSciVoc" ] @@ -163,7 +163,9 @@ def apply(self, stream_entry, **kwargs): organizations = record["relations"]["associations"]["organization"] # Projects with a single organization are not wrapped in a list, # so we do this here to be able to iterate over it. - organizations = organizations if isinstance(organizations, list) else [organizations] + organizations = ( + organizations if isinstance(organizations, list) else [organizations] + ) award["organizations"] = [ { # TODO: Here the legal name is uppercase. @@ -211,14 +213,14 @@ def _entry_id(self, entry): DATASTREAM_CONFIG_CORDIS = { "readers": [ - # {"type": "cordis-project-http"}, - # { - # "type": "zip", - # "args": { - # "regex": "\\.xml$", - # "mode": "r", - # }, - # }, + {"type": "cordis-project-http"}, + { + "type": "zip", + "args": { + "regex": "\\.xml$", + "mode": "r", + }, + }, { "type": "xml", "args": { diff --git a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json index 8c89e487..04350a7b 100644 --- a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json @@ -46,18 +46,9 @@ "subjects": { "description": "Award's subjects.", "type": "array", - "items": { - "type": "object", - "additionalProperties": false, - "properties": { - "scheme": { - "description": "Identifier of the subject scheme.", - "$ref": "local://definitions-v1.0.0.json#/identifier" - }, - "subject": { - "description": "Human readable label.", - "type": "string" - } + "properties": { + "id": { + "$ref": "local://definitions-v1.0.0.json#/identifier" } }, "uniqueItems": true @@ -82,8 +73,7 @@ "type": "string" } } - }, - "uniqueItems": true + } } } } diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json index 86657803..49228392 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json @@ -58,7 +58,7 @@ "acronym": { "type": "keyword", "fields": { - "text": { "type": "text"} + "text": { "type": "text" } } }, "program": { @@ -66,13 +66,7 @@ }, "subjects": { "properties": { - "identifier": { - "type": "keyword" - }, - "scheme": { - "type": "keyword" - }, - "subject": { + "id": { "type": "keyword" } } diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json index 86657803..49228392 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +++ 
b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json @@ -58,7 +58,7 @@ "acronym": { "type": "keyword", "fields": { - "text": { "type": "text"} + "text": { "type": "text" } } }, "program": { @@ -66,13 +66,7 @@ }, "subjects": { "properties": { - "identifier": { - "type": "keyword" - }, - "scheme": { - "type": "keyword" - }, - "subject": { + "id": { "type": "keyword" } } diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json index 86657803..49228392 100644 --- a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json @@ -58,7 +58,7 @@ "acronym": { "type": "keyword", "fields": { - "text": { "type": "text"} + "text": { "type": "text" } } }, "program": { @@ -66,13 +66,7 @@ }, "subjects": { "properties": { - "identifier": { - "type": "keyword" - }, - "scheme": { - "type": "keyword" - }, - "subject": { + "id": { "type": "keyword" } } diff --git a/invenio_vocabularies/contrib/awards/schema.py b/invenio_vocabularies/contrib/awards/schema.py index f15453f5..9eb77fca 100644 --- a/invenio_vocabularies/contrib/awards/schema.py +++ b/invenio_vocabularies/contrib/awards/schema.py @@ -15,13 +15,14 @@ from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode from marshmallow_utils.schemas import IdentifierSchema -from ..subjects.schema import SubjectRelationSchema from ...services.schema import ( BaseVocabularySchema, + ContribVocabularyRelationSchema, ModePIDFieldVocabularyMixin, - i18n_strings, ContribVocabularyRelationSchema, + i18n_strings, ) from ..funders.schema import FunderRelationSchema +from ..subjects.schema import SubjectRelationSchema from .config import award_schemes diff --git a/invenio_vocabularies/datastreams/readers.py b/invenio_vocabularies/datastreams/readers.py index b559cf58..5ef11ed7 100644 --- a/invenio_vocabularies/datastreams/readers.py +++ b/invenio_vocabularies/datastreams/readers.py @@ -230,7 +230,6 @@ def __init__(self, root_element=None, *args, **kwargs): self.root_element = root_element super().__init__(*args, **kwargs) - def _iter(self, fp, *args, **kwargs): """Read and parse an XML file to dict.""" # NOTE: We parse HTML, to skip XML validation and strip XML namespaces @@ -245,7 +244,9 @@ def _iter(self, fp, *args, **kwargs): if self.root_element: record = xml_dict.get(self.root_element) if not record: - raise ReaderError(f"Root element '{self.root_element}' not found in XML entry.") + raise ReaderError( + f"Root element '{self.root_element}' not found in XML entry." 
+ ) else: record = xml_dict diff --git a/invenio_vocabularies/datastreams/transformers.py b/invenio_vocabularies/datastreams/transformers.py index cb495f47..822cb91a 100644 --- a/invenio_vocabularies/datastreams/transformers.py +++ b/invenio_vocabularies/datastreams/transformers.py @@ -32,9 +32,7 @@ def apply(self, stream_entry, *args, **kwargs): class XMLTransformer(BaseTransformer): """XML transformer.""" - def __init__( - self, root_element=None, *args, **kwargs - ): + def __init__(self, root_element=None, *args, **kwargs): """Initializes the transformer.""" self.root_element = root_element super().__init__(*args, **kwargs) @@ -55,7 +53,9 @@ def apply(self, stream_entry, **kwargs): if self.root_element: record = xml_dict.get(self.root_element) if not record: - raise TransformerError(f"Root element '{self.root_element}' not found in XML entry.") + raise TransformerError( + f"Root element '{self.root_element}' not found in XML entry." + ) else: record = xml_dict diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py index aa73b142..7c7a6eb8 100644 --- a/invenio_vocabularies/factories.py +++ b/invenio_vocabularies/factories.py @@ -19,11 +19,15 @@ from .contrib.affiliations.datastreams import ( DATASTREAM_CONFIG_OPENAIRE as affiliations_openaire_ds_config, ) -from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config, DATASTREAM_CONFIG_CORDIS as awards_cordis_ds_config +from .contrib.awards.datastreams import DATASTREAM_CONFIG as awards_ds_config +from .contrib.awards.datastreams import ( + DATASTREAM_CONFIG_CORDIS as awards_cordis_ds_config, +) from .contrib.funders.datastreams import DATASTREAM_CONFIG as funders_ds_config from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config -from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config + +# from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config class VocabularyConfig: @@ -121,15 +125,15 @@ def get_service(self): raise NotImplementedError("Service not implemented for OpenAIRE Affiliations") -class ProjectsVocabularyConfig(VocabularyConfig): # TODO: Delete this config - """Projects Vocabulary Config.""" +# class ProjectsVocabularyConfig(VocabularyConfig): # TODO: Delete this config +# """Projects Vocabulary Config.""" - config = projects_ds_config - vocabulary_name = "projects" +# config = projects_ds_config +# vocabulary_name = "projects" - def get_service(self): - """Get the service for the vocabulary.""" - raise NotImplementedError("Service not implemented for Projects") +# def get_service(self): +# """Get the service for the vocabulary.""" +# raise NotImplementedError("Service not implemented for Projects") def get_vocabulary_config(vocabulary): @@ -141,7 +145,7 @@ def get_vocabulary_config(vocabulary): "awards:cordis": AwardsCordisVocabularyConfig, "affiliations": AffiliationsVocabularyConfig, "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig, - "projects": ProjectsVocabularyConfig, + # "projects": ProjectsVocabularyConfig, "subjects": SubjectsVocabularyConfig, } return vocab_config.get(vocabulary, VocabularyConfig)() From 433eb1a351acf31602960952e751ae91ec2caafd Mon Sep 17 00:00:00 2001 From: Pablo Tamarit Date: Tue, 17 Sep 2024 16:36:47 +0200 Subject: [PATCH 07/11] awards: support CORDIS projects from H2020 and FP7 --- .../contrib/awards/datastreams.py | 41 +++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) 
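[Illustrative sketch, not part of the commit: the core of this patch is the origin-to-URL
selection added to CORDISProjectHTTPReader.read(). A minimal standalone version of that mapping
is shown here for review purposes; the dict and helper names are hypothetical, while the URLs
and accepted --origin values are the ones introduced in the diff below.]

CORDIS_ZIP_URLS = {
    # Bulk XML exports published by CORDIS, one ZIP per framework programme.
    "HE": "https://cordis.europa.eu/data/cordis-HORIZONprojects-xml.zip",  # Horizon Europe
    "H2020": "https://cordis.europa.eu/data/cordis-h2020projects-xml.zip",  # Horizon 2020
    "FP7": "https://cordis.europa.eu/data/cordis-fp7projects-xml.zip",  # 7th Framework Programme
}


def cordis_zip_url(origin):
    """Return the CORDIS bulk-download ZIP URL for a given --origin value."""
    try:
        return CORDIS_ZIP_URLS[origin]
    except KeyError:
        raise ValueError(
            "origin must be 'HE' (Horizon Europe), 'H2020' (Horizon 2020) or 'FP7'"
        )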
diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index 84fc0f56..31fd79eb 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -20,6 +20,8 @@ from ...datastreams.writers import ServiceWriter from .config import awards_ec_ror_id, awards_openaire_funders_mapping +from invenio_vocabularies.datastreams.errors import ReaderError + class AwardsServiceWriter(ServiceWriter): """Funders service writer.""" @@ -133,7 +135,16 @@ def read(self, item=None, *args, **kwargs): "CORDISProjectHTTPReader does not support being chained after another reader" ) - file_url = "https://cordis.europa.eu/data/cordis-HORIZONprojects-xml.zip" + if self._origin == "HE": + file_url = "https://cordis.europa.eu/data/cordis-HORIZONprojects-xml.zip" + elif self._origin == "H2020": + file_url = "https://cordis.europa.eu/data/cordis-h2020projects-xml.zip" + elif self._origin == "FP7": + file_url = "https://cordis.europa.eu/data/cordis-fp7projects-xml.zip" + else: + raise ReaderError( + "The --origin option should be either 'HE' (for Horizon Europe) or 'H2020' (for Horizon 2020) or 'FP7'" + ) # Download the ZIP file and fully load the response bytes content in memory. # The bytes content are then wrapped by a BytesIO to be file-like object (as required by `zipfile.ZipFile`). @@ -166,15 +177,31 @@ def apply(self, stream_entry, **kwargs): organizations = ( organizations if isinstance(organizations, list) else [organizations] ) - award["organizations"] = [ - { + award["organizations"] = [] + for organization in organizations: + # Some organizations in FP7 projects do not have a "legalname" key, + # for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902. + # In this case, fully skip the organization entry. + if "legalname" not in organization: + continue + + organization_data = { # TODO: Here the legal name is uppercase. "organization": organization["legalname"], - "scheme": "pic", - "id": organization["id"], } - for organization in organizations - ] + + # Some organizations in FP7 projects do not have an "id" key (the PIC identifier), + # for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693. + # In this case, still store the name but skip the identifier part. 
+ if "id" in organization: + organization_data.update( + { + "scheme": "pic", + "id": organization["id"], + } + ) + + award["organizations"].append(organization_data) stream_entry.entry = award return stream_entry From 74297506593ef1c5769972c3166db35477ceb843 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 18 Sep 2024 15:33:35 +0200 Subject: [PATCH 08/11] awards: added subjects in relations and serializer --- invenio_vocabularies/contrib/awards/awards.py | 19 +++++++++++++++---- .../contrib/awards/serializer.py | 8 +++++++- invenio_vocabularies/datastreams/writers.py | 18 +----------------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/invenio_vocabularies/contrib/awards/awards.py b/invenio_vocabularies/contrib/awards/awards.py index c111e7b1..8f9f9caf 100644 --- a/invenio_vocabularies/contrib/awards/awards.py +++ b/invenio_vocabularies/contrib/awards/awards.py @@ -18,24 +18,35 @@ from invenio_records.dumpers import SearchDumper from invenio_records.dumpers.indexedat import IndexedAtDumperExt from invenio_records.dumpers.relations import RelationDumperExt -from invenio_records.systemfields import RelationsField +from invenio_records.systemfields import MultiRelationsField from invenio_records_resources.factories.factory import RecordTypeFactory -from invenio_records_resources.records.systemfields import ModelPIDField, PIDRelation +from invenio_records_resources.records.systemfields import ( + ModelPIDField, + PIDListRelation, + PIDRelation, +) from invenio_records_resources.resources.records.headers import etag_headers from ...services.permissions import PermissionPolicy from ..funders.api import Funder +from ..subjects.api import Subject from .config import AwardsSearchOptions, service_components from .schema import AwardSchema from .serializer import AwardL10NItemSchema -award_relations = RelationsField( +award_relations = MultiRelationsField( funders=PIDRelation( "funder", keys=["name"], pid_field=Funder.pid, cache_key="funder", - ) + ), + subjects=PIDListRelation( + "subjects", + keys=["subject", "scheme", "props"], + pid_field=Subject.pid, + cache_key="subjects", + ), ) record_type = RecordTypeFactory( diff --git a/invenio_vocabularies/contrib/awards/serializer.py b/invenio_vocabularies/contrib/awards/serializer.py index 55e8a195..9882202e 100644 --- a/invenio_vocabularies/contrib/awards/serializer.py +++ b/invenio_vocabularies/contrib/awards/serializer.py @@ -12,6 +12,9 @@ from invenio_vocabularies.resources import L10NString +from ..subjects.schema import SubjectRelationSchema +from .schema import AwardOrganizationRelationSchema + class IdentifierSchema(Schema): """Identifier scheme.""" @@ -37,5 +40,8 @@ class AwardL10NItemSchema(Schema): acronym = fields.String(dump_only=True) program = fields.String(dump_only=True) funder = fields.Nested(FunderRelationSchema, dump_only=True) - # TODO: Add subjects and organizations here? 
+ organizations = fields.List( + fields.Nested(AwardOrganizationRelationSchema), dump_only=True + ) + subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True) identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True) diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py index 060fcfd5..6bb3aeb0 100644 --- a/invenio_vocabularies/datastreams/writers.py +++ b/invenio_vocabularies/datastreams/writers.py @@ -88,26 +88,10 @@ def _do_update(self, entry): vocab_id = self._entry_id(entry) current = self._resolve(vocab_id) - # updated = dict(current.to_dict(), **entry) + updated = dict(current.to_dict(), **entry) # TODO: Try to use _record instead of to_dict() # updated = dict(current._record, **entry) - # Merge the `current` dictionary with new data in the `entry` dictionary - # by appending to lists at the top level instead of overwriting the list. - updated = current.to_dict() - for key, value in entry.items(): - if ( - key in updated - and isinstance(updated[key], list) - and isinstance(value, list) - ): - for value_item in value: - # TODO: If an identifier was wrong and is then corrected, this will cause duplicated entries. - if value_item not in updated[key]: - updated[key].append(value_item) - else: - updated[key] = value - return StreamEntry(self._service.update(self._identity, vocab_id, updated)) def write(self, stream_entry, *args, **kwargs): From 780c5b6e1712ca9d2202e9665a1f145bf084fc1b Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Thu, 19 Sep 2024 15:14:20 +0200 Subject: [PATCH 09/11] awards: updated mappings and tests --- invenio_vocabularies/contrib/awards/awards.py | 2 +- .../contrib/awards/datastreams.py | 12 ++- .../jsonschemas/awards/award-v1.0.0.json | 24 ++++++ .../mappings/os-v1/awards/award-v1.0.0.json | 19 ++++- .../mappings/os-v2/awards/award-v1.0.0.json | 19 ++++- .../mappings/v7/awards/award-v1.0.0.json | 19 ++++- .../contrib/awards/serializer.py | 4 +- invenio_vocabularies/factories.py | 12 --- .../test_affiliations_datastreams.py | 2 +- .../contrib/awards/test_awards_datastreams.py | 80 +++++++++++++++++++ tests/datastreams/test_transformers.py | 9 ++- 11 files changed, 176 insertions(+), 26 deletions(-) diff --git a/invenio_vocabularies/contrib/awards/awards.py b/invenio_vocabularies/contrib/awards/awards.py index 8f9f9caf..3440c3b1 100644 --- a/invenio_vocabularies/contrib/awards/awards.py +++ b/invenio_vocabularies/contrib/awards/awards.py @@ -43,7 +43,7 @@ ), subjects=PIDListRelation( "subjects", - keys=["subject", "scheme", "props"], + keys=["subject", "scheme", "identifiers", "props"], pid_field=Subject.pid, cache_key="subjects", ), diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index 31fd79eb..6891b45a 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -14,14 +14,14 @@ from invenio_access.permissions import system_identity from invenio_i18n import lazy_gettext as _ +from invenio_vocabularies.datastreams.errors import ReaderError + from ...datastreams.errors import TransformerError from ...datastreams.readers import BaseReader from ...datastreams.transformers import BaseTransformer from ...datastreams.writers import ServiceWriter from .config import awards_ec_ror_id, awards_openaire_funders_mapping -from invenio_vocabularies.datastreams.errors import ReaderError - class AwardsServiceWriter(ServiceWriter): """Funders service 
writer.""" @@ -165,10 +165,14 @@ def apply(self, stream_entry, **kwargs): # Here `id` is the project ID, which will be used to attach the update to the existing project. award["id"] = f"00k4n6c32::{record['id']}" + categories = record["relations"]["categories"]["category"] + if isinstance(categories, dict): + categories = [categories] + award["subjects"] = [ {"id": f"euroscivoc:{category['code'].split('/')[-1]}"} - for category in record["relations"]["categories"]["category"] - if category["@classification"] == "euroSciVoc" + for category in categories + if category.get("@classification") == "euroSciVoc" ] organizations = record["relations"]["associations"]["organization"] diff --git a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json index 04350a7b..a9ccac42 100644 --- a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json @@ -49,6 +49,30 @@ "properties": { "id": { "$ref": "local://definitions-v1.0.0.json#/identifier" + }, + "scheme": { + "description": "Identifier of the subject scheme.", + "$ref": "local://definitions-v1.0.0.json#/identifier" + }, + "subject": { + "description": "Human readable label.", + "type": "string" + }, + "props": { + "type": "object", + "patternProperties": { + "^.*$": { + "type": "string" + } + } + }, + "identifiers": { + "description": "Alternate identifiers for the subject.", + "type": "array", + "items": { + "$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme" + }, + "uniqueItems": true } }, "uniqueItems": true diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json index 49228392..797c3654 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "strict", + "dynamic": "true", "properties": { "$schema": { "type": "keyword", @@ -66,8 +66,25 @@ }, "subjects": { "properties": { + "@v": { + "type": "keyword" + }, "id": { "type": "keyword" + }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json index 49228392..797c3654 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "strict", + "dynamic": "true", "properties": { "$schema": { "type": "keyword", @@ -66,8 +66,25 @@ }, "subjects": { "properties": { + "@v": { + "type": "keyword" + }, "id": { "type": "keyword" + }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json index 49228392..797c3654 100644 --- a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +++ 
b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "strict", + "dynamic": "true", "properties": { "$schema": { "type": "keyword", @@ -66,8 +66,25 @@ }, "subjects": { "properties": { + "@v": { + "type": "keyword" + }, "id": { "type": "keyword" + }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } } } }, diff --git a/invenio_vocabularies/contrib/awards/serializer.py b/invenio_vocabularies/contrib/awards/serializer.py index 9882202e..4b7704d7 100644 --- a/invenio_vocabularies/contrib/awards/serializer.py +++ b/invenio_vocabularies/contrib/awards/serializer.py @@ -40,8 +40,8 @@ class AwardL10NItemSchema(Schema): acronym = fields.String(dump_only=True) program = fields.String(dump_only=True) funder = fields.Nested(FunderRelationSchema, dump_only=True) + subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True) + identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True) organizations = fields.List( fields.Nested(AwardOrganizationRelationSchema), dump_only=True ) - subjects = fields.List(fields.Nested(SubjectRelationSchema), dump_only=True) - identifiers = fields.List(fields.Nested(IdentifierSchema), dump_only=True) diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py index 7c7a6eb8..ddf11982 100644 --- a/invenio_vocabularies/factories.py +++ b/invenio_vocabularies/factories.py @@ -125,17 +125,6 @@ def get_service(self): raise NotImplementedError("Service not implemented for OpenAIRE Affiliations") -# class ProjectsVocabularyConfig(VocabularyConfig): # TODO: Delete this config -# """Projects Vocabulary Config.""" - -# config = projects_ds_config -# vocabulary_name = "projects" - -# def get_service(self): -# """Get the service for the vocabulary.""" -# raise NotImplementedError("Service not implemented for Projects") - - def get_vocabulary_config(vocabulary): """Factory function to get the appropriate Vocabulary Config.""" vocab_config = { @@ -145,7 +134,6 @@ def get_vocabulary_config(vocabulary): "awards:cordis": AwardsCordisVocabularyConfig, "affiliations": AffiliationsVocabularyConfig, "affiliations:openaire": AffiliationsOpenAIREVocabularyConfig, - # "projects": ProjectsVocabularyConfig, "subjects": SubjectsVocabularyConfig, } return vocab_config.get(vocabulary, VocabularyConfig)() diff --git a/tests/contrib/affiliations/test_affiliations_datastreams.py b/tests/contrib/affiliations/test_affiliations_datastreams.py index 22c49e16..a7d3c980 100644 --- a/tests/contrib/affiliations/test_affiliations_datastreams.py +++ b/tests/contrib/affiliations/test_affiliations_datastreams.py @@ -140,8 +140,8 @@ def dict_openaire_organization_entry(): {"scheme": "mag_id", "value": "67311998"}, {"scheme": "ISNI", "value": "000000012156142X"}, {"scheme": "Wikidata", "value": "Q42944"}, - {"scheme": "PIC", "value": "999988133"}, {"scheme": "ROR", "value": "https://ror.org/01ggx4157"}, + {"scheme": "PIC", "value": "999988133"}, {"scheme": "OrgReg", "value": "INT1011"}, {"scheme": "ISNI", "value": "000000012156142X"}, {"scheme": "FundRef", "value": "100012470"}, diff --git a/tests/contrib/awards/test_awards_datastreams.py b/tests/contrib/awards/test_awards_datastreams.py index 0a43188c..a6970d56 100644 --- a/tests/contrib/awards/test_awards_datastreams.py +++ b/tests/contrib/awards/test_awards_datastreams.py @@ -16,10 +16,12 @@ from 
invenio_vocabularies.contrib.awards.api import Award from invenio_vocabularies.contrib.awards.datastreams import ( AwardsServiceWriter, + CORDISProjectTransformer, OpenAIREProjectTransformer, ) from invenio_vocabularies.datastreams import StreamEntry from invenio_vocabularies.datastreams.errors import WriterError +from invenio_vocabularies.datastreams.readers import XMLReader @pytest.fixture(scope="function") @@ -112,6 +114,72 @@ def expected_from_award_json_ec(): } +CORDIS_PROJECT_XML = bytes( + """ + + en + en + 264556 + 101117736 + Time2SWITCH + + 10.3030/101117736 + + + + + en + 1908489 + 999979888 + ATU37675002 + TECHNISCHE UNIVERSITAET WIEN + TU WIEN + Institute of Electrodynamics, Microwave and Circuit Engineer + + + + en + de,en,es,fr,it,pl + HES + Higher or Secondary Education Establishments + /Higher or Secondary Education Establishments + + + + + + + + en + de,en,es,fr,it,pl + /21/39/225 + oncology + /medical and health sciences/clinical medicine/oncology + + + + + """, + encoding="utf-8", +) + + +@pytest.fixture(scope="module") +def expected_from_cordis_project_xml(): + return { + "id": "00k4n6c32::101117736", + "subjects": [{"id": "euroscivoc:225"}], + "organizations": [ + { + "id": "999979888", + "scheme": "pic", + "organization": "TECHNISCHE UNIVERSITAET WIEN", + } + ], + } + + def test_awards_transformer(app, dict_award_entry, expected_from_award_json): transformer = OpenAIREProjectTransformer() assert expected_from_award_json == transformer.apply(dict_award_entry).entry @@ -239,3 +307,15 @@ def test_awards_service_writer_update_non_existing( # not-ideal cleanup award_rec._record.delete(force=True) + + +def test_awards_cordis_transformer( + expected_from_cordis_project_xml, +): + reader = XMLReader() + award = next(reader.read(CORDIS_PROJECT_XML)) + + cordis_transformer = CORDISProjectTransformer() + transformed_award_data = cordis_transformer.apply(StreamEntry(award["project"])) + + assert transformed_award_data.entry == expected_from_cordis_project_xml diff --git a/tests/datastreams/test_transformers.py b/tests/datastreams/test_transformers.py index aab05aae..764bde63 100644 --- a/tests/datastreams/test_transformers.py +++ b/tests/datastreams/test_transformers.py @@ -18,8 +18,10 @@ @pytest.fixture(scope="module") def expected_from_xml(): return { - "field_one": "value", - "multi_field": {"some": "value", "another": "value too"}, + "record": { + "field_one": "value", + "multi_field": {"some": "value", "another": "value too"}, + } } @@ -55,6 +57,7 @@ def test_bad_xml_transformer(): ) ) - transformer = XMLTransformer() + transformer = XMLTransformer(root_element="field_two") + with pytest.raises(TransformerError): transformer.apply(bytes_xml_entry) From 69d4dc559021f9b6488dcd245c26fbab9636e8e6 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Fri, 20 Sep 2024 09:56:03 +0200 Subject: [PATCH 10/11] awards: fixed program in openaire transformer --- invenio_vocabularies/contrib/awards/datastreams.py | 5 +---- .../contrib/awards/mappings/os-v1/awards/award-v1.0.0.json | 2 +- .../contrib/awards/mappings/os-v2/awards/award-v1.0.0.json | 2 +- .../contrib/awards/mappings/v7/awards/award-v1.0.0.json | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index 6891b45a..88e47af7 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -73,10 +73,7 @@ def apply(self, stream_entry, **kwargs): funding = 
next(iter(record.get("funding", [])), None) if funding: - funding_stream_id = funding.get("funding_stream", {}).get("id", "") - # Example funding stream ID: `EC::HE::HORIZON-AG-UN`. We need the `EC` - # string, i.e. the second "part" of the identifier. - program = next(iter(funding_stream_id.split("::")[1:2]), "") + program = funding.get("fundingStream", {}).get("id", "") if program: award["program"] = program diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json index 797c3654..7da72ee5 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "true", + "dynamic": "strict", "properties": { "$schema": { "type": "keyword", diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json index 797c3654..7da72ee5 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "true", + "dynamic": "strict", "properties": { "$schema": { "type": "keyword", diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json index 797c3654..7da72ee5 100644 --- a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json @@ -11,7 +11,7 @@ } } ], - "dynamic": "true", + "dynamic": "strict", "properties": { "$schema": { "type": "keyword", From efe917722bad5a25a22444d939bf6e6efad6ca03 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 2 Oct 2024 09:13:58 +0200 Subject: [PATCH 11/11] awards: refactor and mapping updates --- .../contrib/affiliations/datastreams.py | 27 +++--- .../contrib/awards/datastreams.py | 85 ++++++++++--------- .../jsonschemas/awards/award-v1.0.0.json | 3 +- .../mappings/os-v1/awards/award-v1.0.0.json | 6 ++ .../mappings/os-v2/awards/award-v1.0.0.json | 6 ++ .../mappings/v7/awards/award-v1.0.0.json | 6 ++ .../subjects/euroscivoc/datastreams.py | 4 +- invenio_vocabularies/datastreams/readers.py | 1 - invenio_vocabularies/datastreams/writers.py | 22 +++-- invenio_vocabularies/factories.py | 2 - tests/contrib/affiliations/conftest.py | 18 +++- .../test_affiliations_datastreams.py | 76 ++++++++++------- .../contrib/awards/test_awards_datastreams.py | 34 ++++---- .../test_subjects_euroscivoc_datastream.py | 14 +-- 14 files changed, 184 insertions(+), 120 deletions(-) diff --git a/invenio_vocabularies/contrib/affiliations/datastreams.py b/invenio_vocabularies/contrib/affiliations/datastreams.py index cffe6855..dd20c278 100644 --- a/invenio_vocabularies/contrib/affiliations/datastreams.py +++ b/invenio_vocabularies/contrib/affiliations/datastreams.py @@ -11,7 +11,7 @@ from flask import current_app -from ...datastreams.errors import WriterError +from ...datastreams.errors import TransformerError, WriterError from ...datastreams.transformers import BaseTransformer from ...datastreams.writers import ServiceWriter from ..common.ror.datastreams import RORTransformer @@ -54,7 +54,16 @@ def apply(self, stream_entry, **kwargs): """Applies the transformation to the stream entry.""" record = 
stream_entry.entry - organization = {"openaire_id": record["id"]} + if "id" not in record: + raise TransformerError([f"No id for: {record}"]) + + if not record["id"].startswith("openorgs____::"): + raise TransformerError([f"Not valid OpenAIRE OpenOrgs id for: {record}"]) + + if "pid" not in record: + raise TransformerError([f"No alternative identifiers for: {record}"]) + + organization = {} for pid in record["pid"]: if pid["scheme"] == "ROR": @@ -79,7 +88,9 @@ def __init__(self, *args, **kwargs): service_or_name = kwargs.pop("service_or_name", "affiliations") # Here we only update and we do not insert, since OpenAIRE data is used to augment existing affiliations # (with PIC identifiers) and is not used to create new affiliations. - super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs) + super().__init__( + service_or_name=service_or_name, insert=False, update=True, *args, **kwargs + ) def _entry_id(self, entry): """Get the id from an entry.""" @@ -89,16 +100,6 @@ def write(self, stream_entry, *args, **kwargs): """Writes the input entry using a given service.""" entry = stream_entry.entry - if not entry["openaire_id"].startswith("openorgs____::"): - raise WriterError([f"Not valid OpenAIRE OpenOrgs id for: {entry}"]) - del entry["openaire_id"] - - if "id" not in entry: - raise WriterError([f"No id for: {entry}"]) - - if "identifiers" not in entry: - raise WriterError([f"No alternative identifiers for: {entry}"]) - return super().write(stream_entry, *args, **kwargs) def write_many(self, stream_entries, *args, **kwargs): diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py index 88e47af7..b6197183 100644 --- a/invenio_vocabularies/contrib/awards/datastreams.py +++ b/invenio_vocabularies/contrib/awards/datastreams.py @@ -11,6 +11,7 @@ import io import requests +from flask import current_app from invenio_access.permissions import system_identity from invenio_i18n import lazy_gettext as _ @@ -160,49 +161,55 @@ def apply(self, stream_entry, **kwargs): award = {} # Here `id` is the project ID, which will be used to attach the update to the existing project. - award["id"] = f"00k4n6c32::{record['id']}" + award["id"] = ( + f"{current_app.config['VOCABULARIES_AWARDS_EC_ROR_ID']}::{record['id']}" + ) - categories = record["relations"]["categories"]["category"] - if isinstance(categories, dict): - categories = [categories] + categories = record.get("relations", {}).get("categories", {}).get("category") + if categories: + if isinstance(categories, dict): + categories = [categories] - award["subjects"] = [ - {"id": f"euroscivoc:{category['code'].split('/')[-1]}"} - for category in categories - if category.get("@classification") == "euroSciVoc" - ] + award["subjects"] = [ + {"id": f"euroscivoc:{vocab_id}"} + for category in categories + if category.get("@classification") == "euroSciVoc" + and (vocab_id := category["code"].split("/")[-1]).isdigit() + ] - organizations = record["relations"]["associations"]["organization"] - # Projects with a single organization are not wrapped in a list, - # so we do this here to be able to iterate over it. 
organizations = ( - organizations if isinstance(organizations, list) else [organizations] + record.get("relations", {}).get("associations", {}).get("organization") ) - award["organizations"] = [] - for organization in organizations: - # Some organizations in FP7 projects do not have a "legalname" key, - # for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902. - # In this case, fully skip the organization entry. - if "legalname" not in organization: - continue - - organization_data = { - # TODO: Here the legal name is uppercase. - "organization": organization["legalname"], - } - - # Some organizations in FP7 projects do not have an "id" key (the PIC identifier), - # for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693. - # In this case, still store the name but skip the identifier part. - if "id" in organization: - organization_data.update( - { - "scheme": "pic", - "id": organization["id"], - } - ) + if organizations: + # Projects with a single organization are not wrapped in a list, + # so we do this here to be able to iterate over it. + organizations = ( + organizations if isinstance(organizations, list) else [organizations] + ) + award["organizations"] = [] + for organization in organizations: + # Some organizations in FP7 projects do not have a "legalname" key, + # for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902. + # In this case, fully skip the organization entry. + if "legalname" not in organization: + continue + + organization_data = { + "organization": organization["legalname"], + } - award["organizations"].append(organization_data) + # Some organizations in FP7 projects do not have an "id" key (the PIC identifier), + # for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693. + # In this case, still store the name but skip the identifier part. + if "id" in organization: + organization_data.update( + { + "scheme": "pic", + "id": organization["id"], + } + ) + + award["organizations"].append(organization_data) stream_entry.entry = award return stream_entry @@ -216,7 +223,9 @@ def __init__(self, *args, **kwargs): service_or_name = kwargs.pop("service_or_name", "awards") # Here we only update and we do not insert, since CORDIS data is used to augment existing awards # (with subjects and organizations information) and is not used to create new awards. 
- super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs) + super().__init__( + service_or_name=service_or_name, insert=False, update=True, *args, **kwargs + ) def _entry_id(self, entry): """Get the id from an entry.""" diff --git a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json index a9ccac42..ef0610f0 100644 --- a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json @@ -74,8 +74,7 @@ }, "uniqueItems": true } - }, - "uniqueItems": true + } }, "organizations": { "description": "Award's organizations.", diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json index 7da72ee5..0063b057 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json @@ -76,6 +76,12 @@ "type": "object", "dynamic": "true" }, + "subject": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, "identifiers": { "properties": { "identifier": { diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json index 7da72ee5..0063b057 100644 --- a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json @@ -76,6 +76,12 @@ "type": "object", "dynamic": "true" }, + "subject": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, "identifiers": { "properties": { "identifier": { diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json index 7da72ee5..0063b057 100644 --- a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json +++ b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json @@ -76,6 +76,12 @@ "type": "object", "dynamic": "true" }, + "subject": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + }, "identifiers": { "properties": { "identifier": { diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 64aa59e6..6bab5bb3 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -85,7 +85,7 @@ def _get_notation(self, subject, rdf_graph): def _get_labels(self, subject, rdf_graph): """Extract prefLabel and altLabel languages for a subject.""" labels = { - label.language: label.value + label.language: label.value.capitalize() for _, _, label in rdf_graph.triples( (subject, self.SKOS_CORE.prefLabel, None) ) @@ -94,7 +94,7 @@ def _get_labels(self, subject, rdf_graph): for _, _, label in rdf_graph.triples( (subject, self.SKOS_CORE.altLabel, None) ): - labels.setdefault(label.language, label.value) + labels.setdefault(label.language, label.value.capitalize()) return labels def _find_parents(self, subject, rdf_graph): diff --git a/invenio_vocabularies/datastreams/readers.py b/invenio_vocabularies/datastreams/readers.py index 5ef11ed7..0f817bf3 100644 --- a/invenio_vocabularies/datastreams/readers.py +++ 
b/invenio_vocabularies/datastreams/readers.py @@ -226,7 +226,6 @@ class XMLReader(BaseReader): def __init__(self, root_element=None, *args, **kwargs): """Constructor.""" - # TODO: How to make root_element mandatory? self.root_element = root_element super().__init__(*args, **kwargs) diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py index 6bb3aeb0..6c70facc 100644 --- a/invenio_vocabularies/datastreams/writers.py +++ b/invenio_vocabularies/datastreams/writers.py @@ -87,16 +87,28 @@ def _resolve(self, id_): def _do_update(self, entry): vocab_id = self._entry_id(entry) current = self._resolve(vocab_id) + combined_dict = current.to_dict() + + # Update fields from entry + for key, value in entry.items(): + if key in combined_dict: + if isinstance(combined_dict[key], list) and isinstance(value, list): + combined_dict[key].extend( + item for item in value if item not in combined_dict[key] + ) + else: + combined_dict[key] = value + else: + combined_dict[key] = value - updated = dict(current.to_dict(), **entry) - # TODO: Try to use _record instead of to_dict() - # updated = dict(current._record, **entry) - - return StreamEntry(self._service.update(self._identity, vocab_id, updated)) + return StreamEntry( + self._service.update(self._identity, vocab_id, combined_dict) + ) def write(self, stream_entry, *args, **kwargs): """Writes the input entry using a given service.""" entry = stream_entry.entry + try: if self._insert: try: diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py index ddf11982..de5a2f8f 100644 --- a/invenio_vocabularies/factories.py +++ b/invenio_vocabularies/factories.py @@ -27,8 +27,6 @@ from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config -# from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config - class VocabularyConfig: """Vocabulary Config Factory.""" diff --git a/tests/contrib/affiliations/conftest.py b/tests/contrib/affiliations/conftest.py index 0c0d6b2b..5a3392e8 100644 --- a/tests/contrib/affiliations/conftest.py +++ b/tests/contrib/affiliations/conftest.py @@ -33,11 +33,27 @@ def affiliation_full_data(): } +@pytest.fixture(scope="function") +def affiliation_openaire_data(): + """Full affiliation data.""" + return { + "acronym": "CERN", + "id": "01ggx4157", + "identifiers": [{"identifier": "999988133", "scheme": "pic"}], + "name": "Test affiliation", + "title": {"en": "Test affiliation", "es": "Afiliacion de test"}, + "country": "CH", + "country_name": "Switzerland", + "location_name": "Geneva", + "status": "active", + "types": ["facility", "funder"], + } + + @pytest.fixture(scope="function") def openaire_affiliation_full_data(): """Full OpenAIRE affiliation data.""" return { - "openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c", "id": "01ggx4157", "identifiers": [{"identifier": "999988133", "scheme": "pic"}], } diff --git a/tests/contrib/affiliations/test_affiliations_datastreams.py b/tests/contrib/affiliations/test_affiliations_datastreams.py index a7d3c980..7c8e1df0 100644 --- a/tests/contrib/affiliations/test_affiliations_datastreams.py +++ b/tests/contrib/affiliations/test_affiliations_datastreams.py @@ -23,7 +23,7 @@ ) from invenio_vocabularies.contrib.common.ror.datastreams import RORTransformer from invenio_vocabularies.datastreams import StreamEntry -from invenio_vocabularies.datastreams.errors import WriterError +from 
invenio_vocabularies.datastreams.errors import TransformerError, WriterError @pytest.fixture(scope="module") @@ -51,11 +51,13 @@ def expected_from_ror_json(): def test_ror_transformer(app, dict_ror_entry, expected_from_ror_json): + """Test RORTransformer to ensure it transforms ROR entries correctly.""" transformer = RORTransformer(vocab_schemes=affiliation_schemes) assert expected_from_ror_json == transformer.apply(dict_ror_entry).entry def test_affiliations_service_writer_create(app, search_clear, affiliation_full_data): + """Test AffiliationsServiceWriter for creating a new affiliation.""" writer = AffiliationsServiceWriter() affiliation_rec = writer.write(StreamEntry(affiliation_full_data)) affiliation_dict = affiliation_rec.entry.to_dict() @@ -68,6 +70,7 @@ def test_affiliations_service_writer_create(app, search_clear, affiliation_full_ def test_affiliations_service_writer_duplicate( app, search_clear, affiliation_full_data ): + """Test AffiliationsServiceWriter for handling duplicate entries.""" writer = AffiliationsServiceWriter() affiliation_rec = writer.write(stream_entry=StreamEntry(affiliation_full_data)) Affiliation.index.refresh() # refresh index to make changes live @@ -84,6 +87,7 @@ def test_affiliations_service_writer_duplicate( def test_affiliations_service_writer_update_existing( app, search_clear, affiliation_full_data, service ): + """Test updating an existing affiliation using AffiliationsServiceWriter.""" # create vocabulary writer = AffiliationsServiceWriter(update=True) orig_affiliation_rec = writer.write(stream_entry=StreamEntry(affiliation_full_data)) @@ -91,9 +95,11 @@ def test_affiliations_service_writer_update_existing( # update vocabulary updated_affiliation = deepcopy(affiliation_full_data) updated_affiliation["name"] = "Updated Name" + # check changes vocabulary _ = writer.write(stream_entry=StreamEntry(updated_affiliation)) affiliation_rec = service.read(system_identity, orig_affiliation_rec.entry.id) + affiliation_dict = affiliation_rec.to_dict() # needed while the writer resolves from ES @@ -107,6 +113,7 @@ def test_affiliations_service_writer_update_existing( def test_affiliations_service_writer_update_non_existing( app, search_clear, affiliation_full_data, service ): + """Test creating a new affiliation when updating a non-existing entry.""" # vocabulary item not created, call update directly updated_affiliation = deepcopy(affiliation_full_data) updated_affiliation["name"] = "New name" @@ -156,7 +163,6 @@ def dict_openaire_organization_entry(): @pytest.fixture(scope="module") def expected_from_openaire_json(): return { - "openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c", "id": "01ggx4157", "identifiers": [{"identifier": "999988133", "scheme": "pic"}], } @@ -165,6 +171,7 @@ def expected_from_openaire_json(): def test_openaire_organization_transformer( app, dict_openaire_organization_entry, expected_from_openaire_json ): + """Test OpenAIREOrganizationTransformer for transforming OpenAIRE entries.""" transformer = OpenAIREOrganizationTransformer() assert ( expected_from_openaire_json @@ -173,16 +180,24 @@ def test_openaire_organization_transformer( def test_openaire_affiliations_service_writer( - app, search_clear, affiliation_full_data, openaire_affiliation_full_data, service + app, + search_clear, + affiliation_full_data, + openaire_affiliation_full_data, + service, ): + """Test writing and updating an OpenAIRE affiliation entry.""" # create vocabulary with original service writer orig_writer = AffiliationsServiceWriter() + 
orig_affiliation_rec = orig_writer.write(StreamEntry(affiliation_full_data)) + orig_affiliation_dict = orig_affiliation_rec.entry.to_dict() Affiliation.index.refresh() # refresh index to make changes live # update vocabulary and check changes vocabulary with OpenAIRE service writer - writer = OpenAIREAffiliationsServiceWriter(update=True) + writer = OpenAIREAffiliationsServiceWriter() + _ = writer.write(StreamEntry(openaire_affiliation_full_data)) Affiliation.index.refresh() # refresh index to make changes live affiliation_rec = service.read(system_identity, orig_affiliation_rec.entry.id) @@ -204,50 +219,49 @@ def test_openaire_affiliations_service_writer( affiliation_rec._record.delete(force=True) -def test_openaire_affiliations_service_writer_non_openorgs( - app, openaire_affiliation_full_data +def test_openaire_affiliations_transformer_non_openorgs( + app, dict_openaire_organization_entry ): - writer = OpenAIREAffiliationsServiceWriter() + """Test error handling for non-OpenOrgs ID in OpenAIRE transformer.""" + transformer = OpenAIREOrganizationTransformer() - updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) - updated_openaire_affiliation["openaire_id"] = ( - "pending_org_::627931d047132a4e20dbc4a882eb9a35" - ) + updated_organization_entry = deepcopy(dict_openaire_organization_entry.entry) + updated_organization_entry["id"] = "pending_org_::627931d047132a4e20dbc4a882eb9a35" - with pytest.raises(WriterError) as err: - writer.write(StreamEntry(updated_openaire_affiliation)) + with pytest.raises(TransformerError) as err: + transformer.apply(StreamEntry(updated_organization_entry)) expected_error = [ - f"Not valid OpenAIRE OpenOrgs id for: {updated_openaire_affiliation}" + f"Not valid OpenAIRE OpenOrgs id for: {updated_organization_entry}" ] assert expected_error in err.value.args -def test_openaire_affiliations_service_writer_no_id( - app, openaire_affiliation_full_data -): - writer = OpenAIREAffiliationsServiceWriter() +def test_openaire_affiliations_transformer_no_id(app, dict_openaire_organization_entry): + """Test error handling when missing ID in OpenAIRE transformer.""" + transformer = OpenAIREOrganizationTransformer() - updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) - del updated_openaire_affiliation["id"] + updated_organization_entry = deepcopy(dict_openaire_organization_entry.entry) + updated_organization_entry.pop("id", None) - with pytest.raises(WriterError) as err: - writer.write(StreamEntry(updated_openaire_affiliation)) + with pytest.raises(TransformerError) as err: + transformer.apply(StreamEntry(updated_organization_entry)) - expected_error = [f"No id for: {updated_openaire_affiliation}"] + expected_error = [f"No id for: {updated_organization_entry}"] assert expected_error in err.value.args -def test_openaire_affiliations_service_writer_no_alternative_identifiers( - app, openaire_affiliation_full_data +def test_openaire_affiliations_transformer_no_alternative_identifiers( + app, dict_openaire_organization_entry ): - writer = OpenAIREAffiliationsServiceWriter() + """Test error handling when missing alternative identifiers in OpenAIRE transformer.""" + transformer = OpenAIREOrganizationTransformer() - updated_openaire_affiliation = deepcopy(openaire_affiliation_full_data) - del updated_openaire_affiliation["identifiers"] + updated_organization_entry = deepcopy(dict_openaire_organization_entry.entry) + updated_organization_entry.pop("pid", None) - with pytest.raises(WriterError) as err: - 
writer.write(StreamEntry(updated_openaire_affiliation)) + with pytest.raises(TransformerError) as err: + transformer.apply(StreamEntry(updated_organization_entry)) - expected_error = [f"No alternative identifiers for: {updated_openaire_affiliation}"] + expected_error = [f"No alternative identifiers for: {updated_organization_entry}"] assert expected_error in err.value.args diff --git a/tests/contrib/awards/test_awards_datastreams.py b/tests/contrib/awards/test_awards_datastreams.py index a6970d56..182811ce 100644 --- a/tests/contrib/awards/test_awards_datastreams.py +++ b/tests/contrib/awards/test_awards_datastreams.py @@ -33,7 +33,7 @@ def dict_award_entry(): "enddate": "2010-09-30", "funding": [ { - "funding_stream": { + "fundingStream": { "description": "Directorate for Geosciences - Division of " "Ocean Sciences", "id": "NSF::GEO/OAD::GEO/OCE", @@ -65,7 +65,7 @@ def dict_award_entry_ec(): "enddate": "2025-12-31", "funding": [ { - "funding_stream": { + "fundingStream": { "description": "Test stream", "id": "TST::test::test", }, @@ -92,10 +92,10 @@ def expected_from_award_json(): "id": "021nxhr62::0751743", "identifiers": [{"identifier": "https://test.com", "scheme": "url"}], "number": "0751743", + "program": "NSF::GEO/OAD::GEO/OCE", "title": {"en": "Test title"}, "funder": {"id": "021nxhr62"}, "acronym": "TA", - "program": "GEO/OAD", } @@ -110,7 +110,7 @@ def expected_from_award_json_ec(): "title": {"en": "Test title"}, "funder": {"id": "00k4n6c32"}, "acronym": "TS", - "program": "test", + "program": "TST::test::test", } @@ -181,6 +181,7 @@ def expected_from_cordis_project_xml(): def test_awards_transformer(app, dict_award_entry, expected_from_award_json): + """Test the OpenAIREProjectTransformer's output against expected award data.""" transformer = OpenAIREProjectTransformer() assert expected_from_award_json == transformer.apply(dict_award_entry).entry @@ -188,6 +189,7 @@ def test_awards_transformer(app, dict_award_entry, expected_from_award_json): def test_awards_service_writer_create( app, search_clear, example_funder_ec, award_full_data ): + """Verify creation of an award record and match it with expected data.""" awards_writer = AwardsServiceWriter() award_rec = awards_writer.write(StreamEntry(award_full_data)) award_dict = award_rec.entry.to_dict() @@ -200,12 +202,9 @@ def test_awards_service_writer_create( def test_awards_funder_id_not_exist( - app, - search_clear, - example_funder, - example_funder_ec, - award_full_data_invalid_id, + app, search_clear, example_funder, example_funder_ec, award_full_data_invalid_id ): + """Ensure writing an award with an invalid funder ID raises an error.""" awards_writer = AwardsServiceWriter() with pytest.raises(WriterError) as err: awards_writer.write(StreamEntry(award_full_data_invalid_id)) @@ -223,6 +222,7 @@ def test_awards_funder_id_not_exist( def test_awards_funder_id_not_exist_no_funders( app, search_clear, award_full_data_invalid_id ): + """Check error on writing an award with no valid funders.""" awards_writer = AwardsServiceWriter() with pytest.raises(WriterError) as err: awards_writer.write(StreamEntry(award_full_data_invalid_id)) @@ -245,7 +245,9 @@ def test_awards_transformer_ec_functionality( expected_from_award_json, expected_from_award_json_ec, ): + """Test transformer output for standard and EC-specific awards.""" transformer = OpenAIREProjectTransformer() + assert expected_from_award_json == transformer.apply(dict_award_entry).entry assert expected_from_award_json_ec == transformer.apply(dict_award_entry_ec).entry @@ -253,6 
+255,7 @@ def test_awards_transformer_ec_functionality( def test_awards_service_writer_duplicate( app, search_clear, example_funder_ec, award_full_data ): + """Verify error on attempting to create a duplicate award.""" writer = AwardsServiceWriter() award_rec = writer.write(stream_entry=StreamEntry(award_full_data)) Award.index.refresh() # refresh index to make changes live @@ -269,20 +272,17 @@ def test_awards_service_writer_duplicate( def test_awards_service_writer_update_existing( app, search_clear, example_funder_ec, award_full_data, service ): - # create vocabulary + """Check updating an existing award record with new data.""" writer = AwardsServiceWriter(update=True) orig_award_rec = writer.write(stream_entry=StreamEntry(award_full_data)) Award.index.refresh() # refresh index to make changes live - # update vocabulary updated_award = deepcopy(award_full_data) updated_award["title"] = {"en": "New Test title"} - # check changes vocabulary _ = writer.write(stream_entry=StreamEntry(updated_award)) award_rec = service.read(system_identity, orig_award_rec.entry.id) award_dict = award_rec.to_dict() updated_award["funder"]["name"] = example_funder_ec["name"] - # needed while the writer resolves from ES assert _.entry.id == orig_award_rec.entry.id assert dict(award_dict, **updated_award) == award_dict @@ -293,10 +293,9 @@ def test_awards_service_writer_update_existing( def test_awards_service_writer_update_non_existing( app, search_clear, example_funder_ec, award_full_data, service ): - # vocabulary item not created, call update directly + """Test updating a non-existing award, creating a new record.""" updated_award = deepcopy(award_full_data) updated_award["title"] = {"en": "New Test title"} - # check changes vocabulary writer = AwardsServiceWriter(update=True) award_rec = writer.write(stream_entry=StreamEntry(updated_award)) award_rec = service.read(system_identity, award_rec.entry.id) @@ -309,9 +308,8 @@ def test_awards_service_writer_update_non_existing( award_rec._record.delete(force=True) -def test_awards_cordis_transformer( - expected_from_cordis_project_xml, -): +def test_awards_cordis_transformer(expected_from_cordis_project_xml): + """Validate transformation of CORDIS project XML to expected format.""" reader = XMLReader() award = next(reader.read(CORDIS_PROJECT_XML)) diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py index c8f62373..a638aff1 100644 --- a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py +++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py @@ -108,12 +108,12 @@ def expected_from_rdf_pref_label_with_parent(): "scheme": "EuroSciVoc", "subject": "Satellite radio", "title": { - "it": "radio satellitare", - "pl": "radio satelitarne", - "fr": "radio satellite", - "es": "radio por satélite", + "it": "Radio satellitare", + "pl": "Radio satelitarne", + "fr": "Radio satellite", + "es": "Radio por satélite", "de": "Satellitenfunk", - "en": "satellite radio", + "en": "Satellite radio", }, "props": {"parents": "euroscivoc:1225"}, "identifiers": [ @@ -127,7 +127,7 @@ def expected_from_rdf_pref_label_with_parent(): "id": "euroscivoc:1225", "scheme": "EuroSciVoc", "subject": "Radio channel", - "title": {"en": "radio channel"}, + "title": {"en": "Radio channel"}, "props": {}, "identifiers": [ { @@ -145,7 +145,7 @@ def expected_from_rdf_alt_label_without_parent(): "id": "euroscivoc:1717", "scheme": "EuroSciVoc", "subject": 
"Broadcastingsatellite service", - "title": {"en": "broadcastingsatellite service"}, + "title": {"en": "Broadcastingsatellite service"}, "props": {}, "identifiers": [ {