awards: refactor and mapping updates

inveniosoftware · Oct 3, 2024 · 9b0e908 · 9b0e908
1 parent e317fbc
commit 9b0e908
Show file tree

Hide file tree

Showing 14 changed files with 184 additions and 120 deletions.
diff --git a/invenio_vocabularies/contrib/affiliations/datastreams.py b/invenio_vocabularies/contrib/affiliations/datastreams.py
@@ -11,7 +11,7 @@
 
 from flask import current_app
 
-from ...datastreams.errors import WriterError
+from ...datastreams.errors import TransformerError, WriterError
 from ...datastreams.transformers import BaseTransformer
 from ...datastreams.writers import ServiceWriter
 from ..common.ror.datastreams import RORTransformer
@@ -54,7 +54,16 @@ def apply(self, stream_entry, **kwargs):
         """Applies the transformation to the stream entry."""
         record = stream_entry.entry
 
-        organization = {"openaire_id": record["id"]}
+        if "id" not in record:
+            raise TransformerError([f"No id for: {record}"])
+
+        if not record["id"].startswith("openorgs____::"):
+            raise TransformerError([f"Not valid OpenAIRE OpenOrgs id for: {record}"])
+
+        if "pid" not in record:
+            raise TransformerError([f"No alternative identifiers for: {record}"])
+
+        organization = {}
 
         for pid in record["pid"]:
             if pid["scheme"] == "ROR":
@@ -79,7 +88,9 @@ def __init__(self, *args, **kwargs):
         service_or_name = kwargs.pop("service_or_name", "affiliations")
         # Here we only update and we do not insert, since OpenAIRE data is used to augment existing affiliations
         # (with PIC identifiers) and is not used to create new affiliations.
-        super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs)
+        super().__init__(
+            service_or_name=service_or_name, insert=False, update=True, *args, **kwargs
+        )
 
     def _entry_id(self, entry):
         """Get the id from an entry."""
@@ -89,16 +100,6 @@ def write(self, stream_entry, *args, **kwargs):
         """Writes the input entry using a given service."""
         entry = stream_entry.entry
 
-        if not entry["openaire_id"].startswith("openorgs____::"):
-            raise WriterError([f"Not valid OpenAIRE OpenOrgs id for: {entry}"])
-        del entry["openaire_id"]
-
-        if "id" not in entry:
-            raise WriterError([f"No id for: {entry}"])
-
-        if "identifiers" not in entry:
-            raise WriterError([f"No alternative identifiers for: {entry}"])
-
         return super().write(stream_entry, *args, **kwargs)
 
     def write_many(self, stream_entries, *args, **kwargs):

diff --git a/invenio_vocabularies/contrib/awards/datastreams.py b/invenio_vocabularies/contrib/awards/datastreams.py
@@ -11,6 +11,7 @@
 import io
 
 import requests
+from flask import current_app
 from invenio_access.permissions import system_identity
 from invenio_i18n import lazy_gettext as _
 
@@ -160,49 +161,55 @@ def apply(self, stream_entry, **kwargs):
         award = {}
 
         # Here `id` is the project ID, which will be used to attach the update to the existing project.
-        award["id"] = f"00k4n6c32::{record['id']}"
+        award["id"] = (
+            f"{current_app.config['VOCABULARIES_AWARDS_EC_ROR_ID']}::{record['id']}"
+        )
 
-        categories = record["relations"]["categories"]["category"]
-        if isinstance(categories, dict):
-            categories = [categories]
+        categories = record.get("relations", {}).get("categories", {}).get("category")
+        if categories:
+            if isinstance(categories, dict):
+                categories = [categories]
 
-        award["subjects"] = [
-            {"id": f"euroscivoc:{category['code'].split('/')[-1]}"}
-            for category in categories
-            if category.get("@classification") == "euroSciVoc"
-        ]
+            award["subjects"] = [
+                {"id": f"euroscivoc:{vocab_id}"}
+                for category in categories
+                if category.get("@classification") == "euroSciVoc"
+                and (vocab_id := category["code"].split("/")[-1]).isdigit()
+            ]
 
-        organizations = record["relations"]["associations"]["organization"]
-        # Projects with a single organization are not wrapped in a list,
-        # so we do this here to be able to iterate over it.
         organizations = (
-            organizations if isinstance(organizations, list) else [organizations]
+            record.get("relations", {}).get("associations", {}).get("organization")
         )
-        award["organizations"] = []
-        for organization in organizations:
-            # Some organizations in FP7 projects do not have a "legalname" key,
-            # for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902.
-            # In this case, fully skip the organization entry.
-            if "legalname" not in organization:
-                continue
-
-            organization_data = {
-                # TODO: Here the legal name is uppercase.
-                "organization": organization["legalname"],
-            }
-
-            # Some organizations in FP7 projects do not have an "id" key (the PIC identifier),
-            # for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693.
-            # In this case, still store the name but skip the identifier part.
-            if "id" in organization:
-                organization_data.update(
-                    {
-                        "scheme": "pic",
-                        "id": organization["id"],
-                    }
-                )
+        if organizations:
+            # Projects with a single organization are not wrapped in a list,
+            # so we do this here to be able to iterate over it.
+            organizations = (
+                organizations if isinstance(organizations, list) else [organizations]
+            )
+            award["organizations"] = []
+            for organization in organizations:
+                # Some organizations in FP7 projects do not have a "legalname" key,
+                # for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902.
+                # In this case, fully skip the organization entry.
+                if "legalname" not in organization:
+                    continue
+
+                organization_data = {
+                    "organization": organization["legalname"],
+                }
 
-            award["organizations"].append(organization_data)
+                # Some organizations in FP7 projects do not have an "id" key (the PIC identifier),
+                # for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693.
+                # In this case, still store the name but skip the identifier part.
+                if "id" in organization:
+                    organization_data.update(
+                        {
+                            "scheme": "pic",
+                            "id": organization["id"],
+                        }
+                    )
+
+                award["organizations"].append(organization_data)
 
         stream_entry.entry = award
         return stream_entry
@@ -216,7 +223,9 @@ def __init__(self, *args, **kwargs):
         service_or_name = kwargs.pop("service_or_name", "awards")
         # Here we only update and we do not insert, since CORDIS data is used to augment existing awards
         # (with subjects and organizations information) and is not used to create new awards.
-        super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs)
+        super().__init__(
+            service_or_name=service_or_name, insert=False, update=True, *args, **kwargs
+        )
 
     def _entry_id(self, entry):
         """Get the id from an entry."""

diff --git a/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/jsonschemas/awards/award-v1.0.0.json
@@ -74,8 +74,7 @@
           },
           "uniqueItems": true
         }
-      },
-      "uniqueItems": true
+      }
     },
     "organizations": {
       "description": "Award's organizations.",

diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v1/awards/award-v1.0.0.json
@@ -76,6 +76,12 @@
             "type": "object",
             "dynamic": "true"
           },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
           "identifiers": {
             "properties": {
               "identifier": {

diff --git a/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/os-v2/awards/award-v1.0.0.json
@@ -76,6 +76,12 @@
             "type": "object",
             "dynamic": "true"
           },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
           "identifiers": {
             "properties": {
               "identifier": {

diff --git a/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json b/invenio_vocabularies/contrib/awards/mappings/v7/awards/award-v1.0.0.json
@@ -76,6 +76,12 @@
             "type": "object",
             "dynamic": "true"
           },
+          "subject": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          },
           "identifiers": {
             "properties": {
               "identifier": {

diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -85,7 +85,7 @@ def _get_notation(self, subject, rdf_graph):
     def _get_labels(self, subject, rdf_graph):
         """Extract prefLabel and altLabel languages for a subject."""
         labels = {
-            label.language: label.value
+            label.language: label.value.capitalize()
             for _, _, label in rdf_graph.triples(
                 (subject, self.SKOS_CORE.prefLabel, None)
             )
@@ -94,7 +94,7 @@ def _get_labels(self, subject, rdf_graph):
             for _, _, label in rdf_graph.triples(
                 (subject, self.SKOS_CORE.altLabel, None)
             ):
-                labels.setdefault(label.language, label.value)
+                labels.setdefault(label.language, label.value.capitalize())
         return labels
 
     def _find_parents(self, subject, rdf_graph):

diff --git a/invenio_vocabularies/datastreams/readers.py b/invenio_vocabularies/datastreams/readers.py
@@ -226,7 +226,6 @@ class XMLReader(BaseReader):
 
     def __init__(self, root_element=None, *args, **kwargs):
         """Constructor."""
-        # TODO: How to make root_element mandatory?
         self.root_element = root_element
         super().__init__(*args, **kwargs)
 

diff --git a/invenio_vocabularies/datastreams/writers.py b/invenio_vocabularies/datastreams/writers.py
@@ -87,16 +87,28 @@ def _resolve(self, id_):
     def _do_update(self, entry):
         vocab_id = self._entry_id(entry)
         current = self._resolve(vocab_id)
+        combined_dict = current.to_dict()
+
+        # Update fields from entry
+        for key, value in entry.items():
+            if key in combined_dict:
+                if isinstance(combined_dict[key], list) and isinstance(value, list):
+                    combined_dict[key].extend(
+                        item for item in value if item not in combined_dict[key]
+                    )
+                else:
+                    combined_dict[key] = value
+            else:
+                combined_dict[key] = value
 
-        updated = dict(current.to_dict(), **entry)
-        # TODO: Try to use _record instead of to_dict()
-        # updated = dict(current._record, **entry)
-
-        return StreamEntry(self._service.update(self._identity, vocab_id, updated))
+        return StreamEntry(
+            self._service.update(self._identity, vocab_id, combined_dict)
+        )
 
     def write(self, stream_entry, *args, **kwargs):
         """Writes the input entry using a given service."""
         entry = stream_entry.entry
+
         try:
             if self._insert:
                 try:

diff --git a/invenio_vocabularies/factories.py b/invenio_vocabularies/factories.py
@@ -27,8 +27,6 @@
 from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
 from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config
 
-# from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config
-
 
 class VocabularyConfig:
     """Vocabulary Config Factory."""

diff --git a/tests/contrib/affiliations/conftest.py b/tests/contrib/affiliations/conftest.py
@@ -33,11 +33,27 @@ def affiliation_full_data():
     }
 
 
+@pytest.fixture(scope="function")
+def affiliation_openaire_data():
+    """Full affiliation data."""
+    return {
+        "acronym": "CERN",
+        "id": "01ggx4157",
+        "identifiers": [{"identifier": "999988133", "scheme": "pic"}],
+        "name": "Test affiliation",
+        "title": {"en": "Test affiliation", "es": "Afiliacion de test"},
+        "country": "CH",
+        "country_name": "Switzerland",
+        "location_name": "Geneva",
+        "status": "active",
+        "types": ["facility", "funder"],
+    }
+
+
 @pytest.fixture(scope="function")
 def openaire_affiliation_full_data():
     """Full OpenAIRE affiliation data."""
     return {
-        "openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c",
         "id": "01ggx4157",
         "identifiers": [{"identifier": "999988133", "scheme": "pic"}],
     }