Skip to content

Commit

Permalink
awards: refactor and mapping updates
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Oct 3, 2024
1 parent e317fbc commit 9b0e908
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 120 deletions.
27 changes: 14 additions & 13 deletions invenio_vocabularies/contrib/affiliations/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from flask import current_app

from ...datastreams.errors import WriterError
from ...datastreams.errors import TransformerError, WriterError
from ...datastreams.transformers import BaseTransformer
from ...datastreams.writers import ServiceWriter
from ..common.ror.datastreams import RORTransformer
Expand Down Expand Up @@ -54,7 +54,16 @@ def apply(self, stream_entry, **kwargs):
"""Applies the transformation to the stream entry."""
record = stream_entry.entry

organization = {"openaire_id": record["id"]}
if "id" not in record:
raise TransformerError([f"No id for: {record}"])

if not record["id"].startswith("openorgs____::"):
raise TransformerError([f"Not valid OpenAIRE OpenOrgs id for: {record}"])

if "pid" not in record:
raise TransformerError([f"No alternative identifiers for: {record}"])

organization = {}

for pid in record["pid"]:
if pid["scheme"] == "ROR":
Expand All @@ -79,7 +88,9 @@ def __init__(self, *args, **kwargs):
service_or_name = kwargs.pop("service_or_name", "affiliations")
# Here we only update and we do not insert, since OpenAIRE data is used to augment existing affiliations
# (with PIC identifiers) and is not used to create new affiliations.
super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs)
super().__init__(
service_or_name=service_or_name, insert=False, update=True, *args, **kwargs
)

def _entry_id(self, entry):
"""Get the id from an entry."""
Expand All @@ -89,16 +100,6 @@ def write(self, stream_entry, *args, **kwargs):
"""Writes the input entry using a given service."""
entry = stream_entry.entry

if not entry["openaire_id"].startswith("openorgs____::"):
raise WriterError([f"Not valid OpenAIRE OpenOrgs id for: {entry}"])
del entry["openaire_id"]

if "id" not in entry:
raise WriterError([f"No id for: {entry}"])

if "identifiers" not in entry:
raise WriterError([f"No alternative identifiers for: {entry}"])

return super().write(stream_entry, *args, **kwargs)

def write_many(self, stream_entries, *args, **kwargs):
Expand Down
85 changes: 47 additions & 38 deletions invenio_vocabularies/contrib/awards/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import io

import requests
from flask import current_app
from invenio_access.permissions import system_identity
from invenio_i18n import lazy_gettext as _

Expand Down Expand Up @@ -160,49 +161,55 @@ def apply(self, stream_entry, **kwargs):
award = {}

# Here `id` is the project ID, which will be used to attach the update to the existing project.
award["id"] = f"00k4n6c32::{record['id']}"
award["id"] = (
f"{current_app.config['VOCABULARIES_AWARDS_EC_ROR_ID']}::{record['id']}"
)

categories = record["relations"]["categories"]["category"]
if isinstance(categories, dict):
categories = [categories]
categories = record.get("relations", {}).get("categories", {}).get("category")
if categories:
if isinstance(categories, dict):
categories = [categories]

award["subjects"] = [
{"id": f"euroscivoc:{category['code'].split('/')[-1]}"}
for category in categories
if category.get("@classification") == "euroSciVoc"
]
award["subjects"] = [
{"id": f"euroscivoc:{vocab_id}"}
for category in categories
if category.get("@classification") == "euroSciVoc"
and (vocab_id := category["code"].split("/")[-1]).isdigit()
]

organizations = record["relations"]["associations"]["organization"]
# Projects with a single organization are not wrapped in a list,
# so we do this here to be able to iterate over it.
organizations = (
organizations if isinstance(organizations, list) else [organizations]
record.get("relations", {}).get("associations", {}).get("organization")
)
award["organizations"] = []
for organization in organizations:
# Some organizations in FP7 projects do not have a "legalname" key,
# for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902.
# In this case, fully skip the organization entry.
if "legalname" not in organization:
continue

organization_data = {
# TODO: Here the legal name is uppercase.
"organization": organization["legalname"],
}

# Some organizations in FP7 projects do not have an "id" key (the PIC identifier),
# for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693.
# In this case, still store the name but skip the identifier part.
if "id" in organization:
organization_data.update(
{
"scheme": "pic",
"id": organization["id"],
}
)
if organizations:
# Projects with a single organization are not wrapped in a list,
# so we do this here to be able to iterate over it.
organizations = (
organizations if isinstance(organizations, list) else [organizations]
)
award["organizations"] = []
for organization in organizations:
# Some organizations in FP7 projects do not have a "legalname" key,
# for instance the 14th participant in "SAGE" https://cordis.europa.eu/project/id/999902.
# In this case, fully skip the organization entry.
if "legalname" not in organization:
continue

organization_data = {
"organization": organization["legalname"],
}

award["organizations"].append(organization_data)
# Some organizations in FP7 projects do not have an "id" key (the PIC identifier),
# for instance "AIlGreenVehicles" in "MOTORBRAIN" https://cordis.europa.eu/project/id/270693.
# In this case, still store the name but skip the identifier part.
if "id" in organization:
organization_data.update(
{
"scheme": "pic",
"id": organization["id"],
}
)

award["organizations"].append(organization_data)

stream_entry.entry = award
return stream_entry
Expand All @@ -216,7 +223,9 @@ def __init__(self, *args, **kwargs):
service_or_name = kwargs.pop("service_or_name", "awards")
# Here we only update and we do not insert, since CORDIS data is used to augment existing awards
# (with subjects and organizations information) and is not used to create new awards.
super().__init__(service_or_name=service_or_name, insert=False, *args, **kwargs)
super().__init__(
service_or_name=service_or_name, insert=False, update=True, *args, **kwargs
)

def _entry_id(self, entry):
"""Get the id from an entry."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@
},
"uniqueItems": true
}
},
"uniqueItems": true
}
},
"organizations": {
"description": "Award's organizations.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@
"type": "object",
"dynamic": "true"
},
"subject": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
},
"identifiers": {
"properties": {
"identifier": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@
"type": "object",
"dynamic": "true"
},
"subject": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
},
"identifiers": {
"properties": {
"identifier": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@
"type": "object",
"dynamic": "true"
},
"subject": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
},
"identifiers": {
"properties": {
"identifier": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def _get_notation(self, subject, rdf_graph):
def _get_labels(self, subject, rdf_graph):
"""Extract prefLabel and altLabel languages for a subject."""
labels = {
label.language: label.value
label.language: label.value.capitalize()
for _, _, label in rdf_graph.triples(
(subject, self.SKOS_CORE.prefLabel, None)
)
Expand All @@ -94,7 +94,7 @@ def _get_labels(self, subject, rdf_graph):
for _, _, label in rdf_graph.triples(
(subject, self.SKOS_CORE.altLabel, None)
):
labels.setdefault(label.language, label.value)
labels.setdefault(label.language, label.value.capitalize())
return labels

def _find_parents(self, subject, rdf_graph):
Expand Down
1 change: 0 additions & 1 deletion invenio_vocabularies/datastreams/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ class XMLReader(BaseReader):

def __init__(self, root_element=None, *args, **kwargs):
"""Constructor."""
# TODO: How to make root_element mandatory?
self.root_element = root_element
super().__init__(*args, **kwargs)

Expand Down
22 changes: 17 additions & 5 deletions invenio_vocabularies/datastreams/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,28 @@ def _resolve(self, id_):
def _do_update(self, entry):
vocab_id = self._entry_id(entry)
current = self._resolve(vocab_id)
combined_dict = current.to_dict()

# Update fields from entry
for key, value in entry.items():
if key in combined_dict:
if isinstance(combined_dict[key], list) and isinstance(value, list):
combined_dict[key].extend(
item for item in value if item not in combined_dict[key]
)
else:
combined_dict[key] = value
else:
combined_dict[key] = value

updated = dict(current.to_dict(), **entry)
# TODO: Try to use _record instead of to_dict()
# updated = dict(current._record, **entry)

return StreamEntry(self._service.update(self._identity, vocab_id, updated))
return StreamEntry(
self._service.update(self._identity, vocab_id, combined_dict)
)

def write(self, stream_entry, *args, **kwargs):
"""Writes the input entry using a given service."""
entry = stream_entry.entry

try:
if self._insert:
try:
Expand Down
2 changes: 0 additions & 2 deletions invenio_vocabularies/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@
from .contrib.names.datastreams import DATASTREAM_CONFIG as names_ds_config
from .contrib.subjects.datastreams import DATASTREAM_CONFIG as subjects_ds_config

# from .contrib.projects.datastreams import DATASTREAM_CONFIG as projects_ds_config


class VocabularyConfig:
"""Vocabulary Config Factory."""
Expand Down
18 changes: 17 additions & 1 deletion tests/contrib/affiliations/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,27 @@ def affiliation_full_data():
}


@pytest.fixture(scope="function")
def affiliation_openaire_data():
"""Full affiliation data."""
return {
"acronym": "CERN",
"id": "01ggx4157",
"identifiers": [{"identifier": "999988133", "scheme": "pic"}],
"name": "Test affiliation",
"title": {"en": "Test affiliation", "es": "Afiliacion de test"},
"country": "CH",
"country_name": "Switzerland",
"location_name": "Geneva",
"status": "active",
"types": ["facility", "funder"],
}


@pytest.fixture(scope="function")
def openaire_affiliation_full_data():
"""Full OpenAIRE affiliation data."""
return {
"openaire_id": "openorgs____::47efb6602225236c0b207761a8b3a21c",
"id": "01ggx4157",
"identifiers": [{"identifier": "999988133", "scheme": "pic"}],
}
Expand Down
Loading

0 comments on commit 9b0e908

Please sign in to comment.