inveniosoftware · ptamarit · Sep 30, 2024 · Aug 29, 2024 · Sep 11, 2024 · Sep 12, 2024
diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
@@ -107,6 +107,7 @@
 
 VOCABULARIES_SUBJECTS_SCHEMES = {
     "gnd": {"label": _("GND"), "validator": idutils.is_gnd, "datacite": "GND"},
+    "url": {"label": _("URL"), "validator": idutils.is_url},
 }
 """Subjects allowed identifier schemes."""
 
@@ -162,6 +163,9 @@
 }
 """Vocabulary type search configuration."""
 
+# Download link for the EuroSciVoc SKOS RDF file. The URL embeds one specific
+# distribution ("20231115-0") -- presumably it has to be updated by hand to
+# pick up a newer EuroSciVoc release; TODO confirm.
+SUBJECTS_EUROSCIVOC_FILE_URL = "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
+"""Subject EuroSciVoc file download link."""
+
 VOCABULARIES_ORCID_ACCESS_KEY = "TODO"
 """ORCID access key to access the s3 bucket."""
 VOCABULARIES_ORCID_SECRET_KEY = "TODO"

diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py
@@ -12,9 +12,8 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
+from .euroscivoc import datastreams as euroscivoc_datastreams
+from .mesh import datastreams as mesh_datastreams
 
 
 class SubjectsServiceWriter(ServiceWriter):
@@ -30,15 +29,22 @@ def _entry_id(self, entry):
         return entry["id"]
 
 
-VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
+VOCABULARIES_DATASTREAM_READERS = {
+    # Merge order matters: on a duplicate key the later unpacking
+    # (EuroSciVoc) replaces the earlier one (MeSH).
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
+}
 """Subjects Data Streams readers."""
 
-VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    # Merge order matters: on a duplicate key the later unpacking
+    # (EuroSciVoc) replaces the earlier one (MeSH).
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+}
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
-    **mesh_writers,
+    # The contrib mappings are unpacked after "subjects-service", so a contrib
+    # writer registered under that same key would replace SubjectsServiceWriter.
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""
 

diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py b/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc Subjects module."""
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2022-2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
+
+import io
+from collections import namedtuple
+
+import requests
+from rdflib import OWL, RDF, Graph, Namespace
+
+from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+
+
+class EuroSciVocSubjectsHTTPReader(BaseReader):
+    """Reader that downloads the EuroSciVoc RDF/XML file and yields its concepts.
+
+    Each ``skos:Concept`` found in the parsed graph is emitted as a
+    ``{"subject": ..., "rdf_graph": ...}`` dictionary.
+    """
+
+    # Seconds ``requests`` may wait for the connection / for data from the
+    # server. Without a timeout ``requests`` waits indefinitely, so an
+    # unresponsive server would block the whole datastream.
+    REQUEST_TIMEOUT = 60
+
+    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
+        """Initialize the reader with the data source.
+
+        :param origin: The URL from which to fetch the RDF data. Falls back to
+            ``SUBJECTS_EUROSCIVOC_FILE_URL`` when empty.
+        :param mode: Mode of operation (default is 'r' for reading).
+        :param since: Currently unused by this reader; it is neither stored
+            nor forwarded to the base class.
+        """
+        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
+        # NOTE(review): the base class receives the raw ``origin`` (possibly
+        # ``None``), not the resolved default stored in ``self.origin``.
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time.
+
+        :param rdf_graph: The RDF graph to process.
+        :yield: Dictionary with the concept (``"subject"``) and the whole graph
+            (``"rdf_graph"``), to be consumed by a transformer.
+        """
+        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+
+        # Only nodes explicitly typed as skos:Concept are emitted.
+        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
+            yield {"subject": subject, "rdf_graph": rdf_graph}
+
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
+
+        :param item: Must be empty; this reader cannot be chained after another one.
+        :yield: One dictionary per EuroSciVoc concept (see ``_iter``).
+        :raises NotImplementedError: If ``item`` is provided.
+        :raises requests.RequestException: If the download fails, times out or
+            returns an HTTP error status.
+        """
+        if item:
+            raise NotImplementedError(
+                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
+            )
+        # Fetch the RDF data from the specified origin URL; the timeout keeps
+        # a stalled server from hanging the reader forever.
+        response = requests.get(self.origin, timeout=self.REQUEST_TIMEOUT)
+        response.raise_for_status()
+
+        # Treat the response content as a file-like object
+        rdf_data = io.BytesIO(response.content)
+
+        # Parse the RDF data into a graph
+        rdf_graph = Graph()
+        rdf_graph.parse(rdf_data, format="xml")
+
+        # Yield each processed subject from the RDF graph
+        yield from self._iter(rdf_graph)
+
+
+class EuroSciVocSubjectsTransformer(BaseTransformer):
+    """Turn EuroSciVoc RDF concepts into subject vocabulary dictionaries."""
+
+    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+    SPLITCHAR = ","
+
+    def _get_notation(self, subject, rdf_graph):
+        """Return the first all-digit ``skos:notation`` of a subject, else ``None``."""
+        pattern = (subject, self.SKOS_CORE.notation, None)
+        numeric_notations = (
+            str(value)
+            for _s, _p, value in rdf_graph.triples(pattern)
+            if str(value).isdigit()
+        )
+        return next(numeric_notations, None)
+
+    def _get_labels(self, subject, rdf_graph):
+        """Map language codes to labels for a subject.
+
+        ``skos:prefLabel`` values are collected first. Only when there is no
+        English preferred label are ``skos:altLabel`` values consulted, and
+        they only fill in languages that have no label yet.
+        """
+        by_language = {}
+        preferred = rdf_graph.triples((subject, self.SKOS_CORE.prefLabel, None))
+        for _s, _p, literal in preferred:
+            by_language[literal.language] = literal.value
+
+        if "en" in by_language:
+            return by_language
+
+        alternates = rdf_graph.triples((subject, self.SKOS_CORE.altLabel, None))
+        for _s, _p, literal in alternates:
+            by_language.setdefault(literal.language, literal.value)
+        return by_language
+
+    def _find_parents(self, subject, rdf_graph):
+        """List the numeric notations of the subject's ``skos:broader`` ancestors.
+
+        Ancestors are visited in the order ``transitive_objects`` yields them;
+        the subject itself and ancestors without a numeric notation are left out.
+        """
+        ancestors = rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader)
+        notations = (
+            self._get_notation(node, rdf_graph)
+            for node in ancestors
+            if node != subject
+        )
+        return [notation for notation in notations if notation]
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Build the subject dictionary for a single concept."""
+        notation = self._get_notation(subject, rdf_graph)
+        labels = self._get_labels(subject, rdf_graph)
+
+        # Prefixed ancestor ids, reversed with respect to the traversal order
+        # and joined into a single SPLITCHAR-separated string.
+        ancestor_ids = [
+            f"euroscivoc:{n}" for n in self._find_parents(subject, rdf_graph)
+        ]
+        ancestor_ids.reverse()
+        parents = self.SPLITCHAR.join(ancestor_ids)
+
+        entry = {
+            # Stays None when the concept has no numeric notation.
+            "id": f"euroscivoc:{notation}" if notation else None,
+            "scheme": "EuroSciVoc",
+            "subject": labels.get("en", "").capitalize(),
+            "title": labels,
+            "props": {},
+            # The concept URI itself is kept as a "url" identifier.
+            "identifiers": [{"scheme": "url", "identifier": str(subject)}],
+        }
+        if parents:
+            entry["props"] = {"parents": parents}
+        return entry
+
+    def apply(self, stream_entry, *args, **kwargs):
+        """Replace the payload of a stream entry with its subject dictionary.
+
+        :param stream_entry: Entry whose ``entry`` holds the ``"subject"`` and ``"rdf_graph"`` keys.
+        :return: The same stream entry, with ``entry`` set to the transformed data.
+        """
+        source = stream_entry.entry
+        stream_entry.entry = self._transform_entry(
+            source["subject"], source["rdf_graph"]
+        )
+        return stream_entry
+
+
+# Configuration for datastream readers, transformers, and writers
+# Name -> class registries for the datastream components defined in this module
+VOCABULARIES_DATASTREAM_READERS = {
+    "euroscivoc-reader": EuroSciVocSubjectsHTTPReader,
+}
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "euroscivoc-transformer": EuroSciVocSubjectsTransformer,
+}
+
+# This module contributes no writer of its own.
+VOCABULARIES_DATASTREAM_WRITERS = {}
+
+# Pipeline description: EuroSciVoc reader -> EuroSciVoc transformer ->
+# "subjects-service" writer (a writer name that is not registered in the
+# empty writers mapping above).
+DATASTREAM_CONFIG = {
+    "readers": [{"type": "euroscivoc-reader"}],
+    "transformers": [{"type": "euroscivoc-transformer"}],
+    "writers": [{"type": "subjects-service"}],
+}
diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
@@ -30,6 +30,22 @@
       "description": "Human readable label in different languages.",
       "$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
     },
+    "props": {
+      "type": "object",
+      "patternProperties": {
+        "^.*$": {
+          "type": "string"
+        }
+      }
+    },
+    "identifiers": {
+      "description": "Alternate identifiers for the subject.",
+      "type": "array",
+      "items": {
+        "$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
+      },
+      "uniqueItems": true
+    },
     "synonyms": {
       "description": "Synonyms of the subject label.",
       "type": "array",

diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json
@@ -71,6 +71,20 @@
         "type": "object",
         "dynamic": "true"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "synonyms": {
         "type": "text"
       },

diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json
@@ -74,6 +74,20 @@
       "synonyms": {
         "type": "text"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "tags": {
         "type": "keyword"
       }

diff --git a/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json
@@ -71,6 +71,20 @@
         "type": "object",
         "dynamic": "true"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "synonyms": {
         "type": "text"
       },

diff --git a/invenio_vocabularies/contrib/subjects/mesh/__init__.py b/invenio_vocabularies/contrib/subjects/mesh/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""MeSH Subjects module."""
diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py
@@ -10,15 +10,19 @@
 
 """Subjects schema."""
 
+from functools import partial
+
 from invenio_i18n import get_locale
-from marshmallow import fields, pre_load
-from marshmallow_utils.fields import SanitizedUnicode
+from marshmallow import Schema, fields, pre_load
+from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
+from marshmallow_utils.schemas import IdentifierSchema
 
 from ...services.schema import (
     BaseVocabularySchema,
     ContribVocabularyRelationSchema,
     i18n_strings,
 )
+from .config import subject_schemes
 
 
 class SubjectSchema(BaseVocabularySchema):
@@ -31,6 +35,16 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
+    props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
+    identifiers = IdentifierSet(
+        fields.Nested(
+            partial(
+                IdentifierSchema,
+                allowed_schemes=subject_schemes,
+                identifier_required=False,
+            )
+        )
+    )
     synonyms = fields.List(SanitizedUnicode())
 
     @pre_load
@@ -48,6 +62,16 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema):
     ftf_name = "subject"
     parent_field_name = "subjects"
     subject = SanitizedUnicode()
-    scheme = SanitizedUnicode()
-    title = i18n_strings
-    synonyms = fields.List(SanitizedUnicode())
+    scheme = SanitizedUnicode(dump_only=True)
+    title = fields.Dict(dump_only=True)
+    props = fields.Dict(dump_only=True)
+    identifiers = IdentifierSet(
+        fields.Nested(
+            partial(
+                IdentifierSchema,
+                allowed_schemes=subject_schemes,
+                identifier_required=False,
+            )
+        )
+    )
+    synonyms = fields.List(SanitizedUnicode(), dump_only=True)