From 832b8ef671d94de83e699c08bb605d2b8004fb20 Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Thu, 29 Aug 2024 09:40:09 +0200
Subject: [PATCH 1/5] subjects: added euroscivoc datastream

---
 .../contrib/subjects/datastreams.py           |  18 ++-
 .../contrib/subjects/euroscivoc/__init__.py   |   9 ++
 .../subjects/euroscivoc/datastreams.py        | 147 ++++++++++++++++++
 .../contrib/subjects/mesh/__init__.py         |   9 ++
 setup.cfg                                     |   1 +
 .../test_subjects_euroscivoc_datastream.py    | 101 ++++++++++++
 6 files changed, 279 insertions(+), 6 deletions(-)
 create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py
 create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
 create mode 100644 invenio_vocabularies/contrib/subjects/mesh/__init__.py
 create mode 100644 tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py

diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py
index 04925290..eeeaec50 100644
--- a/invenio_vocabularies/contrib/subjects/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/datastreams.py
@@ -12,9 +12,8 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
-from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
+from .euroscivoc import datastreams as euroscivoc_datastreams
+from .mesh import datastreams as mesh_datastreams
 
 
 class SubjectsServiceWriter(ServiceWriter):
@@ -30,15 +29,22 @@ def _entry_id(self, entry):
         return entry["id"]
 
 
-VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
+VOCABULARIES_DATASTREAM_READERS = {
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
+}
 """Subjects Data Streams readers."""
 
-VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
+}
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
-    **mesh_writers,
+    **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
+    **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
 }
 """Subjects Data Streams writers."""
 
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py b/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py
new file mode 100644
index 00000000..a99c582c
--- /dev/null
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc Subjects module."""
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
new file mode 100644
index 00000000..13114abe
--- /dev/null
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2022-2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
+
+import io
+from collections import namedtuple
+
+import requests
+from rdflib import OWL, RDF, Graph, Namespace
+
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+
+
+class EuroSciVocSubjectsHTTPReader(BaseReader):
+    """Reader class to fetch and process EuroSciVoc RDF data."""
+
+    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
+        """Initialize the reader with the data source.
+
+        :param origin: The URL from which to fetch the RDF data.
+        :param mode: Mode of operation (default is 'r' for reading).
+        """
+        self.origin = (
+            origin
+            or "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
+        )
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time.
+
+        :param rdf_graph: The RDF graph to process.
+        :yield: Subject and graph to be transformed.
+        """
+        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+
+        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
+            yield {"subject": subject, "rdf_graph": rdf_graph}
+
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
+
+        :param item: Must be empty; chaining after another reader is not supported.
+        :yield: Processed EuroSciVoc subject data.
+        """
+        if item:
+            raise NotImplementedError(
+                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
+            )
+        # Fetch the RDF data from the specified origin URL
+        response = requests.get(self.origin)
+        response.raise_for_status()
+
+        # Treat the response content as a file-like object
+        rdf_data = io.BytesIO(response.content)
+
+        # Parse the RDF data into a graph
+        rdf_graph = Graph()
+        rdf_graph.parse(rdf_data, format="xml")
+
+        # Yield each processed subject from the RDF graph
+        yield from self._iter(rdf_graph)
+
+
+class EuroSciVocSubjectsTransformer(BaseTransformer):
+    """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+
+    def _transform_entry(self, subject, rdf_graph):
+        """Transform an entry to the required dictionary format."""
+        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+        Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])
+        # Initialize entry fields
+        languages = {}
+        pref_label = None
+
+        for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)):
+            languages[label.language] = label.value
+            if label.language == "en":
+                pref_label = label.value
+
+        # Fallback to alternative labels if no preferred label in English
+        if not pref_label:
+            for _, _, label in rdf_graph.triples((subject, SKOS_CORE.altLabel, None)):
+                if label.language not in languages:
+                    languages[label.language] = label.value
+                if label.language == "en":
+                    pref_label = label.value
+                    break
+
+        title = languages
+        entry = Entry(str(subject), "EuroSciVoc", pref_label, title, {})
+        return entry
+
+    def _as_dict(self, entry):
+        """Convert an entry to a dictionary."""
+        return {
+            "id": entry.id,
+            "scheme": entry.scheme,
+            "subject": entry.subject,
+            "title": entry.title,
+        }
+
+    def apply(self, stream_entry, *args, **kwargs):
+        """Transform a stream entry to the required dictionary format.
+
+        :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph.
+        :return: The transformed stream entry.
+        """
+        # Apply transformations
+        entry_data = self._transform_entry(
+            stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
+        )
+        entry_data = self._as_dict(entry_data)
+        stream_entry.entry = entry_data  # Update the stream entry with transformed data
+        return stream_entry
+
+
+# Configuration for datastream readers, transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
+
+VOCABULARIES_DATASTREAM_WRITERS = {}
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "euroscivoc-transformer": EuroSciVocSubjectsTransformer
+}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "euroscivoc-reader",
+        }
+    ],
+    "transformers": [{"type": "euroscivoc-transformer"}],
+    "writers": [
+        {
+            "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}},
+            "type": "async",
+        }
+    ],
+}
diff --git a/invenio_vocabularies/contrib/subjects/mesh/__init__.py b/invenio_vocabularies/contrib/subjects/mesh/__init__.py
new file mode 100644
index 00000000..9b6ae5b8
--- /dev/null
+++ b/invenio_vocabularies/contrib/subjects/mesh/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2024 CERN.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""MeSH Subjects module."""
diff --git a/setup.cfg b/setup.cfg
index 9d819d0a..1db3fa6a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,7 @@ install_requires =
     lxml>=4.5.0
     PyYAML>=5.4.1
     regex>=2024.7.24
+    rdflib>=7.0.0
 
 [options.extras_require]
 s3fs =
diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
new file mode 100644
index 00000000..b6ae384e
--- /dev/null
+++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
@@ -0,0 +1,101 @@
+import io
+import unittest
+from unittest.mock import Mock, patch
+
+import pytest
+from rdflib import RDF, Graph, Namespace, URIRef
+
+from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import (  # Adjust import based on your module name
+    EuroSciVocSubjectsHTTPReader,
+    EuroSciVocSubjectsTransformer,
+)
+from invenio_vocabularies.datastreams.datastreams import StreamEntry
+from invenio_vocabularies.datastreams.errors import ReaderError, TransformerError
+
+XML_DATA_PREF_LABEL = bytes(
+    """<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+    <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
+        <startDate xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</startDate>
+        <status xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+        <rdf:type rdf:resource="http://www.w3.org/2004/02/skos/core#Concept"/>
+        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">broadcastingsatellite service</altLabel>
+        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio system</altLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="de">Satellitenfunk</prefLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio</prefLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="fr">radio satellite</prefLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="es">radio por satélite</prefLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="it">radio satellitare</prefLabel>
+        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="pl">radio satelitarne</prefLabel>
+    </rdf:Description>
+</rdf:RDF>""",
+    encoding="utf-8",
+)
+
+
+XML_DATA_ALT_LABEL = bytes(
+    """<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+    <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
+        <startDate xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</startDate>
+        <status xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+        <rdf:type rdf:resource="http://www.w3.org/2004/02/skos/core#Concept"/>
+        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">broadcastingsatellite service</altLabel>
+        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio system</altLabel>
+    </rdf:Description>
+</rdf:RDF>""",
+    encoding="utf-8",
+)
+
+
+@pytest.fixture(scope="module")
+def expected_from_rdf_pref_label():
+    return {
+        "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
+        "scheme": "EuroSciVoc",
+        "subject": "satellite radio",
+        "title": {
+            "de": "Satellitenfunk",
+            "en": "satellite radio",
+            "fr": "radio satellite",
+            "es": "radio por satélite",
+            "it": "radio satellitare",
+            "pl": "radio satelitarne",
+        },
+    }
+
+
+@pytest.fixture(scope="module")
+def expected_from_rdf_alt_label():
+    return {
+        "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
+        "scheme": "EuroSciVoc",
+        "subject": "broadcastingsatellite service",
+        "title": {"en": "broadcastingsatellite service"},
+    }
+
+
+def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label):
+    reader = EuroSciVocSubjectsHTTPReader()
+    rdf_data = io.BytesIO(XML_DATA_PREF_LABEL)
+    rdf_graph = Graph()
+    rdf_graph.parse(rdf_data, format="xml")
+    stream_entries = list(reader._iter(rdf_graph))
+    assert len(stream_entries) > 0
+    transformer = EuroSciVocSubjectsTransformer()
+    for entry in stream_entries:
+        result = transformer.apply(StreamEntry(entry))
+        assert expected_from_rdf_pref_label == result.entry
+
+
+def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label):
+    reader = EuroSciVocSubjectsHTTPReader()
+    rdf_data = io.BytesIO(XML_DATA_ALT_LABEL)
+    rdf_graph = Graph()
+    rdf_graph.parse(rdf_data, format="xml")
+    stream_entries = list(reader._iter(rdf_graph))
+    assert len(stream_entries) > 0
+    transformer = EuroSciVocSubjectsTransformer()
+    for entry in stream_entries:
+        result = transformer.apply(StreamEntry(entry))
+        assert expected_from_rdf_alt_label == result.entry

From c2c0c013d5a826ae467ccf5d4bb959de435c8ad6 Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Wed, 11 Sep 2024 14:18:09 +0200
Subject: [PATCH 2/5] subjects: updated id and added hierarchy

---
 .../subjects/euroscivoc/datastreams.py        | 90 ++++++++++++++-----
 .../jsonschemas/subjects/subject-v1.0.0.json  |  8 ++
 .../contrib/subjects/schema.py                |  2 +
 3 files changed, 76 insertions(+), 24 deletions(-)

diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
index 13114abe..7e2ff11d 100644
--- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -71,32 +71,73 @@ def read(self, item=None, *args, **kwargs):
 
 class EuroSciVocSubjectsTransformer(BaseTransformer):
     """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+
+    def _get_notation(self, subject, rdf_graph):
+        """Extract the numeric notation for a subject, ignoring UUID-style notations."""
+        for _, _, notation in rdf_graph.triples((subject, self.SKOS_CORE.notation, None)):
+            notation_str = str(notation)
+            if notation_str.isdigit():
+                return notation_str
+        return None
+
+    def _get_labels(self, subject, rdf_graph):
+        """Extract prefLabel and altLabel languages for a subject."""
+        labels = {}
+        for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.prefLabel, None)):
+            labels[label.language] = label.value
+
+        if "en" not in labels:
+            for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.altLabel, None)):
+                if label.language not in labels:
+                    labels[label.language] = label.value
+
+        return labels
+
+    def _find_parents(self, subject, rdf_graph):
+        """Find the parent notations of a subject."""
+        parents = []
+        previous_parent = None
+
+        while True:
+            broader_found = False
+            for _, _, parent in rdf_graph.triples((subject, self.SKOS_CORE.broader, None)):
+                if previous_parent is not None:
+                    parents.append(self._get_notation(previous_parent, rdf_graph))
+                previous_parent = parent
+                subject = parent
+                broader_found = True
+                break
+
+            if not broader_found:
+                if previous_parent is not None:
+                    parents.append(self._get_notation(previous_parent, rdf_graph))
+                break
+
+        return parents
 
     def _transform_entry(self, subject, rdf_graph):
         """Transform an entry to the required dictionary format."""
-        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
         Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])
-        # Initialize entry fields
-        languages = {}
-        pref_label = None
-
-        for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)):
-            languages[label.language] = label.value
-            if label.language == "en":
-                pref_label = label.value
-
-        # Fallback to alternative labels if no preferred label in English
-        if not pref_label:
-            for _, _, label in rdf_graph.triples((subject, SKOS_CORE.altLabel, None)):
-                if label.language not in languages:
-                    languages[label.language] = label.value
-                if label.language == "en":
-                    pref_label = label.value
-                    break
-
-        title = languages
-        entry = Entry(str(subject), "EuroSciVoc", pref_label, title, {})
-        return entry
+
+        # Get subject notation with euroscivoc prefix
+        notation = self._get_notation(subject, rdf_graph)
+        id = f"euroscivoc:{notation}" if notation else None
+
+        # Get labels for the current subject
+        languages = self._get_labels(subject, rdf_graph)
+        pref_label = languages.get("en", "")
+
+        # Find parent notations in order from top parent to lowest
+        parent_notations = self._find_parents(subject, rdf_graph)
+        parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)]
+
+        # Store parent notations with euroscivoc prefix in props
+        props = {
+            "parents": parents
+        }
+
+        return Entry(id, "EuroSciVoc", pref_label.capitalize(), languages, props)
 
     def _as_dict(self, entry):
         """Convert an entry to a dictionary."""
@@ -105,6 +146,7 @@ def _as_dict(self, entry):
             "scheme": entry.scheme,
             "subject": entry.subject,
             "title": entry.title,
+            "props": entry.props,
         }
 
     def apply(self, stream_entry, *args, **kwargs):
@@ -117,11 +159,11 @@ def apply(self, stream_entry, *args, **kwargs):
         entry_data = self._transform_entry(
             stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
         )
-        entry_data = self._as_dict(entry_data)
-        stream_entry.entry = entry_data  # Update the stream entry with transformed data
+        stream_entry.entry = self._as_dict(entry_data)
         return stream_entry
 
 
+
 # Configuration for datastream readers, transformers, and writers
 VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
 
diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
index c4c52fcf..6a50454a 100644
--- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
@@ -30,6 +30,14 @@
       "description": "Human readable label in different languages.",
       "$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
     },
+    "props": {
+      "type": "object",
+      "patternProperties": {
+        "^.*$": {
+          "type": "array"
+        }
+      }
+    },
     "synonyms": {
       "description": "Synonyms of the subject label.",
       "type": "array",
diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py
index 99691a16..836b6e67 100644
--- a/invenio_vocabularies/contrib/subjects/schema.py
+++ b/invenio_vocabularies/contrib/subjects/schema.py
@@ -31,6 +31,7 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
+    props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) 
     synonyms = fields.List(SanitizedUnicode())
 
     @pre_load
@@ -50,4 +51,5 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema):
     subject = SanitizedUnicode()
     scheme = SanitizedUnicode()
     title = i18n_strings
+    props =  fields.Dict() 
     synonyms = fields.List(SanitizedUnicode())

From 395c3d194ddeeaea92f9869abfeb48e6d2d57ccb Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Thu, 12 Sep 2024 12:49:50 +0200
Subject: [PATCH 3/5] subjects: added identifier, updated mappings

---
 invenio_vocabularies/config.py                |  1 +
 .../subjects/euroscivoc/datastreams.py        | 32 ++++++++++++-------
 .../jsonschemas/subjects/subject-v1.0.0.json  |  8 +++++
 .../os-v1/subjects/subject-v1.0.0.json        | 14 ++++++++
 .../os-v2/subjects/subject-v1.0.0.json        | 14 ++++++++
 .../mappings/v7/subjects/subject-v1.0.0.json  | 14 ++++++++
 .../contrib/subjects/schema.py                | 21 +++++++++---
 7 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
index ef121034..f895b9cb 100644
--- a/invenio_vocabularies/config.py
+++ b/invenio_vocabularies/config.py
@@ -107,6 +107,7 @@
 
 VOCABULARIES_SUBJECTS_SCHEMES = {
     "gnd": {"label": _("GND"), "validator": idutils.is_gnd, "datacite": "GND"},
+    "url": {"label": _("URL"), "validator": idutils.is_url},
 }
 """Subjects allowed identifier schemes."""
 
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
index 7e2ff11d..11fee339 100644
--- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -71,11 +71,14 @@ def read(self, item=None, *args, **kwargs):
 
 class EuroSciVocSubjectsTransformer(BaseTransformer):
     """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+
     SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
 
     def _get_notation(self, subject, rdf_graph):
         """Extract the numeric notation for a subject, ignoring UUID-style notations."""
-        for _, _, notation in rdf_graph.triples((subject, self.SKOS_CORE.notation, None)):
+        for _, _, notation in rdf_graph.triples(
+            (subject, self.SKOS_CORE.notation, None)
+        ):
             notation_str = str(notation)
             if notation_str.isdigit():
                 return notation_str
@@ -88,7 +91,9 @@ def _get_labels(self, subject, rdf_graph):
             labels[label.language] = label.value
 
         if "en" not in labels:
-            for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.altLabel, None)):
+            for _, _, label in rdf_graph.triples(
+                (subject, self.SKOS_CORE.altLabel, None)
+            ):
                 if label.language not in labels:
                     labels[label.language] = label.value
 
@@ -101,7 +106,9 @@ def _find_parents(self, subject, rdf_graph):
 
         while True:
             broader_found = False
-            for _, _, parent in rdf_graph.triples((subject, self.SKOS_CORE.broader, None)):
+            for _, _, parent in rdf_graph.triples(
+                (subject, self.SKOS_CORE.broader, None)
+            ):
                 if previous_parent is not None:
                     parents.append(self._get_notation(previous_parent, rdf_graph))
                 previous_parent = parent
@@ -118,7 +125,9 @@ def _find_parents(self, subject, rdf_graph):
 
     def _transform_entry(self, subject, rdf_graph):
         """Transform an entry to the required dictionary format."""
-        Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])
+        Entry = namedtuple(
+            "Entry", ["id", "scheme", "subject", "title", "identifiers", "props"]
+        )
 
         # Get subject notation with euroscivoc prefix
         notation = self._get_notation(subject, rdf_graph)
@@ -133,11 +142,13 @@ def _transform_entry(self, subject, rdf_graph):
         parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)]
 
         # Store parent notations with euroscivoc prefix in props
-        props = {
-            "parents": parents
-        }
+        props = {"parents": parents}
 
-        return Entry(id, "EuroSciVoc", pref_label.capitalize(), languages, props)
+        # Create identifiers list
+        identifiers = [{"scheme": "url", "identifier": str(subject)}]
+        return Entry(
+            id, "EuroSciVoc", pref_label.capitalize(), languages, identifiers, props
+        )
 
     def _as_dict(self, entry):
         """Convert an entry to a dictionary."""
@@ -147,6 +158,7 @@ def _as_dict(self, entry):
             "subject": entry.subject,
             "title": entry.title,
             "props": entry.props,
+            "identifiers": entry.identifiers,
         }
 
     def apply(self, stream_entry, *args, **kwargs):
@@ -163,7 +175,6 @@ def apply(self, stream_entry, *args, **kwargs):
         return stream_entry
 
 
-
 # Configuration for datastream readers, transformers, and writers
 VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
 
@@ -182,8 +193,7 @@ def apply(self, stream_entry, *args, **kwargs):
     "transformers": [{"type": "euroscivoc-transformer"}],
     "writers": [
         {
-            "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}},
-            "type": "async",
+            "type": "subjects-service",
         }
     ],
 }
diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
index 6a50454a..7b1c7451 100644
--- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
@@ -38,6 +38,14 @@
         }
       }
     },
+    "identifiers": {
+      "description": "Alternate identifiers for the subject.",
+      "type": "array",
+      "items": {
+        "$ref": "local://definitions-v1.0.0.json#/identifiers_with_scheme"
+      },
+      "uniqueItems": true
+    },
     "synonyms": {
       "description": "Synonyms of the subject label.",
       "type": "array",
diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json
index 349277df..d845b8b4 100644
--- a/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json
@@ -71,6 +71,20 @@
         "type": "object",
         "dynamic": "true"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "synonyms": {
         "type": "text"
       },
diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json
index 349277df..d29ba8fd 100644
--- a/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json
@@ -74,6 +74,20 @@
       "synonyms": {
         "type": "text"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "tags": {
         "type": "keyword"
       }
diff --git a/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json
index 349277df..d845b8b4 100644
--- a/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json
@@ -71,6 +71,20 @@
         "type": "object",
         "dynamic": "true"
       },
+      "props": {
+        "type": "object",
+        "dynamic": "true"
+      },
+      "identifiers": {
+        "properties": {
+          "identifier": {
+            "type": "keyword"
+          },
+          "scheme": {
+            "type": "keyword"
+          }
+        }
+      },
       "synonyms": {
         "type": "text"
       },
diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py
index 836b6e67..397a8ccc 100644
--- a/invenio_vocabularies/contrib/subjects/schema.py
+++ b/invenio_vocabularies/contrib/subjects/schema.py
@@ -10,15 +10,19 @@
 
 """Subjects schema."""
 
+from functools import partial
+
 from invenio_i18n import get_locale
-from marshmallow import fields, pre_load
-from marshmallow_utils.fields import SanitizedUnicode
+from marshmallow import Schema, fields, pre_load
+from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
+from marshmallow_utils.schemas import IdentifierSchema
 
 from ...services.schema import (
     BaseVocabularySchema,
     ContribVocabularyRelationSchema,
     i18n_strings,
 )
+from .config import subject_schemes
 
 
 class SubjectSchema(BaseVocabularySchema):
@@ -31,7 +35,16 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
-    props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) 
+    props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode()))
+    identifiers = IdentifierSet(
+        fields.Nested(
+            partial(
+                IdentifierSchema,
+                allowed_schemes=subject_schemes,
+                identifier_required=False,
+            )
+        )
+    )
     synonyms = fields.List(SanitizedUnicode())
 
     @pre_load
@@ -51,5 +64,5 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema):
     subject = SanitizedUnicode()
     scheme = SanitizedUnicode()
     title = i18n_strings
-    props =  fields.Dict() 
+    props = fields.Dict()
     synonyms = fields.List(SanitizedUnicode())

From 3e2a5a9cb3151ed184453fdde1cc195dedec695a Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Thu, 12 Sep 2024 15:21:18 +0200
Subject: [PATCH 4/5] subjects: updated tests for euroscivoc

---
 .../subjects/euroscivoc/datastreams.py        |  11 +-
 .../test_subjects_euroscivoc_datastream.py    | 165 +++++++++++++-----
 2 files changed, 135 insertions(+), 41 deletions(-)

diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
index 11fee339..093ed7df 100644
--- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -139,10 +139,17 @@ def _transform_entry(self, subject, rdf_graph):
 
         # Find parent notations in order from top parent to lowest
         parent_notations = self._find_parents(subject, rdf_graph)
-        parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)]
+
+        parents = [
+            f"euroscivoc:{notation}"
+            for notation in reversed(parent_notations)
+            if notation is not None
+        ]
 
         # Store parent notations with euroscivoc prefix in props
-        props = {"parents": parents}
+        props = {}
+        if parents:
+            props["parents"] = parents
 
         # Create identifiers list
         identifiers = [{"scheme": "url", "identifier": str(subject)}]
diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
index b6ae384e..acd09c89 100644
--- a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
+++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
@@ -14,20 +14,58 @@
 
 XML_DATA_PREF_LABEL = bytes(
     """<?xml version="1.0" encoding="UTF-8"?>
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:ns5="http://publications.europa.eu/ontology/euvoc#" xmlns:ns6="http://www.w3.org/2008/05/skos-xl#" xmlns:dc="http://purl.org/dc/elements/1.1/">
     <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
-        <startDate xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</startDate>
-        <status xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
         <rdf:type rdf:resource="http://www.w3.org/2004/02/skos/core#Concept"/>
-        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">broadcastingsatellite service</altLabel>
-        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio system</altLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="de">Satellitenfunk</prefLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio</prefLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="fr">radio satellite</prefLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="es">radio por satélite</prefLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="it">radio satellitare</prefLabel>
-        <prefLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="pl">radio satelitarne</prefLabel>
+        <dcterms:created rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</dcterms:created>
+        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</dcterms:modified>
+        <owl:versionInfo>1.1.0</owl:versionInfo>
+        <skos:inScheme rdf:resource="http://data.europa.eu/8mn/euroscivoc/40c0f173-baa3-48a3-9fe6-d6e8fb366a00"/>
+        <skos:prefLabel xml:lang="it">radio satellitare</skos:prefLabel>
+        <skos:prefLabel xml:lang="pl">radio satelitarne</skos:prefLabel>
+        <skos:prefLabel xml:lang="fr">radio satellite</skos:prefLabel>
+        <skos:prefLabel xml:lang="es">radio por satélite</skos:prefLabel>
+        <skos:prefLabel xml:lang="de">Satellitenfunk</skos:prefLabel>
+        <skos:prefLabel xml:lang="en">satellite radio</skos:prefLabel>
+        <ns5:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</ns5:startDate>
+        <skos:notation>87ff3577-527a-4a40-9c76-2f9d3075e2ba</skos:notation>
+        <skos:notation>1717</skos:notation>
+        <ns5:status rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+        <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/7f282340-9125-4b0d-aceb-23389311e306"/>
+        <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/notation_6014c8c4f809ee00ced00312669d98ad"/>
+        <skos:altLabel xml:lang="en">broadcastingsatellite service</skos:altLabel>
+        <skos:altLabel xml:lang="en">satellite radio system</skos:altLabel>
+        <skos:broader rdf:resource="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f"/>
+        <dc:identifier>87ff3577-527a-4a40-9c76-2f9d3075e2ba</dc:identifier>
+        <owl:deprecated rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">false</owl:deprecated>
+        </rdf:Description>
+        <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f">
+        <skos:narrower rdf:resource="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba"/>
     </rdf:Description>
+     <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
+            <skos:broader rdf:resource="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f"/>
+            </rdf:Description>
+            <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/8d83b645-355f-4cf1-abf3-ce4cd3172c34">
+            <skos:broader rdf:resource="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f"/>
+            </rdf:Description>
+            <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f">
+            <rdf:type rdf:resource="http://www.w3.org/2004/02/skos/core#Concept"/>
+            <dcterms:created rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</dcterms:created>
+            <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2023-03-10</dcterms:modified>
+            <owl:versionInfo>1.3.4</owl:versionInfo>
+            <skos:inScheme rdf:resource="http://data.europa.eu/8mn/euroscivoc/40c0f173-baa3-48a3-9fe6-d6e8fb366a00"/>
+            <ns5:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</ns5:startDate>
+            <skos:notation>1225</skos:notation>
+            <skos:notation>d913bd42-e79c-46a7-8714-14f2a6a0d82f</skos:notation>
+            <ns5:status rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+            <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/notation_1391bd2069b8dde8f6394b6a3b4241cb"/>
+            <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/1faa45c8-4f46-45d6-9d6f-447a2626125d"/>
+            <skos:altLabel xml:lang="en">radio channel</skos:altLabel>
+            <skos:altLabel xml:lang="en">wireless</skos:altLabel>
+            <skos:broader rdf:resource="http://data.europa.eu/8mn/euroscivoc/1198b23a-f82f-4189-8778-d9a742430a0f"/>
+            <dc:identifier>d913bd42-e79c-46a7-8714-14f2a6a0d82f</dc:identifier>
+            <owl:deprecated rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">false</owl:deprecated>
+        </rdf:Description>
 </rdf:RDF>""",
     encoding="utf-8",
 )
@@ -35,47 +73,92 @@
 
 XML_DATA_ALT_LABEL = bytes(
     """<?xml version="1.0" encoding="UTF-8"?>
-<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
-    <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
-        <startDate xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</startDate>
-        <status xmlns="http://publications.europa.eu/ontology/euvoc#" rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+        <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:owl="http://www.w3.org/2002/07/owl#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:ns5="http://publications.europa.eu/ontology/euvoc#" xmlns:ns6="http://www.w3.org/2008/05/skos-xl#" xmlns:dc="http://purl.org/dc/elements/1.1/">
+        <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba">
         <rdf:type rdf:resource="http://www.w3.org/2004/02/skos/core#Concept"/>
-        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">broadcastingsatellite service</altLabel>
-        <altLabel xmlns="http://www.w3.org/2004/02/skos/core#" xml:lang="en">satellite radio system</altLabel>
-    </rdf:Description>
+        <dcterms:created rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</dcterms:created>
+        <dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</dcterms:modified>
+        <owl:versionInfo>1.1.0</owl:versionInfo>
+        <skos:inScheme rdf:resource="http://data.europa.eu/8mn/euroscivoc/40c0f173-baa3-48a3-9fe6-d6e8fb366a00"/>
+        <ns5:startDate rdf:datatype="http://www.w3.org/2001/XMLSchema#date">2019-12-02</ns5:startDate>
+        <skos:notation>87ff3577-527a-4a40-9c76-2f9d3075e2ba</skos:notation>
+        <skos:notation>1717</skos:notation>
+        <ns5:status rdf:resource="http://publications.europa.eu/resource/authority/concept-status/CURRENT"/>
+        <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/7f282340-9125-4b0d-aceb-23389311e306"/>
+        <ns5:xlNotation rdf:resource="http://data.europa.eu/8mn/euroscivoc/notation_6014c8c4f809ee00ced00312669d98ad"/>
+        <skos:altLabel xml:lang="en">broadcastingsatellite service</skos:altLabel>
+        <skos:altLabel xml:lang="en">satellite radio system</skos:altLabel>
+        <skos:broader rdf:resource="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f"/>
+        <dc:identifier>87ff3577-527a-4a40-9c76-2f9d3075e2ba</dc:identifier>
+        <owl:deprecated rdf:datatype="http://www.w3.org/2001/XMLSchema#boolean">false</owl:deprecated>
+        </rdf:Description>
+        <rdf:Description rdf:about="http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f">
+        <skos:narrower rdf:resource="http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba"/>
+        </rdf:Description>
 </rdf:RDF>""",
     encoding="utf-8",
 )
 
 
 @pytest.fixture(scope="module")
-def expected_from_rdf_pref_label():
-    return {
-        "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
-        "scheme": "EuroSciVoc",
-        "subject": "satellite radio",
-        "title": {
-            "de": "Satellitenfunk",
-            "en": "satellite radio",
-            "fr": "radio satellite",
-            "es": "radio por satélite",
-            "it": "radio satellitare",
-            "pl": "radio satelitarne",
+def expected_from_rdf_pref_label_with_parent():
+    return [
+        {
+            "id": "euroscivoc:1717",
+            "scheme": "EuroSciVoc",
+            "subject": "Satellite radio",
+            "title": {
+                "it": "radio satellitare",
+                "pl": "radio satelitarne",
+                "fr": "radio satellite",
+                "es": "radio por satélite",
+                "de": "Satellitenfunk",
+                "en": "satellite radio",
+            },
+            "props": {"parents": ["euroscivoc:1225"]},
+            "identifiers": [
+                {
+                    "scheme": "url",
+                    "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
+                }
+            ],
         },
-    }
+        {
+            "id": "euroscivoc:1225",
+            "scheme": "EuroSciVoc",
+            "subject": "Radio channel",
+            "title": {"en": "radio channel"},
+            "props": {},
+            "identifiers": [
+                {
+                    "scheme": "url",
+                    "identifier": "http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f",
+                }
+            ],
+        },
+    ]
 
 
 @pytest.fixture(scope="module")
-def expected_from_rdf_alt_label():
+def expected_from_rdf_alt_label_without_parent():
     return {
-        "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
+        "id": "euroscivoc:1717",
         "scheme": "EuroSciVoc",
-        "subject": "broadcastingsatellite service",
+        "subject": "Broadcastingsatellite service",
         "title": {"en": "broadcastingsatellite service"},
+        "props": {},
+        "identifiers": [
+            {
+                "scheme": "url",
+                "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba",
+            }
+        ],
     }
 
 
-def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label):
+def test_euroscivoc_subjects_transformer_pref_label(
+    expected_from_rdf_pref_label_with_parent,
+):
     reader = EuroSciVocSubjectsHTTPReader()
     rdf_data = io.BytesIO(XML_DATA_PREF_LABEL)
     rdf_graph = Graph()
@@ -83,12 +166,16 @@ def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label
     stream_entries = list(reader._iter(rdf_graph))
     assert len(stream_entries) > 0
     transformer = EuroSciVocSubjectsTransformer()
+    result = []
     for entry in stream_entries:
-        result = transformer.apply(StreamEntry(entry))
-        assert expected_from_rdf_pref_label == result.entry
+        entry = transformer.apply(StreamEntry(entry)).entry
+        result.append(entry)
+    assert expected_from_rdf_pref_label_with_parent == result
 
 
-def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label):
+def test_euroscivoc_subjects_transformer_alt_label(
+    expected_from_rdf_alt_label_without_parent,
+):
     reader = EuroSciVocSubjectsHTTPReader()
     rdf_data = io.BytesIO(XML_DATA_ALT_LABEL)
     rdf_graph = Graph()
@@ -98,4 +185,4 @@ def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label):
     transformer = EuroSciVocSubjectsTransformer()
     for entry in stream_entries:
         result = transformer.apply(StreamEntry(entry))
-        assert expected_from_rdf_alt_label == result.entry
+        assert expected_from_rdf_alt_label_without_parent == result.entry

From 8b699559f3af8c72aff919fb5cc9fe8ef3a998f3 Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Wed, 25 Sep 2024 16:33:25 +0200
Subject: [PATCH 5/5] subjects: refactor and updates

---
 invenio_vocabularies/config.py                |  3 +
 .../subjects/euroscivoc/datastreams.py        | 99 ++++++-------------
 .../jsonschemas/subjects/subject-v1.0.0.json  |  4 +-
 .../contrib/subjects/schema.py                | 19 +++-
 .../test_subjects_euroscivoc_datastream.py    |  4 +-
 5 files changed, 53 insertions(+), 76 deletions(-)

diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py
index f895b9cb..f92880c6 100644
--- a/invenio_vocabularies/config.py
+++ b/invenio_vocabularies/config.py
@@ -163,6 +163,9 @@
 }
 """Vocabulary type search configuration."""
 
+SUBJECTS_EUROSCIVOC_FILE_URL = "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
+"""Default download URL of the EuroSciVoc RDF file (distribution 20231115-0)."""
+
 VOCABULARIES_ORCID_ACCESS_KEY = "TODO"
 """ORCID access key to access the s3 bucket."""
 VOCABULARIES_ORCID_SECRET_KEY = "TODO"
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
index 093ed7df..64aa59e6 100644
--- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -14,6 +14,7 @@
 import requests
 from rdflib import OWL, RDF, Graph, Namespace
 
+from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
 from invenio_vocabularies.datastreams.readers import BaseReader
 from invenio_vocabularies.datastreams.transformers import BaseTransformer
 
@@ -27,10 +28,7 @@ def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
         :param origin: The URL from which to fetch the RDF data.
         :param mode: Mode of operation (default is 'r' for reading).
         """
-        self.origin = (
-            origin
-            or "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
-        )
+        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
         super().__init__(origin=origin, mode=mode, *args, **kwargs)
 
     def _iter(self, rdf_graph):
@@ -73,99 +71,66 @@ class EuroSciVocSubjectsTransformer(BaseTransformer):
     """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
 
     SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+    SPLITCHAR = ","
 
     def _get_notation(self, subject, rdf_graph):
-        """Extract the numeric notation for a subject, ignoring UUID-style notations."""
+        """Return the first all-digit skos:notation of a subject, or None if absent."""
         for _, _, notation in rdf_graph.triples(
             (subject, self.SKOS_CORE.notation, None)
         ):
-            notation_str = str(notation)
-            if notation_str.isdigit():
-                return notation_str
+            if str(notation).isdigit():
+                return str(notation)
         return None
 
     def _get_labels(self, subject, rdf_graph):
         """Extract prefLabel and altLabel languages for a subject."""
-        labels = {}
-        for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.prefLabel, None)):
-            labels[label.language] = label.value
-
+        labels = {
+            label.language: label.value
+            for _, _, label in rdf_graph.triples(
+                (subject, self.SKOS_CORE.prefLabel, None)
+            )
+        }
         if "en" not in labels:
             for _, _, label in rdf_graph.triples(
                 (subject, self.SKOS_CORE.altLabel, None)
             ):
-                if label.language not in labels:
-                    labels[label.language] = label.value
-
+                labels.setdefault(label.language, label.value)
         return labels
 
     def _find_parents(self, subject, rdf_graph):
-        """Find the parent notations of a subject."""
+        """Collect numeric notations of all transitive skos:broader ancestors."""
         parents = []
-        previous_parent = None
 
-        while True:
-            broader_found = False
-            for _, _, parent in rdf_graph.triples(
-                (subject, self.SKOS_CORE.broader, None)
-            ):
-                if previous_parent is not None:
-                    parents.append(self._get_notation(previous_parent, rdf_graph))
-                previous_parent = parent
-                subject = parent
-                broader_found = True
-                break
-
-            if not broader_found:
-                if previous_parent is not None:
-                    parents.append(self._get_notation(previous_parent, rdf_graph))
-                break
+        # Traverse the broader hierarchy
+        for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader):
+            if broader != subject:  # Ensure we don't include the current subject
+                parent_notation = self._get_notation(broader, rdf_graph)
+                if parent_notation:
+                    parents.append(parent_notation)
 
         return parents
 
     def _transform_entry(self, subject, rdf_graph):
         """Transform an entry to the required dictionary format."""
-        Entry = namedtuple(
-            "Entry", ["id", "scheme", "subject", "title", "identifiers", "props"]
-        )
-
         # Get subject notation with euroscivoc prefix
         notation = self._get_notation(subject, rdf_graph)
         id = f"euroscivoc:{notation}" if notation else None
-
         # Get labels for the current subject
-        languages = self._get_labels(subject, rdf_graph)
-        pref_label = languages.get("en", "")
-
-        # Find parent notations in order from top parent to lowest
-        parent_notations = self._find_parents(subject, rdf_graph)
-
-        parents = [
-            f"euroscivoc:{notation}"
-            for notation in reversed(parent_notations)
-            if notation is not None
-        ]
-
-        # Store parent notations with euroscivoc prefix in props
-        props = {}
-        if parents:
-            props["parents"] = parents
-
+        labels = self._get_labels(subject, rdf_graph)
+        # Prefix parents with "euroscivoc:" and join them, reversed, using SPLITCHAR
+        parents = self.SPLITCHAR.join(
+            f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
+        )
         # Create identifiers list
         identifiers = [{"scheme": "url", "identifier": str(subject)}]
-        return Entry(
-            id, "EuroSciVoc", pref_label.capitalize(), languages, identifiers, props
-        )
 
-    def _as_dict(self, entry):
-        """Convert an entry to a dictionary."""
         return {
-            "id": entry.id,
-            "scheme": entry.scheme,
-            "subject": entry.subject,
-            "title": entry.title,
-            "props": entry.props,
-            "identifiers": entry.identifiers,
+            "id": id,
+            "scheme": "EuroSciVoc",
+            "subject": labels.get("en", "").capitalize(),
+            "title": labels,
+            "props": {"parents": parents} if parents else {},
+            "identifiers": identifiers,
         }
 
     def apply(self, stream_entry, *args, **kwargs):
@@ -178,7 +143,7 @@ def apply(self, stream_entry, *args, **kwargs):
         entry_data = self._transform_entry(
             stream_entry.entry["subject"], stream_entry.entry["rdf_graph"]
         )
-        stream_entry.entry = self._as_dict(entry_data)
+        stream_entry.entry = entry_data
         return stream_entry
 
 
diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
index 7b1c7451..1d0498b8 100644
--- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
+++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json
@@ -34,7 +34,7 @@
       "type": "object",
       "patternProperties": {
         "^.*$": {
-          "type": "array"
+          "type": "string"
         }
       }
     },
@@ -42,7 +42,7 @@
       "description": "Alternate identifiers for the subject.",
       "type": "array",
       "items": {
-        "$ref": "local://definitions-v1.0.0.json#/identifiers_with_scheme"
+        "$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
       },
       "uniqueItems": true
     },
diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py
index 397a8ccc..1eb6830c 100644
--- a/invenio_vocabularies/contrib/subjects/schema.py
+++ b/invenio_vocabularies/contrib/subjects/schema.py
@@ -35,7 +35,7 @@ class SubjectSchema(BaseVocabularySchema):
     scheme = SanitizedUnicode(required=True)
     subject = SanitizedUnicode(required=True)
     title = i18n_strings
-    props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode()))
+    props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
     identifiers = IdentifierSet(
         fields.Nested(
             partial(
@@ -62,7 +62,16 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema):
     ftf_name = "subject"
     parent_field_name = "subjects"
     subject = SanitizedUnicode()
-    scheme = SanitizedUnicode()
-    title = i18n_strings
-    props = fields.Dict()
-    synonyms = fields.List(SanitizedUnicode())
+    scheme = SanitizedUnicode(dump_only=True)
+    title = fields.Dict(dump_only=True)
+    props = fields.Dict(dump_only=True)
+    identifiers = IdentifierSet(
+        fields.Nested(
+            partial(
+                IdentifierSchema,
+                allowed_schemes=subject_schemes,
+                identifier_required=False,
+            )
+        )
+    )
+    synonyms = fields.List(SanitizedUnicode(), dump_only=True)
diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
index acd09c89..c8f62373 100644
--- a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
+++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py
@@ -5,7 +5,7 @@
 import pytest
 from rdflib import RDF, Graph, Namespace, URIRef
 
-from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import (  # Adjust import based on your module name
+from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import (
     EuroSciVocSubjectsHTTPReader,
     EuroSciVocSubjectsTransformer,
 )
@@ -115,7 +115,7 @@ def expected_from_rdf_pref_label_with_parent():
                 "de": "Satellitenfunk",
                 "en": "satellite radio",
             },
-            "props": {"parents": ["euroscivoc:1225"]},
+            "props": {"parents": "euroscivoc:1225"},
             "identifiers": [
                 {
                     "scheme": "url",