From 832b8ef671d94de83e699c08bb605d2b8004fb20 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Thu, 29 Aug 2024 09:40:09 +0200 Subject: [PATCH 1/5] subjects: added euroscivoc datastream --- .../contrib/subjects/datastreams.py | 18 ++- .../contrib/subjects/euroscivoc/__init__.py | 9 ++ .../subjects/euroscivoc/datastreams.py | 147 ++++++++++++++++++ .../contrib/subjects/mesh/__init__.py | 9 ++ setup.cfg | 1 + .../test_subjects_euroscivoc_datastream.py | 101 ++++++++++++ 6 files changed, 279 insertions(+), 6 deletions(-) create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py create mode 100644 invenio_vocabularies/contrib/subjects/mesh/__init__.py create mode 100644 tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py index 04925290..eeeaec50 100644 --- a/invenio_vocabularies/contrib/subjects/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/datastreams.py @@ -12,9 +12,8 @@ from invenio_i18n import lazy_gettext as _ from ...datastreams.writers import ServiceWriter -from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers -from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers -from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers +from .euroscivoc import datastreams as euroscivoc_datastreams +from .mesh import datastreams as mesh_datastreams class SubjectsServiceWriter(ServiceWriter): @@ -30,15 +29,22 @@ def _entry_id(self, entry): return entry["id"] -VOCABULARIES_DATASTREAM_READERS = {**mesh_readers} +VOCABULARIES_DATASTREAM_READERS = { + **mesh_datastreams.VOCABULARIES_DATASTREAM_READERS, + **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS, +} """Subjects Data Streams readers.""" -VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers} +VOCABULARIES_DATASTREAM_TRANSFORMERS = { + **mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS, + **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS, +} """Subjects Data Streams transformers.""" VOCABULARIES_DATASTREAM_WRITERS = { "subjects-service": SubjectsServiceWriter, - **mesh_writers, + **mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS, + **euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS, } """Subjects Data Streams writers.""" diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py b/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py new file mode 100644 index 00000000..a99c582c --- /dev/null +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""EuroSciVoc Subjects module.""" diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py new file mode 100644 index 00000000..13114abe --- /dev/null +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2022-2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""EuroSciVoc subjects datastreams, readers, transformers, and writers.""" + +import io +from collections import namedtuple + +import requests +from rdflib import OWL, RDF, Graph, Namespace + +from invenio_vocabularies.datastreams.readers import BaseReader +from invenio_vocabularies.datastreams.transformers import BaseTransformer + + +class EuroSciVocSubjectsHTTPReader(BaseReader): + """Reader class to fetch and process EuroSciVoc RDF data.""" + + def __init__(self, origin=None, mode="r", since=None, *args, **kwargs): + """Initialize the reader with the data source. + + :param origin: The URL from which to fetch the RDF data. + :param mode: Mode of operation (default is 'r' for reading). + """ + self.origin = ( + origin + or "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf" + ) + super().__init__(origin=origin, mode=mode, *args, **kwargs) + + def _iter(self, rdf_graph): + """Iterate over the RDF graph, yielding one subject at a time. + + :param rdf_graph: The RDF graph to process. + :yield: Subject and graph to be transformed. + """ + SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") + + for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)): + yield {"subject": subject, "rdf_graph": rdf_graph} + + def read(self, item=None, *args, **kwargs): + """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time. + + :param item: The RDF data provided as bytes (optional). + :yield: Processed EuroSciVoc subject data. + """ + if item: + raise NotImplementedError( + "EuroSciVocSubjectsHTTPReader does not support being chained after another reader" + ) + # Fetch the RDF data from the specified origin URL + response = requests.get(self.origin) + response.raise_for_status() + + # Treat the response content as a file-like object + rdf_data = io.BytesIO(response.content) + + # Parse the RDF data into a graph + rdf_graph = Graph() + rdf_graph.parse(rdf_data, format="xml") + + # Yield each processed subject from the RDF graph + yield from self._iter(rdf_graph) + + +class EuroSciVocSubjectsTransformer(BaseTransformer): + """Transformer class to convert EuroSciVoc RDF data to a dictionary format.""" + + def _transform_entry(self, subject, rdf_graph): + """Transform an entry to the required dictionary format.""" + SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") + Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"]) + # Initialize entry fields + languages = {} + pref_label = None + + for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)): + languages[label.language] = label.value + if label.language == "en": + pref_label = label.value + + # Fallback to alternative labels if no preferred label in English + if not pref_label: + for _, _, label in rdf_graph.triples((subject, SKOS_CORE.altLabel, None)): + if label.language not in languages: + languages[label.language] = label.value + if label.language == "en": + pref_label = label.value + break + + title = languages + entry = Entry(str(subject), "EuroSciVoc", pref_label, title, {}) + return entry + + def _as_dict(self, entry): + """Convert an entry to a dictionary.""" + return { + "id": entry.id, + "scheme": entry.scheme, + "subject": entry.subject, + "title": entry.title, + } + + def apply(self, stream_entry, *args, **kwargs): + """Transform a stream entry to the required dictionary format. + + :param stream_entry: The entry to be transformed, which includes the subject and the RDF graph. + :return: The transformed stream entry. + """ + # Apply transformations + entry_data = self._transform_entry( + stream_entry.entry["subject"], stream_entry.entry["rdf_graph"] + ) + entry_data = self._as_dict(entry_data) + stream_entry.entry = entry_data # Update the stream entry with transformed data + return stream_entry + + +# Configuration for datastream readers, transformers, and writers +VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader} + +VOCABULARIES_DATASTREAM_WRITERS = {} + +VOCABULARIES_DATASTREAM_TRANSFORMERS = { + "euroscivoc-transformer": EuroSciVocSubjectsTransformer +} + +DATASTREAM_CONFIG = { + "readers": [ + { + "type": "euroscivoc-reader", + } + ], + "transformers": [{"type": "euroscivoc-transformer"}], + "writers": [ + { + "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}}, + "type": "async", + } + ], +} diff --git a/invenio_vocabularies/contrib/subjects/mesh/__init__.py b/invenio_vocabularies/contrib/subjects/mesh/__init__.py new file mode 100644 index 00000000..9b6ae5b8 --- /dev/null +++ b/invenio_vocabularies/contrib/subjects/mesh/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 CERN. +# +# Invenio-Vocabularies is free software; you can redistribute it and/or +# modify it under the terms of the MIT License; see LICENSE file for more +# details. + +"""MeSH Subjects module.""" diff --git a/setup.cfg b/setup.cfg index 9d819d0a..1db3fa6a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ install_requires = lxml>=4.5.0 PyYAML>=5.4.1 regex>=2024.7.24 + rdflib>=7.0.0 [options.extras_require] s3fs = diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py new file mode 100644 index 00000000..b6ae384e --- /dev/null +++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py @@ -0,0 +1,101 @@ +import io +import unittest +from unittest.mock import Mock, patch + +import pytest +from rdflib import RDF, Graph, Namespace, URIRef + +from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import ( # Adjust import based on your module name + EuroSciVocSubjectsHTTPReader, + EuroSciVocSubjectsTransformer, +) +from invenio_vocabularies.datastreams.datastreams import StreamEntry +from invenio_vocabularies.datastreams.errors import ReaderError, TransformerError + +XML_DATA_PREF_LABEL = bytes( + """ + + + 2019-12-02 + + + broadcastingsatellite service + satellite radio system + Satellitenfunk + satellite radio + radio satellite + radio por satélite + radio satellitare + radio satelitarne + +""", + encoding="utf-8", +) + + +XML_DATA_ALT_LABEL = bytes( + """ + + + 2019-12-02 + + + broadcastingsatellite service + satellite radio system + +""", + encoding="utf-8", +) + + +@pytest.fixture(scope="module") +def expected_from_rdf_pref_label(): + return { + "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", + "scheme": "EuroSciVoc", + "subject": "satellite radio", + "title": { + "de": "Satellitenfunk", + "en": "satellite radio", + "fr": "radio satellite", + "es": "radio por satélite", + "it": "radio satellitare", + "pl": "radio satelitarne", + }, + } + + +@pytest.fixture(scope="module") +def expected_from_rdf_alt_label(): + return { + "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", + "scheme": "EuroSciVoc", + "subject": "broadcastingsatellite service", + "title": {"en": "broadcastingsatellite service"}, + } + + +def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label): + reader = EuroSciVocSubjectsHTTPReader() + rdf_data = io.BytesIO(XML_DATA_PREF_LABEL) + rdf_graph = Graph() + rdf_graph.parse(rdf_data, format="xml") + stream_entries = list(reader._iter(rdf_graph)) + assert len(stream_entries) > 0 + transformer = EuroSciVocSubjectsTransformer() + for entry in stream_entries: + result = transformer.apply(StreamEntry(entry)) + assert expected_from_rdf_pref_label == result.entry + + +def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label): + reader = EuroSciVocSubjectsHTTPReader() + rdf_data = io.BytesIO(XML_DATA_ALT_LABEL) + rdf_graph = Graph() + rdf_graph.parse(rdf_data, format="xml") + stream_entries = list(reader._iter(rdf_graph)) + assert len(stream_entries) > 0 + transformer = EuroSciVocSubjectsTransformer() + for entry in stream_entries: + result = transformer.apply(StreamEntry(entry)) + assert expected_from_rdf_alt_label == result.entry From c2c0c013d5a826ae467ccf5d4bb959de435c8ad6 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 11 Sep 2024 14:18:09 +0200 Subject: [PATCH 2/5] subjects: updated id and added hierarchy --- .../subjects/euroscivoc/datastreams.py | 90 ++++++++++++++----- .../jsonschemas/subjects/subject-v1.0.0.json | 8 ++ .../contrib/subjects/schema.py | 2 + 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 13114abe..7e2ff11d 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -71,32 +71,73 @@ def read(self, item=None, *args, **kwargs): class EuroSciVocSubjectsTransformer(BaseTransformer): """Transformer class to convert EuroSciVoc RDF data to a dictionary format.""" + SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") + + def _get_notation(self, subject, rdf_graph): + """Extract the numeric notation for a subject, ignoring UUID-style notations.""" + for _, _, notation in rdf_graph.triples((subject, self.SKOS_CORE.notation, None)): + notation_str = str(notation) + if notation_str.isdigit(): + return notation_str + return None + + def _get_labels(self, subject, rdf_graph): + """Extract prefLabel and altLabel languages for a subject.""" + labels = {} + for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.prefLabel, None)): + labels[label.language] = label.value + + if "en" not in labels: + for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.altLabel, None)): + if label.language not in labels: + labels[label.language] = label.value + + return labels + + def _find_parents(self, subject, rdf_graph): + """Find the parent notations of a subject.""" + parents = [] + previous_parent = None + + while True: + broader_found = False + for _, _, parent in rdf_graph.triples((subject, self.SKOS_CORE.broader, None)): + if previous_parent is not None: + parents.append(self._get_notation(previous_parent, rdf_graph)) + previous_parent = parent + subject = parent + broader_found = True + break + + if not broader_found: + if previous_parent is not None: + parents.append(self._get_notation(previous_parent, rdf_graph)) + break + + return parents def _transform_entry(self, subject, rdf_graph): """Transform an entry to the required dictionary format.""" - SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"]) - # Initialize entry fields - languages = {} - pref_label = None - - for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)): - languages[label.language] = label.value - if label.language == "en": - pref_label = label.value - - # Fallback to alternative labels if no preferred label in English - if not pref_label: - for _, _, label in rdf_graph.triples((subject, SKOS_CORE.altLabel, None)): - if label.language not in languages: - languages[label.language] = label.value - if label.language == "en": - pref_label = label.value - break - - title = languages - entry = Entry(str(subject), "EuroSciVoc", pref_label, title, {}) - return entry + + # Get subject notation with euroscivoc prefix + notation = self._get_notation(subject, rdf_graph) + id = f"euroscivoc:{notation}" if notation else None + + # Get labels for the current subject + languages = self._get_labels(subject, rdf_graph) + pref_label = languages.get("en", "") + + # Find parent notations in order from top parent to lowest + parent_notations = self._find_parents(subject, rdf_graph) + parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)] + + # Store parent notations with euroscivoc prefix in props + props = { + "parents": parents + } + + return Entry(id, "EuroSciVoc", pref_label.capitalize(), languages, props) def _as_dict(self, entry): """Convert an entry to a dictionary.""" @@ -105,6 +146,7 @@ def _as_dict(self, entry): "scheme": entry.scheme, "subject": entry.subject, "title": entry.title, + "props": entry.props, } def apply(self, stream_entry, *args, **kwargs): @@ -117,11 +159,11 @@ def apply(self, stream_entry, *args, **kwargs): entry_data = self._transform_entry( stream_entry.entry["subject"], stream_entry.entry["rdf_graph"] ) - entry_data = self._as_dict(entry_data) - stream_entry.entry = entry_data # Update the stream entry with transformed data + stream_entry.entry = self._as_dict(entry_data) return stream_entry + # Configuration for datastream readers, transformers, and writers VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader} diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json index c4c52fcf..6a50454a 100644 --- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json @@ -30,6 +30,14 @@ "description": "Human readable label in different languages.", "$ref": "local://vocabularies/definitions-v1.0.0.json#/title" }, + "props": { + "type": "object", + "patternProperties": { + "^.*$": { + "type": "array" + } + } + }, "synonyms": { "description": "Synonyms of the subject label.", "type": "array", diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py index 99691a16..836b6e67 100644 --- a/invenio_vocabularies/contrib/subjects/schema.py +++ b/invenio_vocabularies/contrib/subjects/schema.py @@ -31,6 +31,7 @@ class SubjectSchema(BaseVocabularySchema): scheme = SanitizedUnicode(required=True) subject = SanitizedUnicode(required=True) title = i18n_strings + props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) synonyms = fields.List(SanitizedUnicode()) @pre_load @@ -50,4 +51,5 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema): subject = SanitizedUnicode() scheme = SanitizedUnicode() title = i18n_strings + props = fields.Dict() synonyms = fields.List(SanitizedUnicode()) From 395c3d194ddeeaea92f9869abfeb48e6d2d57ccb Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Thu, 12 Sep 2024 12:49:50 +0200 Subject: [PATCH 3/5] subjects: added identifier, updated mappings --- invenio_vocabularies/config.py | 1 + .../subjects/euroscivoc/datastreams.py | 32 ++++++++++++------- .../jsonschemas/subjects/subject-v1.0.0.json | 8 +++++ .../os-v1/subjects/subject-v1.0.0.json | 14 ++++++++ .../os-v2/subjects/subject-v1.0.0.json | 14 ++++++++ .../mappings/v7/subjects/subject-v1.0.0.json | 14 ++++++++ .../contrib/subjects/schema.py | 21 +++++++++--- 7 files changed, 89 insertions(+), 15 deletions(-) diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py index ef121034..f895b9cb 100644 --- a/invenio_vocabularies/config.py +++ b/invenio_vocabularies/config.py @@ -107,6 +107,7 @@ VOCABULARIES_SUBJECTS_SCHEMES = { "gnd": {"label": _("GND"), "validator": idutils.is_gnd, "datacite": "GND"}, + "url": {"label": _("URL"), "validator": idutils.is_url}, } """Subjects allowed identifier schemes.""" diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 7e2ff11d..11fee339 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -71,11 +71,14 @@ def read(self, item=None, *args, **kwargs): class EuroSciVocSubjectsTransformer(BaseTransformer): """Transformer class to convert EuroSciVoc RDF data to a dictionary format.""" + SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") def _get_notation(self, subject, rdf_graph): """Extract the numeric notation for a subject, ignoring UUID-style notations.""" - for _, _, notation in rdf_graph.triples((subject, self.SKOS_CORE.notation, None)): + for _, _, notation in rdf_graph.triples( + (subject, self.SKOS_CORE.notation, None) + ): notation_str = str(notation) if notation_str.isdigit(): return notation_str @@ -88,7 +91,9 @@ def _get_labels(self, subject, rdf_graph): labels[label.language] = label.value if "en" not in labels: - for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.altLabel, None)): + for _, _, label in rdf_graph.triples( + (subject, self.SKOS_CORE.altLabel, None) + ): if label.language not in labels: labels[label.language] = label.value @@ -101,7 +106,9 @@ def _find_parents(self, subject, rdf_graph): while True: broader_found = False - for _, _, parent in rdf_graph.triples((subject, self.SKOS_CORE.broader, None)): + for _, _, parent in rdf_graph.triples( + (subject, self.SKOS_CORE.broader, None) + ): if previous_parent is not None: parents.append(self._get_notation(previous_parent, rdf_graph)) previous_parent = parent @@ -118,7 +125,9 @@ def _find_parents(self, subject, rdf_graph): def _transform_entry(self, subject, rdf_graph): """Transform an entry to the required dictionary format.""" - Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"]) + Entry = namedtuple( + "Entry", ["id", "scheme", "subject", "title", "identifiers", "props"] + ) # Get subject notation with euroscivoc prefix notation = self._get_notation(subject, rdf_graph) @@ -133,11 +142,13 @@ def _transform_entry(self, subject, rdf_graph): parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)] # Store parent notations with euroscivoc prefix in props - props = { - "parents": parents - } + props = {"parents": parents} - return Entry(id, "EuroSciVoc", pref_label.capitalize(), languages, props) + # Create identifiers list + identifiers = [{"scheme": "url", "identifier": str(subject)}] + return Entry( + id, "EuroSciVoc", pref_label.capitalize(), languages, identifiers, props + ) def _as_dict(self, entry): """Convert an entry to a dictionary.""" @@ -147,6 +158,7 @@ def _as_dict(self, entry): "subject": entry.subject, "title": entry.title, "props": entry.props, + "identifiers": entry.identifiers, } def apply(self, stream_entry, *args, **kwargs): @@ -163,7 +175,6 @@ def apply(self, stream_entry, *args, **kwargs): return stream_entry - # Configuration for datastream readers, transformers, and writers VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader} @@ -182,8 +193,7 @@ def apply(self, stream_entry, *args, **kwargs): "transformers": [{"type": "euroscivoc-transformer"}], "writers": [ { - "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}}, - "type": "async", + "type": "subjects-service", } ], } diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json index 6a50454a..7b1c7451 100644 --- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json @@ -38,6 +38,14 @@ } } }, + "identifiers": { + "description": "Alternate identifiers for the subject.", + "type": "array", + "items": { + "$ref": "local://definitions-v1.0.0.json#/identifiers_with_scheme" + }, + "uniqueItems": true + }, "synonyms": { "description": "Synonyms of the subject label.", "type": "array", diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json index 349277df..d845b8b4 100644 --- a/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/mappings/os-v1/subjects/subject-v1.0.0.json @@ -71,6 +71,20 @@ "type": "object", "dynamic": "true" }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } + }, "synonyms": { "type": "text" }, diff --git a/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json index 349277df..d29ba8fd 100644 --- a/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/mappings/os-v2/subjects/subject-v1.0.0.json @@ -74,6 +74,20 @@ "synonyms": { "type": "text" }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } + }, "tags": { "type": "keyword" } diff --git a/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json index 349277df..d845b8b4 100644 --- a/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/mappings/v7/subjects/subject-v1.0.0.json @@ -71,6 +71,20 @@ "type": "object", "dynamic": "true" }, + "props": { + "type": "object", + "dynamic": "true" + }, + "identifiers": { + "properties": { + "identifier": { + "type": "keyword" + }, + "scheme": { + "type": "keyword" + } + } + }, "synonyms": { "type": "text" }, diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py index 836b6e67..397a8ccc 100644 --- a/invenio_vocabularies/contrib/subjects/schema.py +++ b/invenio_vocabularies/contrib/subjects/schema.py @@ -10,15 +10,19 @@ """Subjects schema.""" +from functools import partial + from invenio_i18n import get_locale -from marshmallow import fields, pre_load -from marshmallow_utils.fields import SanitizedUnicode +from marshmallow import Schema, fields, pre_load +from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode +from marshmallow_utils.schemas import IdentifierSchema from ...services.schema import ( BaseVocabularySchema, ContribVocabularyRelationSchema, i18n_strings, ) +from .config import subject_schemes class SubjectSchema(BaseVocabularySchema): @@ -31,7 +35,16 @@ class SubjectSchema(BaseVocabularySchema): scheme = SanitizedUnicode(required=True) subject = SanitizedUnicode(required=True) title = i18n_strings - props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) + props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) + identifiers = IdentifierSet( + fields.Nested( + partial( + IdentifierSchema, + allowed_schemes=subject_schemes, + identifier_required=False, + ) + ) + ) synonyms = fields.List(SanitizedUnicode()) @pre_load @@ -51,5 +64,5 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema): subject = SanitizedUnicode() scheme = SanitizedUnicode() title = i18n_strings - props = fields.Dict() + props = fields.Dict() synonyms = fields.List(SanitizedUnicode()) From 3e2a5a9cb3151ed184453fdde1cc195dedec695a Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Thu, 12 Sep 2024 15:21:18 +0200 Subject: [PATCH 4/5] subjects: updated tests for euroscivoc --- .../subjects/euroscivoc/datastreams.py | 11 +- .../test_subjects_euroscivoc_datastream.py | 165 +++++++++++++----- 2 files changed, 135 insertions(+), 41 deletions(-) diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 11fee339..093ed7df 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -139,10 +139,17 @@ def _transform_entry(self, subject, rdf_graph): # Find parent notations in order from top parent to lowest parent_notations = self._find_parents(subject, rdf_graph) - parents = [f"euroscivoc:{notation}" for notation in reversed(parent_notations)] + + parents = [ + f"euroscivoc:{notation}" + for notation in reversed(parent_notations) + if notation is not None + ] # Store parent notations with euroscivoc prefix in props - props = {"parents": parents} + props = {} + if parents: + props["parents"] = parents # Create identifiers list identifiers = [{"scheme": "url", "identifier": str(subject)}] diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py index b6ae384e..acd09c89 100644 --- a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py +++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py @@ -14,20 +14,58 @@ XML_DATA_PREF_LABEL = bytes( """ - + - 2019-12-02 - - broadcastingsatellite service - satellite radio system - Satellitenfunk - satellite radio - radio satellite - radio por satélite - radio satellitare - radio satelitarne + 2019-12-02 + 2019-12-02 + 1.1.0 + + radio satellitare + radio satelitarne + radio satellite + radio por satélite + Satellitenfunk + satellite radio + 2019-12-02 + 87ff3577-527a-4a40-9c76-2f9d3075e2ba + 1717 + + + + broadcastingsatellite service + satellite radio system + + 87ff3577-527a-4a40-9c76-2f9d3075e2ba + false + + + + + + + + + + + + 2019-12-02 + 2023-03-10 + 1.3.4 + + 2019-12-02 + 1225 + d913bd42-e79c-46a7-8714-14f2a6a0d82f + + + + radio channel + wireless + + d913bd42-e79c-46a7-8714-14f2a6a0d82f + false + """, encoding="utf-8", ) @@ -35,47 +73,92 @@ XML_DATA_ALT_LABEL = bytes( """ - - - 2019-12-02 - + + - broadcastingsatellite service - satellite radio system - + 2019-12-02 + 2019-12-02 + 1.1.0 + + 2019-12-02 + 87ff3577-527a-4a40-9c76-2f9d3075e2ba + 1717 + + + + broadcastingsatellite service + satellite radio system + + 87ff3577-527a-4a40-9c76-2f9d3075e2ba + false + + + + """, encoding="utf-8", ) @pytest.fixture(scope="module") -def expected_from_rdf_pref_label(): - return { - "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", - "scheme": "EuroSciVoc", - "subject": "satellite radio", - "title": { - "de": "Satellitenfunk", - "en": "satellite radio", - "fr": "radio satellite", - "es": "radio por satélite", - "it": "radio satellitare", - "pl": "radio satelitarne", +def expected_from_rdf_pref_label_with_parent(): + return [ + { + "id": "euroscivoc:1717", + "scheme": "EuroSciVoc", + "subject": "Satellite radio", + "title": { + "it": "radio satellitare", + "pl": "radio satelitarne", + "fr": "radio satellite", + "es": "radio por satélite", + "de": "Satellitenfunk", + "en": "satellite radio", + }, + "props": {"parents": ["euroscivoc:1225"]}, + "identifiers": [ + { + "scheme": "url", + "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", + } + ], }, - } + { + "id": "euroscivoc:1225", + "scheme": "EuroSciVoc", + "subject": "Radio channel", + "title": {"en": "radio channel"}, + "props": {}, + "identifiers": [ + { + "scheme": "url", + "identifier": "http://data.europa.eu/8mn/euroscivoc/d913bd42-e79c-46a7-8714-14f2a6a0d82f", + } + ], + }, + ] @pytest.fixture(scope="module") -def expected_from_rdf_alt_label(): +def expected_from_rdf_alt_label_without_parent(): return { - "id": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", + "id": "euroscivoc:1717", "scheme": "EuroSciVoc", - "subject": "broadcastingsatellite service", + "subject": "Broadcastingsatellite service", "title": {"en": "broadcastingsatellite service"}, + "props": {}, + "identifiers": [ + { + "scheme": "url", + "identifier": "http://data.europa.eu/8mn/euroscivoc/87ff3577-527a-4a40-9c76-2f9d3075e2ba", + } + ], } -def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label): +def test_euroscivoc_subjects_transformer_pref_label( + expected_from_rdf_pref_label_with_parent, +): reader = EuroSciVocSubjectsHTTPReader() rdf_data = io.BytesIO(XML_DATA_PREF_LABEL) rdf_graph = Graph() @@ -83,12 +166,16 @@ def test_euroscivoc_subjects_transformer_pref_label(expected_from_rdf_pref_label stream_entries = list(reader._iter(rdf_graph)) assert len(stream_entries) > 0 transformer = EuroSciVocSubjectsTransformer() + result = [] for entry in stream_entries: - result = transformer.apply(StreamEntry(entry)) - assert expected_from_rdf_pref_label == result.entry + entry = transformer.apply(StreamEntry(entry)).entry + result.append(entry) + assert expected_from_rdf_pref_label_with_parent == result -def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label): +def test_euroscivoc_subjects_transformer_alt_label( + expected_from_rdf_alt_label_without_parent, +): reader = EuroSciVocSubjectsHTTPReader() rdf_data = io.BytesIO(XML_DATA_ALT_LABEL) rdf_graph = Graph() @@ -98,4 +185,4 @@ def test_euroscivoc_subjects_transformer_alt_label(expected_from_rdf_alt_label): transformer = EuroSciVocSubjectsTransformer() for entry in stream_entries: result = transformer.apply(StreamEntry(entry)) - assert expected_from_rdf_alt_label == result.entry + assert expected_from_rdf_alt_label_without_parent == result.entry From 8b699559f3af8c72aff919fb5cc9fe8ef3a998f3 Mon Sep 17 00:00:00 2001 From: Fatimah Zulfiqar Date: Wed, 25 Sep 2024 16:33:25 +0200 Subject: [PATCH 5/5] subjects: refactor and updates --- invenio_vocabularies/config.py | 3 + .../subjects/euroscivoc/datastreams.py | 99 ++++++------------- .../jsonschemas/subjects/subject-v1.0.0.json | 4 +- .../contrib/subjects/schema.py | 19 +++- .../test_subjects_euroscivoc_datastream.py | 4 +- 5 files changed, 53 insertions(+), 76 deletions(-) diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py index f895b9cb..f92880c6 100644 --- a/invenio_vocabularies/config.py +++ b/invenio_vocabularies/config.py @@ -163,6 +163,9 @@ } """Vocabulary type search configuration.""" +SUBJECTS_EUROSCIVOC_FILE_URL = "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf" +"""Subject EuroSciVoc file download link.""" + VOCABULARIES_ORCID_ACCESS_KEY = "TODO" """ORCID access key to access the s3 bucket.""" VOCABULARIES_ORCID_SECRET_KEY = "TODO" diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py index 093ed7df..64aa59e6 100644 --- a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py +++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py @@ -14,6 +14,7 @@ import requests from rdflib import OWL, RDF, Graph, Namespace +from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL from invenio_vocabularies.datastreams.readers import BaseReader from invenio_vocabularies.datastreams.transformers import BaseTransformer @@ -27,10 +28,7 @@ def __init__(self, origin=None, mode="r", since=None, *args, **kwargs): :param origin: The URL from which to fetch the RDF data. :param mode: Mode of operation (default is 'r' for reading). """ - self.origin = ( - origin - or "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf" - ) + self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL super().__init__(origin=origin, mode=mode, *args, **kwargs) def _iter(self, rdf_graph): @@ -73,99 +71,66 @@ class EuroSciVocSubjectsTransformer(BaseTransformer): """Transformer class to convert EuroSciVoc RDF data to a dictionary format.""" SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#") + SPLITCHAR = "," def _get_notation(self, subject, rdf_graph): - """Extract the numeric notation for a subject, ignoring UUID-style notations.""" + """Extract the numeric notation for a subject.""" for _, _, notation in rdf_graph.triples( (subject, self.SKOS_CORE.notation, None) ): - notation_str = str(notation) - if notation_str.isdigit(): - return notation_str + if str(notation).isdigit(): + return str(notation) return None def _get_labels(self, subject, rdf_graph): """Extract prefLabel and altLabel languages for a subject.""" - labels = {} - for _, _, label in rdf_graph.triples((subject, self.SKOS_CORE.prefLabel, None)): - labels[label.language] = label.value - + labels = { + label.language: label.value + for _, _, label in rdf_graph.triples( + (subject, self.SKOS_CORE.prefLabel, None) + ) + } if "en" not in labels: for _, _, label in rdf_graph.triples( (subject, self.SKOS_CORE.altLabel, None) ): - if label.language not in labels: - labels[label.language] = label.value - + labels.setdefault(label.language, label.value) return labels def _find_parents(self, subject, rdf_graph): - """Find the parent notations of a subject.""" + """Find parent notations.""" parents = [] - previous_parent = None - while True: - broader_found = False - for _, _, parent in rdf_graph.triples( - (subject, self.SKOS_CORE.broader, None) - ): - if previous_parent is not None: - parents.append(self._get_notation(previous_parent, rdf_graph)) - previous_parent = parent - subject = parent - broader_found = True - break - - if not broader_found: - if previous_parent is not None: - parents.append(self._get_notation(previous_parent, rdf_graph)) - break + # Traverse the broader hierarchy + for broader in rdf_graph.transitive_objects(subject, self.SKOS_CORE.broader): + if broader != subject: # Ensure we don't include the current subject + parent_notation = self._get_notation(broader, rdf_graph) + if parent_notation: + parents.append(parent_notation) return parents def _transform_entry(self, subject, rdf_graph): """Transform an entry to the required dictionary format.""" - Entry = namedtuple( - "Entry", ["id", "scheme", "subject", "title", "identifiers", "props"] - ) - # Get subject notation with euroscivoc prefix notation = self._get_notation(subject, rdf_graph) id = f"euroscivoc:{notation}" if notation else None - # Get labels for the current subject - languages = self._get_labels(subject, rdf_graph) - pref_label = languages.get("en", "") - - # Find parent notations in order from top parent to lowest - parent_notations = self._find_parents(subject, rdf_graph) - - parents = [ - f"euroscivoc:{notation}" - for notation in reversed(parent_notations) - if notation is not None - ] - - # Store parent notations with euroscivoc prefix in props - props = {} - if parents: - props["parents"] = parents - + labels = self._get_labels(subject, rdf_graph) + # Join parent notations with SPLITCHAR separator and add euroscivoc prefix + parents = self.SPLITCHAR.join( + f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph)) + ) # Create identifiers list identifiers = [{"scheme": "url", "identifier": str(subject)}] - return Entry( - id, "EuroSciVoc", pref_label.capitalize(), languages, identifiers, props - ) - def _as_dict(self, entry): - """Convert an entry to a dictionary.""" return { - "id": entry.id, - "scheme": entry.scheme, - "subject": entry.subject, - "title": entry.title, - "props": entry.props, - "identifiers": entry.identifiers, + "id": id, + "scheme": "EuroSciVoc", + "subject": labels.get("en", "").capitalize(), + "title": labels, + "props": {"parents": parents} if parents else {}, + "identifiers": identifiers, } def apply(self, stream_entry, *args, **kwargs): @@ -178,7 +143,7 @@ def apply(self, stream_entry, *args, **kwargs): entry_data = self._transform_entry( stream_entry.entry["subject"], stream_entry.entry["rdf_graph"] ) - stream_entry.entry = self._as_dict(entry_data) + stream_entry.entry = entry_data return stream_entry diff --git a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json index 7b1c7451..1d0498b8 100644 --- a/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json +++ b/invenio_vocabularies/contrib/subjects/jsonschemas/subjects/subject-v1.0.0.json @@ -34,7 +34,7 @@ "type": "object", "patternProperties": { "^.*$": { - "type": "array" + "type": "string" } } }, @@ -42,7 +42,7 @@ "description": "Alternate identifiers for the subject.", "type": "array", "items": { - "$ref": "local://definitions-v1.0.0.json#/identifiers_with_scheme" + "$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme" }, "uniqueItems": true }, diff --git a/invenio_vocabularies/contrib/subjects/schema.py b/invenio_vocabularies/contrib/subjects/schema.py index 397a8ccc..1eb6830c 100644 --- a/invenio_vocabularies/contrib/subjects/schema.py +++ b/invenio_vocabularies/contrib/subjects/schema.py @@ -35,7 +35,7 @@ class SubjectSchema(BaseVocabularySchema): scheme = SanitizedUnicode(required=True) subject = SanitizedUnicode(required=True) title = i18n_strings - props = fields.Dict(keys=SanitizedUnicode(), values=fields.List(SanitizedUnicode())) + props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode()) identifiers = IdentifierSet( fields.Nested( partial( @@ -62,7 +62,16 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema): ftf_name = "subject" parent_field_name = "subjects" subject = SanitizedUnicode() - scheme = SanitizedUnicode() - title = i18n_strings - props = fields.Dict() - synonyms = fields.List(SanitizedUnicode()) + scheme = SanitizedUnicode(dump_only=True) + title = fields.Dict(dump_only=True) + props = fields.Dict(dump_only=True) + identifiers = IdentifierSet( + fields.Nested( + partial( + IdentifierSchema, + allowed_schemes=subject_schemes, + identifier_required=False, + ) + ) + ) + synonyms = fields.List(SanitizedUnicode(), dump_only=True) diff --git a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py index acd09c89..c8f62373 100644 --- a/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py +++ b/tests/contrib/subjects/euroscivoc/test_subjects_euroscivoc_datastream.py @@ -5,7 +5,7 @@ import pytest from rdflib import RDF, Graph, Namespace, URIRef -from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import ( # Adjust import based on your module name +from invenio_vocabularies.contrib.subjects.euroscivoc.datastreams import ( EuroSciVocSubjectsHTTPReader, EuroSciVocSubjectsTransformer, ) @@ -115,7 +115,7 @@ def expected_from_rdf_pref_label_with_parent(): "de": "Satellitenfunk", "en": "satellite radio", }, - "props": {"parents": ["euroscivoc:1225"]}, + "props": {"parents": "euroscivoc:1225"}, "identifiers": [ { "scheme": "url",