Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

subjects: added euroscivoc datastream #386

Merged
merged 5 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions invenio_vocabularies/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@

# Identifier schemes accepted for subjects, keyed by scheme id. Every entry
# carries a translatable "label" and a "validator" taken from idutils; the
# "datacite" key is optional (only "gnd" sets it here, "url" does not).
VOCABULARIES_SUBJECTS_SCHEMES = {
"gnd": {"label": _("GND"), "validator": idutils.is_gnd, "datacite": "GND"},
"url": {"label": _("URL"), "validator": idutils.is_url},
}
"""Subjects allowed identifier schemes."""

Expand Down Expand Up @@ -162,6 +163,9 @@
}
"""Vocabulary type search configuration."""

# The URL is split at its logical parts (handler, percent-encoded cellarURI,
# file name) purely for readability; adjacent literals are concatenated at
# compile time. The cellarURI pins the 20231115-0 EuroSciVoc distribution.
SUBJECTS_EUROSCIVOC_FILE_URL = (
    "https://op.europa.eu/o/opportal-service/euvoc-download-handler"
    "?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution"
    "%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf"
    "&fileName=EuroSciVoc-skos-ap-eu.rdf"
)
0einstein0 marked this conversation as resolved.
Show resolved Hide resolved
"""Subject EuroSciVoc file download link."""

VOCABULARIES_ORCID_ACCESS_KEY = "TODO"
"""ORCID access key to access the s3 bucket."""
VOCABULARIES_ORCID_SECRET_KEY = "TODO"
Expand Down
18 changes: 12 additions & 6 deletions invenio_vocabularies/contrib/subjects/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
from .euroscivoc import datastreams as euroscivoc_datastreams
from .mesh import datastreams as mesh_datastreams


class SubjectsServiceWriter(ServiceWriter):
Expand All @@ -30,15 +29,22 @@ def _entry_id(self, entry):
return entry["id"]


# NOTE(review): this diff hunk shows both sides of the change. The one-line
# assignment is the previous form (MeSH readers only); the multi-line
# assignment after it rebinds the same name and is the one that takes effect.
VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
# Readers contributed by the MeSH and EuroSciVoc datastream modules, merged
# into one mapping. On a key collision the later unpacking (EuroSciVoc) wins.
VOCABULARIES_DATASTREAM_READERS = {
**mesh_datastreams.VOCABULARIES_DATASTREAM_READERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_READERS,
}
"""Subjects Data Streams readers."""

# NOTE(review): this diff hunk shows both sides of the change. The one-line
# assignment is the previous form (MeSH transformers only); the multi-line
# assignment after it rebinds the same name and is the one that takes effect.
VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
# Transformers contributed by the MeSH and EuroSciVoc datastream modules,
# merged into one mapping. On a key collision the later unpacking
# (EuroSciVoc) wins.
VOCABULARIES_DATASTREAM_TRANSFORMERS = {
**mesh_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_TRANSFORMERS,
}
"""Subjects Data Streams transformers."""

# Writers available to the subjects datastreams: the local "subjects-service"
# writer plus whatever the MeSH and EuroSciVoc datastream modules contribute.
# Entries unpacked later override earlier ones that share a key.
# NOTE(review): this hunk lists both `**mesh_writers` (pre-change) and
# `**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS` (post-change) — confirm
# against the merged file which of the two remains.
VOCABULARIES_DATASTREAM_WRITERS = {
"subjects-service": SubjectsServiceWriter,
**mesh_writers,
**mesh_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
**euroscivoc_datastreams.VOCABULARIES_DATASTREAM_WRITERS,
}
"""Subjects Data Streams writers."""

Expand Down
9 changes: 9 additions & 0 deletions invenio_vocabularies/contrib/subjects/euroscivoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""EuroSciVoc Subjects module."""
171 changes: 171 additions & 0 deletions invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""

import io
from collections import namedtuple

import requests
from rdflib import OWL, RDF, Graph, Namespace

from invenio_vocabularies.config import SUBJECTS_EUROSCIVOC_FILE_URL
from invenio_vocabularies.datastreams.readers import BaseReader
from invenio_vocabularies.datastreams.transformers import BaseTransformer


class EuroSciVocSubjectsHTTPReader(BaseReader):
    """Reader that downloads the EuroSciVoc RDF file and yields its concepts.

    The file is fetched over HTTP with ``requests``, parsed as RDF/XML with
    ``rdflib`` and every resource typed ``skos:Concept`` is emitted as one
    stream item.
    """

    # Seconds passed to ``requests`` as ``timeout``. It bounds the connection
    # attempt and each wait for data from the server (not the total download
    # time), so a stalled endpoint cannot block the import forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
        """Initialize the reader with the data source.

        :param origin: The URL from which to fetch the RDF data. Falls back to
            ``SUBJECTS_EUROSCIVOC_FILE_URL`` when empty.
        :param mode: Mode of operation (default is 'r' for reading); forwarded
            to the base reader.
        :param since: Accepted but currently unused: it is neither stored here
            nor forwarded to the base reader.
        """
        self.origin = origin or SUBJECTS_EUROSCIVOC_FILE_URL
        super().__init__(origin=origin, mode=mode, *args, **kwargs)

    def _iter(self, rdf_graph):
        """Iterate over the RDF graph, yielding one subject at a time.

        :param rdf_graph: The parsed RDF graph to process.
        :yield: ``{"subject": <concept node>, "rdf_graph": rdf_graph}``; the
            same graph object is attached to every item.
        """
        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")

        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
            yield {"subject": subject, "rdf_graph": rdf_graph}

    def read(self, item=None, *args, **kwargs):
        """Fetch and parse the EuroSciVoc RDF data, yielding one subject at a time.

        :param item: Must be empty; this reader cannot consume the output of a
            previous reader.
        :yield: One dict per ``skos:Concept`` (see ``_iter``).
        :raises NotImplementedError: If ``item`` is truthy. This method is a
            generator, so the error surfaces when iteration starts.
        :raises requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        """
        if item:
            raise NotImplementedError(
                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
            )

        # Fetch the RDF data; the timeout guards against a hanging server.
        response = requests.get(self.origin, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        # rdflib parses from a file-like object, so wrap the downloaded bytes.
        rdf_graph = Graph()
        rdf_graph.parse(io.BytesIO(response.content), format="xml")

        yield from self._iter(rdf_graph)


class EuroSciVocSubjectsTransformer(BaseTransformer):
    """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""

    SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
    # Separator used when joining the parent ids into a single string.
    SPLITCHAR = ","

    def _get_notation(self, subject, rdf_graph):
        """Return the first all-digit ``skos:notation`` of a subject.

        :param subject: The concept node to look up.
        :param rdf_graph: The RDF graph holding the concept.
        :return: The notation as a string, or ``None`` if no notation
            consisting only of digits exists.
        """
        for _, _, notation in rdf_graph.triples(
            (subject, self.SKOS_CORE.notation, None)
        ):
            notation_str = str(notation)
            if notation_str.isdigit():
                return notation_str
        return None

    def _get_labels(self, subject, rdf_graph):
        """Build a ``{language: label}`` mapping for a subject.

        ``skos:prefLabel`` values are used first. Only when no English
        prefLabel exists are ``skos:altLabel`` values consulted, and they fill
        in languages that are still missing without overwriting a prefLabel.
        """
        labels = {
            label.language: label.value
            for _, _, label in rdf_graph.triples(
                (subject, self.SKOS_CORE.prefLabel, None)
            )
        }
        if "en" not in labels:
            for _, _, label in rdf_graph.triples(
                (subject, self.SKOS_CORE.altLabel, None)
            ):
                # setdefault keeps any prefLabel already stored for a language.
                labels.setdefault(label.language, label.value)
        return labels

    def _find_parents(self, subject, rdf_graph):
        """Return the notations of all ancestors reachable via ``skos:broader``.

        Ancestors without a numeric notation are left out. The order is the
        one produced by ``rdf_graph.transitive_objects``.
        """
        ancestor_notations = (
            self._get_notation(broader, rdf_graph)
            for broader in rdf_graph.transitive_objects(
                subject, self.SKOS_CORE.broader
            )
            # The traversal can yield the starting node; it is not a parent.
            if broader != subject
        )
        return [notation for notation in ancestor_notations if notation]

    def _transform_entry(self, subject, rdf_graph):
        """Transform an entry to the required dictionary format.

        :param subject: The concept node to transform.
        :param rdf_graph: The RDF graph holding the concept.
        :return: A dict with ``id``, ``scheme``, ``subject``, ``title``,
            ``props`` and ``identifiers``. ``id`` is ``None`` when the concept
            has no numeric notation; ``props`` is empty when it has no parents.
        """
        # Subject id is the notation with the euroscivoc prefix.
        notation = self._get_notation(subject, rdf_graph)
        subject_id = f"euroscivoc:{notation}" if notation else None

        labels = self._get_labels(subject, rdf_graph)

        # Parent ids joined into one string, in reverse traversal order
        # (presumably root-most ancestor first — confirm against rdflib's
        # transitive_objects ordering).
        parents = self.SPLITCHAR.join(
            f"euroscivoc:{n}" for n in reversed(self._find_parents(subject, rdf_graph))
        )

        # Upper-case only the first character. str.capitalize() would also
        # lower-case the rest and mangle labels such as acronyms.
        english_label = labels.get("en", "")
        display_label = english_label[:1].upper() + english_label[1:]

        return {
            "id": subject_id,
            "scheme": "EuroSciVoc",
            "subject": display_label,
            "title": labels,
            "props": {"parents": parents} if parents else {},
            "identifiers": [{"scheme": "url", "identifier": str(subject)}],
        }

    def apply(self, stream_entry, *args, **kwargs):
        """Transform a stream entry to the required dictionary format.

        :param stream_entry: The entry to be transformed; its ``entry`` must
            hold the ``"subject"`` and ``"rdf_graph"`` keys.
        :return: The same stream entry with ``entry`` replaced by the
            transformed dictionary.
        """
        raw = stream_entry.entry
        stream_entry.entry = self._transform_entry(raw["subject"], raw["rdf_graph"])
        return stream_entry


# EuroSciVoc contributions to the datastream registries, keyed by the type
# name used in datastream configurations.
VOCABULARIES_DATASTREAM_READERS = {
    "euroscivoc-reader": EuroSciVocSubjectsHTTPReader,
}

# No EuroSciVoc-specific writer is registered.
VOCABULARIES_DATASTREAM_WRITERS = {}

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "euroscivoc-transformer": EuroSciVocSubjectsTransformer,
}

# Pipeline definition for EuroSciVoc: one reader, one transformer and one
# writer, each referenced by its registered type name.
DATASTREAM_CONFIG = {
    "readers": [{"type": "euroscivoc-reader"}],
    "transformers": [{"type": "euroscivoc-transformer"}],
    "writers": [{"type": "subjects-service"}],
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,22 @@
"description": "Human readable label in different languages.",
"$ref": "local://vocabularies/definitions-v1.0.0.json#/title"
},
"props": {
"type": "object",
"patternProperties": {
"^.*$": {
"type": "string"
}
}
},
"identifiers": {
"description": "Alternate identifiers for the subject.",
"type": "array",
"items": {
"$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
},
"uniqueItems": true
},
"synonyms": {
"description": "Synonyms of the subject label.",
"type": "array",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@
"type": "object",
"dynamic": "true"
},
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
},
"synonyms": {
"type": "text"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,20 @@
"synonyms": {
"type": "text"
},
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
},
"tags": {
"type": "keyword"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,20 @@
"type": "object",
"dynamic": "true"
},
"props": {
"type": "object",
"dynamic": "true"
},
"identifiers": {
"properties": {
"identifier": {
"type": "keyword"
},
"scheme": {
"type": "keyword"
}
}
},
"synonyms": {
"type": "text"
},
Expand Down
9 changes: 9 additions & 0 deletions invenio_vocabularies/contrib/subjects/mesh/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""MeSH Subjects module."""
34 changes: 29 additions & 5 deletions invenio_vocabularies/contrib/subjects/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,19 @@

"""Subjects schema."""

from functools import partial

from invenio_i18n import get_locale
from marshmallow import fields, pre_load
from marshmallow_utils.fields import SanitizedUnicode
from marshmallow import Schema, fields, pre_load
from marshmallow_utils.fields import IdentifierSet, SanitizedUnicode
from marshmallow_utils.schemas import IdentifierSchema

from ...services.schema import (
BaseVocabularySchema,
ContribVocabularyRelationSchema,
i18n_strings,
)
from .config import subject_schemes


class SubjectSchema(BaseVocabularySchema):
Expand All @@ -31,6 +35,16 @@ class SubjectSchema(BaseVocabularySchema):
scheme = SanitizedUnicode(required=True)
subject = SanitizedUnicode(required=True)
title = i18n_strings
props = fields.Dict(keys=SanitizedUnicode(), values=SanitizedUnicode())
0einstein0 marked this conversation as resolved.
Show resolved Hide resolved
identifiers = IdentifierSet(
fields.Nested(
partial(
IdentifierSchema,
allowed_schemes=subject_schemes,
identifier_required=False,
)
)
)
synonyms = fields.List(SanitizedUnicode())

@pre_load
Expand All @@ -48,6 +62,16 @@ class SubjectRelationSchema(ContribVocabularyRelationSchema):
ftf_name = "subject"
parent_field_name = "subjects"
subject = SanitizedUnicode()
scheme = SanitizedUnicode()
title = i18n_strings
synonyms = fields.List(SanitizedUnicode())
scheme = SanitizedUnicode(dump_only=True)
title = fields.Dict(dump_only=True)
props = fields.Dict(dump_only=True)
identifiers = IdentifierSet(
fields.Nested(
partial(
IdentifierSchema,
allowed_schemes=subject_schemes,
identifier_required=False,
)
)
)
synonyms = fields.List(SanitizedUnicode(), dump_only=True)
Loading