Skip to content

Commit

Permalink
first implementation of OAIPMHReader (#329)
Browse files Browse the repository at this point in the history
* first implementation of OAIPMHReader

* introduce a simple way to map OAI records to a dict without expecting a special metadata format.

* fix installation requirements

* fix tests

* small fixes to make the tests run

* add error handling

* renamed oaipmh_scythe package

* handle remarks/questions from review.

* replaced access to a real OAI server with a mocking implementation.

* Update invenio_vocabularies/datastreams/readers.py

Co-authored-by: Pablo Tamarit <[email protected]>

* Update tests/datastreams/test_datastreams.py

Co-authored-by: Pablo Tamarit <[email protected]>

* Moved reader tests to test_readers.py

* add missing copyright in header

---------

Co-authored-by: Pablo Tamarit <[email protected]>
  • Loading branch information
wgresshoff and ptamarit authored Jun 10, 2024
1 parent 94d9453 commit 87d7553
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 0 deletions.
2 changes: 2 additions & 0 deletions invenio_vocabularies/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
GzipReader,
JsonLinesReader,
JsonReader,
OAIPMHReader,
TarReader,
XMLReader,
YamlReader,
Expand Down Expand Up @@ -111,6 +112,7 @@
"yaml": YamlReader,
"zip": ZipReader,
"xml": XMLReader,
"oai-pmh": OAIPMHReader,
}
"""Data Streams readers."""

Expand Down
83 changes: 83 additions & 0 deletions invenio_vocabularies/datastreams/readers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021-2024 CERN.
# Copyright (C) 2024 University of Münster.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -19,7 +20,11 @@

import requests
import yaml
from lxml import etree
from lxml.html import parse as html_parse
from oaipmh_scythe import Scythe
from oaipmh_scythe.exceptions import NoRecordsMatch
from oaipmh_scythe.models import Record

from .errors import ReaderError
from .xml import etree_to_dict
Expand Down Expand Up @@ -226,3 +231,81 @@ def _iter(self, fp, *args, **kwargs):
raise ReaderError(f"Record not found in XML entry.")

yield record


class OAIPMHReader(BaseReader):
    """OAI-PMH reader.

    Harvests records from an OAI-PMH endpoint through ``oaipmh_scythe`` and
    yields them one by one as ``{"record": <record>}`` entries.
    """

    def __init__(
        self,
        *args,
        base_url=None,
        metadata_prefix=None,
        set=None,
        from_date=None,
        until_date=None,
        verb=None,
        **kwargs,
    ):
        """Constructor.

        :param base_url: URL of the OAI-PMH endpoint handed to ``Scythe``.
        :param metadata_prefix: metadata format to request; falls back to
            ``"oai_dc"`` when not given.
        :param set: set specification, forwarded as ``set_``.
        :param from_date: lower date bound, forwarded as ``from_``.
        :param until_date: upper date bound, forwarded as ``until``.
        :param verb: OAI-PMH verb; falls back to ``"ListRecords"`` when not
            given. It is only stored: ``_iter`` always calls ``list_records``.
        """
        self._base_url = base_url
        # Compare the argument itself with None: the previous
        # ``x if not None else default`` tested the constant ``not None``
        # (always True), so the defaults were never applied.
        self._metadata_prefix = (
            metadata_prefix if metadata_prefix is not None else "oai_dc"
        )
        self._set = set
        self._until = until_date
        self._from = from_date
        self._verb = verb if verb is not None else "ListRecords"
        super().__init__(*args, **kwargs)

    def _iter(self, scythe, *args, **kwargs):
        """Read and parse an OAIPMH stream to dict.

        :param scythe: an open ``Scythe`` client bound to the endpoint.
        :returns: a generator of ``{"record": record}`` dictionaries.
        :raises ReaderError: when the server reports ``NoRecordsMatch``.
        """
        # Register the custom record class for ListRecords responses;
        # presumably scythe uses this mapping to build the items it yields.
        scythe.class_mapping["ListRecords"] = self.OAIRecord
        try:
            records = scythe.list_records(
                from_=self._from,
                until=self._until,
                metadata_prefix=self._metadata_prefix,
                set_=self._set,
                ignore_deleted=True,
            )
            for record in records:
                yield {"record": record}
        except NoRecordsMatch as err:
            # Chain the original exception so the cause stays visible.
            raise ReaderError("No records found in OAI-PMH request.") from err

    def read(self, item=None, *args, **kwargs):
        """Reads from item or opens the file descriptor from origin.

        :param item: must be empty; this reader cannot consume the output of
            a previous reader.
        :raises NotImplementedError: when ``item`` is provided. As this method
            is a generator, the error surfaces on the first iteration.
        """
        if item:
            raise NotImplementedError(
                "OAIPMHReader does not support being chained after another reader"
            )

        with Scythe(self._base_url) as scythe:
            # Pass scythe positionally: combining ``scythe=scythe`` with a
            # non-empty ``*args`` would bind the parameter twice (TypeError).
            yield from self._iter(scythe, *args, **kwargs)

    class OAIRecord(Record):
        """An XML unpacking implementation for more complicated formats."""

        def get_metadata(self):
            """Extract and return the record's metadata as a dictionary.

            The first child of the record's ``metadata`` element is handed to
            ``xml_to_dict``.
            """
            metadata = self.xml.find(".//" + self._oai_namespace + "metadata")
            # Index the element directly; ``getchildren()`` is deprecated in
            # lxml and ``metadata[0]`` returns the same first child.
            return xml_to_dict(metadata[0])


def xml_to_dict(tree: etree._Element):
    """Wrap the serialized form of an XML element in a dictionary.

    No metadata format is assumed: the element is serialized as a whole
    instead of being mapped field by field.

    Args:
        tree: The root element of the XML tree to be converted.

    Returns:
        A dictionary with the single key "record", whose value is the
        output of ``etree.tostring(tree)``.
    """
    return {"record": etree.tostring(tree)}
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ install_requires =
invenio-records-resources>=6.0.0,<7.0.0
lxml>=4.5.0
PyYAML>=5.4.1
oaipmh-scythe @ git+https://github.com/ulbmuenster/invenio-oaipmh-scythe.git

[options.extras_require]
tests =
pytest-black-ng>=0.4.0
invenio-app>=1.4.0,<2.0.0
invenio-db[postgresql,mysql]>=1.0.14,<2.0.0
pytest_httpserver>=1.0.10
pytest-invenio>=2.1.0,<3.0.0
Sphinx>=4.5
elasticsearch7 =
Expand Down
133 changes: 133 additions & 0 deletions tests/datastreams/test_readers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021-2024 CERN.
# Copyright (C) 2024 University of Münster.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -16,8 +17,10 @@
import pytest
import yaml

from invenio_vocabularies.datastreams.errors import ReaderError
from invenio_vocabularies.datastreams.readers import (
JsonReader,
OAIPMHReader,
TarReader,
YamlReader,
ZipReader,
Expand Down Expand Up @@ -185,4 +188,134 @@ def test_json_element_reader(json_element_file, json_element):
assert count == 1


@pytest.fixture(scope="module")
def oai_response_match():
    """Return a mocked OAI-PMH ``ListRecords`` response with one record.

    The payload holds a single MARC21 authority record and is served by the
    mock HTTP server in ``test_oaipmh_reader``.
    """
    # The backslash keeps the XML declaration at the very start of the
    # document; a declaration preceded by whitespace is not well-formed XML.
    response_data = """\
<?xml version = "1.0" encoding = "UTF-8"?>
<OAI-PMH xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd" xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<responseDate>2024-05-29T13:20:04Z</responseDate>
<request metadataPrefix="MARC21plus-1-xml" set="authorities:sachbegriff" verb="ListRecords" from="2024-01-01T09:00:00Z" until="2024-01-01T17:00:00Z">https://services.dnb.de/oai/repository</request>
<ListRecords xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
<record>
<header>
<identifier>oai:dnb.de/authorities:sachbegriff/1074025261</identifier>
<datestamp>2024-01-01T16:51:21Z</datestamp>
<setSpec>authorities:sachbegriff</setSpec>
</header>
<metadata>
<record type="Authority" xmlns="http://www.loc.gov/MARC21/slim">
<leader>00000nz a2200000nc 4500</leader>
<controlfield tag="001">1074025261</controlfield>
<controlfield tag="003">DE-101</controlfield>
<controlfield tag="005">20240101175121.0</controlfield>
<controlfield tag="008">150717n||azznnbabn | ana |c</controlfield>
<datafield tag="024" ind1="7" ind2=" ">
<subfield code="a">1074025261</subfield>
<subfield code="0">http://d-nb.info/gnd/1074025261</subfield>
<subfield code="2">gnd</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(DE-101)1074025261</subfield>
</datafield>
<datafield tag="035" ind1=" " ind2=" ">
<subfield code="a">(DE-588)1074025261</subfield>
</datafield>
<datafield tag="040" ind1=" " ind2=" ">
<subfield code="a">DE-12</subfield>
<subfield code="c">DE-12</subfield>
<subfield code="9">r:DE-384</subfield>
<subfield code="b">ger</subfield>
<subfield code="d">1210</subfield>
<subfield code="f">rswk</subfield>
</datafield>
<datafield tag="042" ind1=" " ind2=" ">
<subfield code="a">gnd1</subfield>
</datafield>
<datafield tag="065" ind1=" " ind2=" ">
<subfield code="a">31.3b</subfield>
<subfield code="2">sswd</subfield>
</datafield>
<datafield tag="075" ind1=" " ind2=" ">
<subfield code="b">s</subfield>
<subfield code="2">gndgen</subfield>
</datafield>
<datafield tag="075" ind1=" " ind2=" ">
<subfield code="b">saz</subfield>
<subfield code="2">gndspec</subfield>
</datafield>
<datafield tag="079" ind1=" " ind2=" ">
<subfield code="a">g</subfield>
<subfield code="q">s</subfield>
</datafield>
<datafield tag="150" ind1=" " ind2=" ">
<subfield code="a">Rundbogenhalle</subfield>
</datafield>
<datafield tag="450" ind1=" " ind2=" ">
<subfield code="a">Bogenhalle</subfield>
</datafield>
<datafield tag="550" ind1=" " ind2=" ">
<subfield code="0">(DE-101)040230236</subfield>
<subfield code="0">(DE-588)4023023-5</subfield>
<subfield code="0">https://d-nb.info/gnd/4023023-5</subfield>
<subfield code="a">Halle</subfield>
<subfield code="4">obge</subfield>
<subfield code="4">https://d-nb.info/standards/elementset/gnd#broaderTermGeneric</subfield>
<subfield code="w">r</subfield>
<subfield code="i">Oberbegriff generisch</subfield>
</datafield>
<datafield tag="670" ind1=" " ind2=" ">
<subfield code="a">Stahlbetonbauwerke mit großer Spannweite, eingesetzt im Industriebau, z.b.Paketposthalle München; teilweise heute unter Denkmalschutz</subfield>
</datafield>
</record>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
"""
    return response_data


def test_oaipmh_reader(app, httpserver, oai_response_match):
    """Check that the reader yields a record entry for a matching response."""
    endpoint = "/oai/repository"
    httpserver.expect_request(endpoint).respond_with_data(
        response_data=oai_response_match, mimetype="application/xml"
    )

    oai_reader = OAIPMHReader(
        base_url=httpserver.url_for(endpoint),
        metadata_prefix="MARC21plus-1-xml",
        set="authorities:sachbegriff",
        from_date="2024-01-01T09:00:00Z",
        until_date="2024-01-31T10:00:00Z",
    )

    # Only the first yielded entry is inspected.
    first_entry = next(oai_reader.read())
    assert "record" in first_entry


@pytest.fixture(scope="module")
def oai_response_no_match():
    """Return a mocked OAI-PMH response carrying a ``noRecordsMatch`` error."""
    return """
<OAI-PMH xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd" xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<responseDate>2024-05-29T13:09:44Z</responseDate>
<request metadataPrefix="MARC21plus-1-xml" set="authorities:sachbegriff" verb="ListRecords" from="2024-01-01T09:00:00Z" until="2024-01-01T10:00:00Z">https://services.dnb.de/oai/repository</request>
<error code="noRecordsMatch"/>
</OAI-PMH>
"""


def test_oaipmh_reader_no_records_match(httpserver, oai_response_no_match):
    """Check that a ``noRecordsMatch`` response surfaces as a ReaderError."""
    endpoint = "/oai/repository"
    httpserver.expect_request(endpoint).respond_with_data(
        response_data=oai_response_no_match, mimetype="application/xml"
    )

    oai_reader = OAIPMHReader(
        base_url=httpserver.url_for(endpoint),
        metadata_prefix="MARC21plus-1-xml",
        set="authorities:sachbegriff",
        from_date="2024-01-01T09:00:00Z",
        until_date="2024-01-01T10:00:00Z",
    )

    # read() is lazy: the request, and thus the error, happens on first next().
    entries = oai_reader.read()
    with pytest.raises(ReaderError):
        next(entries)


# FIXME: add test for csv reader

0 comments on commit 87d7553

Please sign in to comment.