From 68f0d5db2509fabfd260c9464df6206e5cd0298f Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Mon, 6 Dec 2021 13:24:07 +0100 Subject: [PATCH 01/19] feature: update oai set implementation --- invenio_oaiserver/config.py | 4 +- invenio_oaiserver/errors.py | 5 ++ invenio_oaiserver/ext.py | 5 ++ invenio_oaiserver/fetchers.py | 21 +++++++ invenio_oaiserver/percolator.py | 86 ++++++++++++++++++++++++++- invenio_oaiserver/query.py | 6 +- invenio_oaiserver/resumption_token.py | 4 +- invenio_oaiserver/verbs.py | 3 - invenio_oaiserver/views/server.py | 13 ++++ 9 files changed, 140 insertions(+), 7 deletions(-) diff --git a/invenio_oaiserver/config.py b/invenio_oaiserver/config.py index a0c4026..89220cb 100644 --- a/invenio_oaiserver/config.py +++ b/invenio_oaiserver/config.py @@ -51,7 +51,7 @@ with meanings as defined in ISO8601. """ -OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME = 1 * 60 +OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME = 10 * 60 """The expiration time of a resumption token in seconds. **Default: 60 seconds = 1 minute**. @@ -127,6 +127,8 @@ OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.utils:record_sets_fetcher' """Record's OAI sets function.""" +OAISERVER_SET_RECORDS_QUERY_FETCHER = 'invenio_oaiserver.fetchers:rdm_records_set_records_query_fetcher' + OAISERVER_RECORD_CLS = 'invenio_records.api:Record' """Record retrieval class.""" diff --git a/invenio_oaiserver/errors.py b/invenio_oaiserver/errors.py index 4fe7d47..8bad811 100644 --- a/invenio_oaiserver/errors.py +++ b/invenio_oaiserver/errors.py @@ -20,3 +20,8 @@ class OAISetSpecUpdateError(Exception): The correct way is to delete the set and create a new one. """ + +class OAINoRecordsMatchError(Exception): + """The combination of the values of the from, until, and set arguments + results in an empty list. + """ diff --git a/invenio_oaiserver/ext.py b/invenio_oaiserver/ext.py index 61609ca..1c0c2c6 100644 --- a/invenio_oaiserver/ext.py +++ b/invenio_oaiserver/ext.py @@ -56,6 +56,11 @@ def record_fetcher(self): """Get the record fetcher class for record serialization.""" return obj_or_import_string(self.app.config['OAISERVER_GETRECORD_FETCHER']) + @property + def set_records_query_fetcher(self): + """Get query to retrieve records based on set""" + return obj_or_import_string(self.app.config['OAISERVER_SET_RECORDS_QUERY_FETCHER']) + @property def last_update_key(self): """Get record update key.""" diff --git a/invenio_oaiserver/fetchers.py b/invenio_oaiserver/fetchers.py index 1e4abf1..4214cb5 100644 --- a/invenio_oaiserver/fetchers.py +++ b/invenio_oaiserver/fetchers.py @@ -10,10 +10,13 @@ """Persistent identifier fetchers.""" from __future__ import absolute_import, print_function +from elasticsearch_dsl.query import Q from invenio_pidstore.errors import PersistentIdentifierError from invenio_pidstore.fetchers import FetchedPID +from .models import OAISet +from .query import query_string_parser from .provider import OAIIDProvider @@ -33,3 +36,21 @@ def oaiid_fetcher(record_uuid, data): pid_type=OAIIDProvider.pid_type, pid_value=str(pid_value), ) + + +def set_records_query_fetcher(setSpec): + """Fetch query to retrieve records based on provided set spec""" + return Q('match', **{'_oai.sets': setSpec}) + + +# TODO: move to invenio_rdm_records +def rdm_records_set_records_query_fetcher(setSpec): + """Fetch query to retrieve records based on provided set spec""" + set = OAISet.query.filter(OAISet.spec == setSpec).first() + if set is None: + # raise error that no matches can be found ? + query = Q('match_none') + else: + query = Q(query_string_parser(set.search_pattern)) + + return query diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 0c854f4..1b983a3 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -18,10 +18,11 @@ from invenio_indexer.utils import schema_to_index from invenio_search import current_search, current_search_client from invenio_search.utils import build_index_name +from invenio_oaiserver.query import query_string_parser +from invenio_oaiserver.utils import record_sets_fetcher from .models import OAISet from .proxies import current_oaiserver -from .query import query_string_parser def _build_percolator_index_name(index): @@ -168,3 +169,86 @@ def get_record_sets(record): if set_name.startswith(prefix): name = set_name[prefix_len:] yield name + + +def record_in_set(record, set_spec): + percolator_index = _build_percolator_index_name("rdmrecords-records-record-v4.0.0") + result = percolate_query(percolator_index=percolator_index, percolator_ids=[set_spec], documents=[record]) + return len(result) > 0 + + +def create_percolate_query(percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): + queries = [] + + # documents or (document_es_ids and document_es_indices) has to be set + if documents is not None: + queries.append({ + "percolate" : { + "field": "query", + "documents" : documents, + } + }) + elif (document_es_ids is not None and document_es_indices is not None and len(document_es_ids) == len(document_es_indices)): + queries.extend( [{ + "percolate" : { + "field" : "query", + "index" : es_index, + "id" : es_id, + "name" : f"{es_index}:{es_id}", + } + } for (es_id, es_index) in zip(document_es_ids, document_es_indices)]) + else: + return {} + + if percolator_ids: + queries.append({ + "ids": { + "values" : percolator_ids + } + }) + + query = { + "query" : { + "bool" : { + "must" : queries + } + } + } + + return query + +def percolate_query(percolator_index, percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): + # TODO: remove before merging. only for testing purposes + index_sets() + + query = create_percolate_query(percolator_ids=percolator_ids, documents=documents, document_es_ids=document_es_ids, document_es_indices=document_es_indices) + result = current_search_client.search(index=percolator_index, body=query, scroll='1m', size=20) + # TODO: clear scroll? + # TOOO: iterate over scroll? + return result["hits"]["hits"] + + +def find_sets_for_record(record): + """Fetch a record's sets.""" + hits = percolate_query(percolator_index="rdmrecords-records-record-v4.0.0-percolators", documents=[record]) + return [s["_id"] for s in hits] + + +def index_sets(): + # should be done when a set is created or updated + sets = OAISet.query.all() + if not sets: + return [] + + index = "rdmrecords-records-record-v4.0.0" + percolator_doc_type = _get_percolator_doc_type(index) + # only created if it does not exist + _create_percolator_mapping(index, percolator_doc_type) + + for set in sets: + query = query_string_parser(set.search_pattern) + current_search_client.index( + index=_build_percolator_index_name(index), + id=set.spec, + body={'query': query.to_dict()} + ) diff --git a/invenio_oaiserver/query.py b/invenio_oaiserver/query.py index feeebe0..3729119 100644 --- a/invenio_oaiserver/query.py +++ b/invenio_oaiserver/query.py @@ -15,6 +15,7 @@ from flask import current_app from invenio_search import RecordsSearch, current_search_client from werkzeug.utils import cached_property, import_string +from invenio_oaiserver.errors import OAINoRecordsMatchError from . import current_oaiserver @@ -100,7 +101,7 @@ def get_records(**kwargs): ) if 'set' in kwargs: - search = search.query('match', **{'_oai.sets': kwargs['set']}) + search = search.query(current_oaiserver.set_records_query_fetcher(kwargs['set'])) time_range = {} if 'from_' in kwargs: @@ -129,6 +130,9 @@ def __init__(self, response): self.total = response['hits']['total'] if ES_VERSION[0] < 7 else response['hits']['total']['value'] self._scroll_id = response.get('_scroll_id') + if (self.total == 0): + raise OAINoRecordsMatchError() + # clean descriptor on last page if not self.has_next: current_search_client.clear_scroll(scroll_id=self._scroll_id) diff --git a/invenio_oaiserver/resumption_token.py b/invenio_oaiserver/resumption_token.py index 965b9a9..90147ac 100644 --- a/invenio_oaiserver/resumption_token.py +++ b/invenio_oaiserver/resumption_token.py @@ -53,7 +53,9 @@ def _deserialize(self, value, attr, data, **kwargs): result = token_builder.loads(value, max_age=current_app.config[ 'OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']) result['token'] = value - result['kwargs'] = self.root.load(result['kwargs'], partial=True).data + # TODO: remove? + # this loads the arguments from the token, which is not necessary as the resumptionToken keyword is exclusive + # result['kwargs'] = self.root.load(result['kwargs'], partial=True).data return result diff --git a/invenio_oaiserver/verbs.py b/invenio_oaiserver/verbs.py index a9b9665..a210816 100644 --- a/invenio_oaiserver/verbs.py +++ b/invenio_oaiserver/verbs.py @@ -150,9 +150,6 @@ class ListIdentifiers(OAISchema, ResumptionTokenSchema): class ListRecords(OAISchema, ResumptionTokenSchema): """Arguments for ListRecords verb.""" - metadataPrefix = fields.Str(load_only=True, - validate=validate_metadata_prefix) - class ListSets(OAISchema, ResumptionTokenSchema): """Arguments for ListSets verb.""" diff --git a/invenio_oaiserver/views/server.py b/invenio_oaiserver/views/server.py index 09b0982..6925ffd 100644 --- a/invenio_oaiserver/views/server.py +++ b/invenio_oaiserver/views/server.py @@ -18,6 +18,7 @@ from webargs.flaskparser import use_args from .. import response as xml +from ..errors import OAINoRecordsMatchError from ..verbs import make_request_validator blueprint = Blueprint( @@ -77,10 +78,20 @@ def resumptiontoken_error(exception): ])), 422, {'Content-Type': 'text/xml'}) +@blueprint.errorhandler(OAINoRecordsMatchError) +def no_records_error(exception): + """Handle no records match Exceptions.""" + return (etree.tostring(xml.error([('noRecordsMatch', + '')])), + 422, + {'Content-Type': 'text/xml'}) + +import time @blueprint.route('/oai2d', methods=['GET', 'POST']) @use_args(make_request_validator) def response(args): """Response endpoint.""" + start = time.time() e_tree = getattr(xml, args['verb'].lower())(**args) response = make_response(etree.tostring( @@ -89,5 +100,7 @@ def response(args): xml_declaration=True, encoding='UTF-8', )) + end = time.time() + print("took:", end - start) response.headers['Content-Type'] = 'text/xml' return response From a452d807d775a08ea29df6a50212a73a7add2828 Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Wed, 12 Jan 2022 12:05:01 +0100 Subject: [PATCH 02/19] update: query sets more efficiently --- invenio_oaiserver/config.py | 4 +- invenio_oaiserver/percolator.py | 156 +++++++++++++++++++++----------- invenio_oaiserver/query.py | 6 +- invenio_oaiserver/receivers.py | 19 ++-- invenio_oaiserver/response.py | 21 +++-- 5 files changed, 130 insertions(+), 76 deletions(-) diff --git a/invenio_oaiserver/config.py b/invenio_oaiserver/config.py index 89220cb..ba8481f 100644 --- a/invenio_oaiserver/config.py +++ b/invenio_oaiserver/config.py @@ -2,7 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. -# Copyright (C) 2021 Graz University of Technology. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -124,7 +124,7 @@ OAISERVER_CREATED_KEY = "_created" """Record created key.""" -OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.utils:record_sets_fetcher' +OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.percolator:find_sets_for_record' """Record's OAI sets function.""" OAISERVER_SET_RECORDS_QUERY_FETCHER = 'invenio_oaiserver.fetchers:rdm_records_set_records_query_fetcher' diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 1b983a3..146d953 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2017-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -13,13 +14,13 @@ import json from elasticsearch import VERSION as ES_VERSION +from elasticsearch.helpers.actions import scan from flask import current_app from invenio_indexer.api import RecordIndexer from invenio_indexer.utils import schema_to_index from invenio_search import current_search, current_search_client from invenio_search.utils import build_index_name from invenio_oaiserver.query import query_string_parser -from invenio_oaiserver.utils import record_sets_fetcher from .models import OAISet from .proxies import current_oaiserver @@ -106,34 +107,44 @@ def _get_percolator_doc_type(index): def _new_percolator(spec, search_pattern): """Create new percolator associated with the new set.""" + index = current_app.config.get('OAISERVER_RECORD_INDEX') if spec and search_pattern: query = query_string_parser(search_pattern=search_pattern).to_dict() - for index, mapping_path in current_search.mappings.items(): - # Create the percolator doc_type in the existing index for >= ES5 - # TODO: Consider doing this only once in app initialization - percolator_doc_type = _get_percolator_doc_type(index) - _create_percolator_mapping( - index, percolator_doc_type, mapping_path) - current_search_client.index( - index=_build_percolator_index_name(index), - doc_type=percolator_doc_type, - id='oaiset-{}'.format(spec), - body={'query': query} - ) + # Create the percolator doc_type in the existing index for >= ES5 + # TODO: Consider doing this only once in app initialization + percolator_doc_type = _get_percolator_doc_type(index) + _create_percolator_mapping(index, percolator_doc_type) + current_search_client.index( + index=_build_percolator_index_name(index), + doc_type=percolator_doc_type, + id='oaiset-{}'.format(spec), + body={'query': query} + ) + # for index, mapping_path in current_search.mappings.items(): + # # Create the percolator doc_type in the existing index for >= ES5 + # # TODO: Consider doing this only once in app initialization + # percolator_doc_type = _get_percolator_doc_type(index) + # _create_percolator_mapping( + # index, percolator_doc_type, mapping_path) + # current_search_client.index( + # index=_build_percolator_index_name(index), + # doc_type=percolator_doc_type, + # id='oaiset-{}'.format(spec), + # body={'query': query} + # ) def _delete_percolator(spec, search_pattern): - """Delete percolator associated with the new oaiset.""" - if spec: - for index in current_search.mappings.keys(): - # Create the percolator doc_type in the existing index for >= ES5 - percolator_doc_type = _get_percolator_doc_type(index) - _create_percolator_mapping(index, percolator_doc_type) - current_search_client.delete( - index=_build_percolator_index_name(index), - doc_type=percolator_doc_type, - id='oaiset-{}'.format(spec), ignore=[404] - ) + """Delete percolator associated with the removed/updated oaiset.""" + index = current_app.config.get('OAISERVER_RECORD_INDEX') + # Create the percolator doc_type in the existing index for >= ES5 + percolator_doc_type = _get_percolator_doc_type(index) + _create_percolator_mapping(index, percolator_doc_type) + current_search_client.delete( + index=_build_percolator_index_name(index), + doc_type=percolator_doc_type, + id='oaiset-{}'.format(spec), ignore=[404] + ) def _build_cache(): @@ -171,15 +182,8 @@ def get_record_sets(record): yield name -def record_in_set(record, set_spec): - percolator_index = _build_percolator_index_name("rdmrecords-records-record-v4.0.0") - result = percolate_query(percolator_index=percolator_index, percolator_ids=[set_spec], documents=[record]) - return len(result) > 0 - - def create_percolate_query(percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): queries = [] - # documents or (document_es_ids and document_es_indices) has to be set if documents is not None: queries.append({ @@ -198,7 +202,7 @@ def create_percolate_query(percolator_ids=None, documents=None, document_es_ids= } } for (es_id, es_index) in zip(document_es_ids, document_es_indices)]) else: - return {} + raise Exception("Either documents or (document_es_ids and document_es_indices) must be specified.") if percolator_ids: queries.append({ @@ -218,37 +222,81 @@ def create_percolate_query(percolator_ids=None, documents=None, document_es_ids= return query def percolate_query(percolator_index, percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): - # TODO: remove before merging. only for testing purposes - index_sets() - query = create_percolate_query(percolator_ids=percolator_ids, documents=documents, document_es_ids=document_es_ids, document_es_indices=document_es_indices) - result = current_search_client.search(index=percolator_index, body=query, scroll='1m', size=20) - # TODO: clear scroll? - # TOOO: iterate over scroll? - return result["hits"]["hits"] + result = scan(current_search_client, index=percolator_index, query=query, scroll="1m", size=100) + return result + + +def sets_search_all(records): + record_index = current_app.config.get('OAISERVER_RECORD_INDEX') + percolator_index = _build_percolator_index_name(record_index) + record_sets = [[] for _ in range(len(records))] + + result = percolate_query(percolator_index, documents=records) + + prefix = 'oaiset-' + prefix_len = len(prefix) + + for s in result: + set_index_id = s["_id"] + if set_index_id.startswith(prefix): + set_spec = set_index_id[prefix_len:] + for record_index in s.get("fields", {}).get("_percolator_document_slot", []): + record_sets[record_index].append(set_spec) + return record_sets def find_sets_for_record(record): """Fetch a record's sets.""" - hits = percolate_query(percolator_index="rdmrecords-records-record-v4.0.0-percolators", documents=[record]) - return [s["_id"] for s in hits] - + return sets_search_all([record])[0] + +# TODO: Remove everything below before merging +def remove_sets(): + from invenio_db import db + specs = [] + index = current_app.config.get('OAISERVER_RECORD_INDEX') + + for spec in specs: + try: + db_set = OAISet.query.filter_by(spec=spec).one() + print(db_set) + db.session.delete(db_set) + db.session.commit() + except Exception as e: + print(f"Exception during set removal ({spec}):",e) + + + _delete_percolator(spec, "***") + current_search_client.delete( + index=_build_percolator_index_name(index), + id=spec, + ignore=[404] + ) +def create_new_set(): + from datetime import datetime + from invenio_db import db + name = f"published-{datetime.now()}" + s = OAISet(spec=name, name=name, description="created via python orm", search_pattern="is_published:true") + db.session.add(s) + db.session.commit() + +import time def index_sets(): - # should be done when a set is created or updated sets = OAISet.query.all() if not sets: return [] - index = "rdmrecords-records-record-v4.0.0" - percolator_doc_type = _get_percolator_doc_type(index) - # only created if it does not exist - _create_percolator_mapping(index, percolator_doc_type) + x0 = time.time() + num_total = len(sets) + for index, set in enumerate(sets): + _new_percolator(set.spec, set.search_pattern) + print_estimated_time(x0, num_total, index) - for set in sets: - query = query_string_parser(set.search_pattern) - current_search_client.index( - index=_build_percolator_index_name(index), - id=set.spec, - body={'query': query.to_dict()} - ) + +def print_estimated_time(start_time, num_total_elements, num_current_element): + current_time = time.time() + total_time_so_far = current_time - start_time + average_time = total_time_so_far / (num_current_element + 1) + estimated_time = (num_total_elements - num_current_element) * average_time + print(f"{num_current_element}/{num_total_elements} took {total_time_so_far:.2f}. Estimate: {estimated_time:.2f}") diff --git a/invenio_oaiserver/query.py b/invenio_oaiserver/query.py index 3729119..a874f4d 100644 --- a/invenio_oaiserver/query.py +++ b/invenio_oaiserver/query.py @@ -2,7 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2018 CERN. -# Copyright (C) 2021 Graz University of Technology. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -79,14 +79,14 @@ def get_affected_records(spec=None, search_pattern=None): for result in search.scan(): yield result.meta.id - + def get_records(**kwargs): """Get records paginated.""" page_ = kwargs.get('resumptionToken', {}).get('page', 1) size_ = current_app.config['OAISERVER_PAGE_SIZE'] scroll = current_app.config['OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME'] scroll_id = kwargs.get('resumptionToken', {}).get('scroll_id') - + if scroll_id is None: search = ( current_oaiserver.search_cls( diff --git a/invenio_oaiserver/receivers.py b/invenio_oaiserver/receivers.py index 1fdeba4..15316a9 100644 --- a/invenio_oaiserver/receivers.py +++ b/invenio_oaiserver/receivers.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -36,23 +37,23 @@ def __call__(self, sender, record, **kwargs): def after_insert_oai_set(mapper, connection, target): """Update records on OAISet insertion.""" _new_percolator(spec=target.spec, search_pattern=target.search_pattern) - sleep(2) - update_affected_records.delay(search_pattern=target.search_pattern) + # sleep(2) + # update_affected_records.delay(search_pattern=target.search_pattern) def after_update_oai_set(mapper, connection, target): """Update records on OAISet update.""" _delete_percolator(spec=target.spec, search_pattern=target.search_pattern) _new_percolator(spec=target.spec, search_pattern=target.search_pattern) - sleep(2) - update_affected_records.delay( - spec=target.spec, search_pattern=target.search_pattern) + # sleep(2) + # update_affected_records.delay( + # spec=target.spec, search_pattern=target.search_pattern) def after_delete_oai_set(mapper, connection, target): """Update records on OAISet deletion.""" _delete_percolator(spec=target.spec, search_pattern=target.search_pattern) - sleep(2) - update_affected_records.delay( - spec=target.spec - ) + # sleep(2) + # update_affected_records.delay( + # spec=target.spec + # ) diff --git a/invenio_oaiserver/response.py b/invenio_oaiserver/response.py index d1eb595..30e383e 100644 --- a/invenio_oaiserver/response.py +++ b/invenio_oaiserver/response.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -16,6 +17,8 @@ from lxml import etree from lxml.etree import Element, ElementTree, SubElement +from invenio_oaiserver.percolator import sets_search_all + from .models import OAISet from .provider import OAIIDProvider from .proxies import current_oaiserver @@ -304,7 +307,10 @@ def listidentifiers(**kwargs): e_tree, e_listidentifiers = verb(**kwargs) result = get_records(**kwargs) - for record in result.items: + all_records = [record for record in result.items] + records_sets = sets_search_all([r['json']['_source'] for r in all_records]) + + for index, record in enumerate(all_records): pid = current_oaiserver.oaiid_fetcher( record['id'], record['json']['_source'] ) @@ -312,9 +318,7 @@ def listidentifiers(**kwargs): e_listidentifiers, identifier=pid.pid_value, datestamp=record['updated'], - sets=current_oaiserver.record_sets_fetcher( - record['json']['_source'] - ), + sets=records_sets[index], ) resumption_token(e_listidentifiers, result, **kwargs) @@ -328,7 +332,10 @@ def listrecords(**kwargs): e_tree, e_listrecords = verb(**kwargs) result = get_records(**kwargs) - for record in result.items: + all_records = [record for record in result.items] + records_sets = sets_search_all([r['json']['_source'] for r in all_records]) + + for index, record in enumerate(all_records): pid = current_oaiserver.oaiid_fetcher( record['id'], record['json']['_source'] ) @@ -337,9 +344,7 @@ def listrecords(**kwargs): e_record, identifier=pid.pid_value, datestamp=record['updated'], - sets=current_oaiserver.record_sets_fetcher( - record['json']['_source'] - ), + sets=records_sets[index], ) e_metadata = SubElement(e_record, etree.QName(NS_OAIPMH, 'metadata')) e_metadata.append(record_dumper(pid, record['json'])) From 13e66e28095f025c216401f1b7a909b0c55f074c Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Mon, 17 Jan 2022 17:40:55 +0100 Subject: [PATCH 03/19] tests: fix pydocstyle and isort --- invenio_oaiserver/errors.py | 7 +- invenio_oaiserver/ext.py | 2 +- invenio_oaiserver/fetchers.py | 14 +-- invenio_oaiserver/percolator.py | 172 +++++++++++++++++++----------- invenio_oaiserver/query.py | 5 +- invenio_oaiserver/views/server.py | 4 +- 6 files changed, 130 insertions(+), 74 deletions(-) diff --git a/invenio_oaiserver/errors.py b/invenio_oaiserver/errors.py index 8bad811..dac9c91 100644 --- a/invenio_oaiserver/errors.py +++ b/invenio_oaiserver/errors.py @@ -21,7 +21,10 @@ class OAISetSpecUpdateError(Exception): The correct way is to delete the set and create a new one. """ + class OAINoRecordsMatchError(Exception): - """The combination of the values of the from, until, and set arguments - results in an empty list. + """No records match the query. + + The combination of the values of the from, until, and set arguments + results in an empty list. """ diff --git a/invenio_oaiserver/ext.py b/invenio_oaiserver/ext.py index 1c0c2c6..2874b23 100644 --- a/invenio_oaiserver/ext.py +++ b/invenio_oaiserver/ext.py @@ -58,7 +58,7 @@ def record_fetcher(self): @property def set_records_query_fetcher(self): - """Get query to retrieve records based on set""" + """Get query to retrieve records based on set.""" return obj_or_import_string(self.app.config['OAISERVER_SET_RECORDS_QUERY_FETCHER']) @property diff --git a/invenio_oaiserver/fetchers.py b/invenio_oaiserver/fetchers.py index 4214cb5..3d8d814 100644 --- a/invenio_oaiserver/fetchers.py +++ b/invenio_oaiserver/fetchers.py @@ -10,14 +10,14 @@ """Persistent identifier fetchers.""" from __future__ import absolute_import, print_function -from elasticsearch_dsl.query import Q +from elasticsearch_dsl.query import Q from invenio_pidstore.errors import PersistentIdentifierError from invenio_pidstore.fetchers import FetchedPID from .models import OAISet -from .query import query_string_parser from .provider import OAIIDProvider +from .query import query_string_parser def oaiid_fetcher(record_uuid, data): @@ -39,18 +39,18 @@ def oaiid_fetcher(record_uuid, data): def set_records_query_fetcher(setSpec): - """Fetch query to retrieve records based on provided set spec""" + """Fetch query to retrieve records based on provided set spec.""" return Q('match', **{'_oai.sets': setSpec}) - - + + # TODO: move to invenio_rdm_records def rdm_records_set_records_query_fetcher(setSpec): - """Fetch query to retrieve records based on provided set spec""" + """Fetch query to retrieve records based on provided set spec.""" set = OAISet.query.filter(OAISet.spec == setSpec).first() if set is None: # raise error that no matches can be found ? query = Q('match_none') else: query = Q(query_string_parser(set.search_pattern)) - + return query diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 146d953..c544b0a 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -20,6 +20,7 @@ from invenio_indexer.utils import schema_to_index from invenio_search import current_search, current_search_client from invenio_search.utils import build_index_name + from invenio_oaiserver.query import query_string_parser from .models import OAISet @@ -45,8 +46,8 @@ def _create_percolator_mapping(index, doc_type, mapping_path=None): percolator_index = _build_percolator_index_name(index) if ES_VERSION[0] in (5, 6): current_search_client.indices.put_mapping( - index=index, doc_type=doc_type, - body=PERCOLATOR_MAPPING) + index=index, doc_type=doc_type, body=PERCOLATOR_MAPPING + ) elif ES_VERSION[0] == 7: if not mapping_path: mapping_path = current_search.mappings[index] @@ -54,10 +55,10 @@ def _create_percolator_mapping(index, doc_type, mapping_path=None): with open(mapping_path, 'r') as body: mapping = json.load(body) mapping["mappings"]["properties"].update( - PERCOLATOR_MAPPING["properties"]) + PERCOLATOR_MAPPING["properties"] + ) current_search_client.indices.create( - index=percolator_index, - body=mapping + index=percolator_index, body=mapping ) @@ -66,14 +67,20 @@ def _percolate_query(index, doc_type, percolator_doc_type, document): index = _build_percolator_index_name(index) if ES_VERSION[0] in (2, 5): results = current_search_client.percolate( - index=index, doc_type=doc_type, allow_no_indices=True, - ignore_unavailable=True, body={'doc': document} + index=index, + doc_type=doc_type, + allow_no_indices=True, + ignore_unavailable=True, + body={'doc': document}, ) return results['matches'] elif ES_VERSION[0] in (6, 7): es_client_params = dict( - index=index, doc_type=percolator_doc_type, allow_no_indices=True, - ignore_unavailable=True, body={ + index=index, + doc_type=percolator_doc_type, + allow_no_indices=True, + ignore_unavailable=True, + body={ 'query': { 'percolate': { 'field': 'query', @@ -81,7 +88,8 @@ def _percolate_query(index, doc_type, percolator_doc_type, document): 'document': document, } } - }) + }, + ) if ES_VERSION[0] == 7: es_client_params.pop('doc_type') results = current_search_client.search(**es_client_params) @@ -100,9 +108,7 @@ def _get_percolator_doc_type(index): return doc_type -PERCOLATOR_MAPPING = { - 'properties': {'query': {'type': 'percolator'}} -} +PERCOLATOR_MAPPING = {'properties': {'query': {'type': 'percolator'}}} def _new_percolator(spec, search_pattern): @@ -118,7 +124,7 @@ def _new_percolator(spec, search_pattern): index=_build_percolator_index_name(index), doc_type=percolator_doc_type, id='oaiset-{}'.format(spec), - body={'query': query} + body={'query': query}, ) # for index, mapping_path in current_search.mappings.items(): # # Create the percolator doc_type in the existing index for >= ES5 @@ -143,7 +149,8 @@ def _delete_percolator(spec, search_pattern): current_search_client.delete( index=_build_percolator_index_name(index), doc_type=percolator_doc_type, - id='oaiset-{}'.format(spec), ignore=[404] + id='oaiset-{}'.format(spec), + ignore=[404], ) @@ -153,8 +160,9 @@ def _build_cache(): if sets is None: # build sets cache sets = current_oaiserver.sets = [ - oaiset.spec for oaiset in OAISet.query.filter( - OAISet.search_pattern.is_(None)).all()] + oaiset.spec + for oaiset in OAISet.query.filter(OAISet.search_pattern.is_(None)).all() + ] return sets @@ -182,53 +190,77 @@ def get_record_sets(record): yield name -def create_percolate_query(percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): +def create_percolate_query( + percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None +): + """Create percolate query for provided arguments.""" queries = [] # documents or (document_es_ids and document_es_indices) has to be set if documents is not None: - queries.append({ - "percolate" : { - "field": "query", - "documents" : documents, - } - }) - elif (document_es_ids is not None and document_es_indices is not None and len(document_es_ids) == len(document_es_indices)): - queries.extend( [{ - "percolate" : { - "field" : "query", - "index" : es_index, - "id" : es_id, - "name" : f"{es_index}:{es_id}", + queries.append( + { + "percolate": { + "field": "query", + "documents": documents, + } } - } for (es_id, es_index) in zip(document_es_ids, document_es_indices)]) + ) + elif ( + document_es_ids is not None and document_es_indices + is not None and len(document_es_ids) == len(document_es_indices) + ): + queries.extend( + [ + { + "percolate": { + "field": "query", + "index": es_index, + "id": es_id, + "name": f"{es_index}:{es_id}", + } + } + for (es_id, es_index) in zip(document_es_ids, document_es_indices) + ] + ) else: - raise Exception("Either documents or (document_es_ids and document_es_indices) must be specified.") - + raise Exception( + "Either documents or (document_es_ids and document_es_indices) must be specified." + ) + if percolator_ids: - queries.append({ - "ids": { - "values" : percolator_ids - } - }) - - query = { - "query" : { - "bool" : { - "must" : queries - } - } - } + queries.append({"ids": {"values": percolator_ids}}) + + query = {"query": {"bool": {"must": queries}}} return query -def percolate_query(percolator_index, percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None): - query = create_percolate_query(percolator_ids=percolator_ids, documents=documents, document_es_ids=document_es_ids, document_es_indices=document_es_indices) - result = scan(current_search_client, index=percolator_index, query=query, scroll="1m", size=100) + +def percolate_query( + percolator_index, + percolator_ids=None, + documents=None, + document_es_ids=None, + document_es_indices=None, +): + """Get results for a percolate query.""" + query = create_percolate_query( + percolator_ids=percolator_ids, + documents=documents, + document_es_ids=document_es_ids, + document_es_indices=document_es_indices, + ) + result = scan( + current_search_client, + index=percolator_index, + query=query, + scroll="1m", + ) return result def sets_search_all(records): - record_index = current_app.config.get('OAISERVER_RECORD_INDEX') + """Retrieve sets for provided records.""" + record_index = current_app.config.get('OAISERVER_RECORD_INDEX') percolator_index = _build_percolator_index_name(record_index) record_sets = [[] for _ in range(len(records))] @@ -241,7 +273,9 @@ def sets_search_all(records): set_index_id = s["_id"] if set_index_id.startswith(prefix): set_spec = set_index_id[prefix_len:] - for record_index in s.get("fields", {}).get("_percolator_document_slot", []): + for record_index in s.get("fields", {}).get( + "_percolator_document_slot", [] + ): record_sets[record_index].append(set_spec) return record_sets @@ -250,11 +284,14 @@ def find_sets_for_record(record): """Fetch a record's sets.""" return sets_search_all([record])[0] + # TODO: Remove everything below before merging def remove_sets(): + """Remove specified sets.""" from invenio_db import db + specs = [] - index = current_app.config.get('OAISERVER_RECORD_INDEX') + index = current_app.config.get('OAISERVER_RECORD_INDEX') for spec in specs: try: @@ -263,26 +300,36 @@ def remove_sets(): db.session.delete(db_set) db.session.commit() except Exception as e: - print(f"Exception during set removal ({spec}):",e) - + print(f"Exception during set removal ({spec}):", e) _delete_percolator(spec, "***") current_search_client.delete( - index=_build_percolator_index_name(index), - id=spec, - ignore=[404] + index=_build_percolator_index_name(index), id=spec, ignore=[404] ) + def create_new_set(): + """Create a new set.""" from datetime import datetime + from invenio_db import db + name = f"published-{datetime.now()}" - s = OAISet(spec=name, name=name, description="created via python orm", search_pattern="is_published:true") + s = OAISet( + spec=name, + name=name, + description="created via python orm", + search_pattern="is_published:true", + ) db.session.add(s) db.session.commit() - + + import time + + def index_sets(): + """Index all sets.""" sets = OAISet.query.all() if not sets: return [] @@ -295,8 +342,11 @@ def index_sets(): def print_estimated_time(start_time, num_total_elements, num_current_element): + """Calculate and print estimated remaining time.""" current_time = time.time() total_time_so_far = current_time - start_time average_time = total_time_so_far / (num_current_element + 1) estimated_time = (num_total_elements - num_current_element) * average_time - print(f"{num_current_element}/{num_total_elements} took {total_time_so_far:.2f}. Estimate: {estimated_time:.2f}") + print( + f"{num_current_element}/{num_total_elements} took {total_time_so_far:.2f}. Estimate: {estimated_time:.2f}" + ) diff --git a/invenio_oaiserver/query.py b/invenio_oaiserver/query.py index a874f4d..9486b05 100644 --- a/invenio_oaiserver/query.py +++ b/invenio_oaiserver/query.py @@ -15,6 +15,7 @@ from flask import current_app from invenio_search import RecordsSearch, current_search_client from werkzeug.utils import cached_property, import_string + from invenio_oaiserver.errors import OAINoRecordsMatchError from . import current_oaiserver @@ -79,14 +80,14 @@ def get_affected_records(spec=None, search_pattern=None): for result in search.scan(): yield result.meta.id - + def get_records(**kwargs): """Get records paginated.""" page_ = kwargs.get('resumptionToken', {}).get('page', 1) size_ = current_app.config['OAISERVER_PAGE_SIZE'] scroll = current_app.config['OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME'] scroll_id = kwargs.get('resumptionToken', {}).get('scroll_id') - + if scroll_id is None: search = ( current_oaiserver.search_cls( diff --git a/invenio_oaiserver/views/server.py b/invenio_oaiserver/views/server.py index 6925ffd..0af54f5 100644 --- a/invenio_oaiserver/views/server.py +++ b/invenio_oaiserver/views/server.py @@ -10,6 +10,8 @@ from __future__ import absolute_import +import time + from flask import Blueprint, make_response from invenio_pidstore.errors import PIDDoesNotExistError from itsdangerous import BadSignature @@ -86,7 +88,7 @@ def no_records_error(exception): 422, {'Content-Type': 'text/xml'}) -import time + @blueprint.route('/oai2d', methods=['GET', 'POST']) @use_args(make_request_validator) def response(args): From 98fe14e393b390103dd9afb8de324853eed06321 Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Tue, 18 Jan 2022 13:08:47 +0100 Subject: [PATCH 04/19] fix: use recordindexer to retrieve index --- invenio_oaiserver/percolator.py | 62 ++++++++++++++++----------------- invenio_oaiserver/response.py | 2 +- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index c544b0a..bb4be6e 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -113,45 +113,38 @@ def _get_percolator_doc_type(index): def _new_percolator(spec, search_pattern): """Create new percolator associated with the new set.""" - index = current_app.config.get('OAISERVER_RECORD_INDEX') if spec and search_pattern: query = query_string_parser(search_pattern=search_pattern).to_dict() - # Create the percolator doc_type in the existing index for >= ES5 - # TODO: Consider doing this only once in app initialization + for index, mapping_path in current_search.mappings.items(): + # Create the percolator doc_type in the existing index for >= ES5 + # TODO: Consider doing this only once in app initialization + try: + percolator_doc_type = _get_percolator_doc_type(index) + _create_percolator_mapping( + index, percolator_doc_type, mapping_path) + current_search_client.index( + index=_build_percolator_index_name(index), + doc_type=percolator_doc_type, + id='oaiset-{}'.format(spec), + body={'query': query} + ) + except: + # caught on schemas, which do not contain the query field + pass + + +def _delete_percolator(spec, search_pattern): + """Delete percolator associated with the removed/updated oaiset.""" + # Create the percolator doc_type in the existing index for >= ES5 + for index, mapping_path in current_search.mappings.items(): percolator_doc_type = _get_percolator_doc_type(index) _create_percolator_mapping(index, percolator_doc_type) - current_search_client.index( + current_search_client.delete( index=_build_percolator_index_name(index), doc_type=percolator_doc_type, id='oaiset-{}'.format(spec), - body={'query': query}, + ignore=[404], ) - # for index, mapping_path in current_search.mappings.items(): - # # Create the percolator doc_type in the existing index for >= ES5 - # # TODO: Consider doing this only once in app initialization - # percolator_doc_type = _get_percolator_doc_type(index) - # _create_percolator_mapping( - # index, percolator_doc_type, mapping_path) - # current_search_client.index( - # index=_build_percolator_index_name(index), - # doc_type=percolator_doc_type, - # id='oaiset-{}'.format(spec), - # body={'query': query} - # ) - - -def _delete_percolator(spec, search_pattern): - """Delete percolator associated with the removed/updated oaiset.""" - index = current_app.config.get('OAISERVER_RECORD_INDEX') - # Create the percolator doc_type in the existing index for >= ES5 - percolator_doc_type = _get_percolator_doc_type(index) - _create_percolator_mapping(index, percolator_doc_type) - current_search_client.delete( - index=_build_percolator_index_name(index), - doc_type=percolator_doc_type, - id='oaiset-{}'.format(spec), - ignore=[404], - ) def _build_cache(): @@ -260,7 +253,12 @@ def percolate_query( def sets_search_all(records): """Retrieve sets for provided records.""" - record_index = current_app.config.get('OAISERVER_RECORD_INDEX') + if not records: + return [] + + # records should all have the same index. maybe add index as parameter? + record_index, doc_type = RecordIndexer()._record_to_index(records[0]) + _create_percolator_mapping(record_index, doc_type) percolator_index = _build_percolator_index_name(record_index) record_sets = [[] for _ in range(len(records))] diff --git a/invenio_oaiserver/response.py b/invenio_oaiserver/response.py index 30e383e..4309dfe 100644 --- a/invenio_oaiserver/response.py +++ b/invenio_oaiserver/response.py @@ -17,7 +17,7 @@ from lxml import etree from lxml.etree import Element, ElementTree, SubElement -from invenio_oaiserver.percolator import sets_search_all +from invenio_oaiserver.percolator import create_new_set, sets_search_all from .models import OAISet from .provider import OAIIDProvider From fcf3b3846a47a3e91e92ec65d4f1c18ba4a2a726 Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Tue, 18 Jan 2022 13:59:31 +0100 Subject: [PATCH 05/19] fix: remove unused parts --- invenio_oaiserver/ext.py | 17 +------- invenio_oaiserver/models.py | 63 +++++++++++---------------- invenio_oaiserver/percolator.py | 36 --------------- invenio_oaiserver/query.py | 34 --------------- invenio_oaiserver/receivers.py | 31 +------------ invenio_oaiserver/resumption_token.py | 7 ++- invenio_oaiserver/tasks.py | 54 ----------------------- 7 files changed, 35 insertions(+), 207 deletions(-) delete mode 100644 invenio_oaiserver/tasks.py diff --git a/invenio_oaiserver/ext.py b/invenio_oaiserver/ext.py index 2874b23..3a49f30 100644 --- a/invenio_oaiserver/ext.py +++ b/invenio_oaiserver/ext.py @@ -2,7 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. -# Copyright (C) 2021 Graz University of Technology. +# Copyright (C) 2021-2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -87,14 +87,6 @@ def sets(self, values): def register_signals(self): """Register signals.""" - from .receivers import OAIServerUpdater - - # Register Record signals to update OAI informations - self.update_function = OAIServerUpdater() - records_signals.before_record_insert.connect(self.update_function, - weak=False) - records_signals.before_record_update.connect(self.update_function, - weak=False) if self.app.config['OAISERVER_REGISTER_SET_SIGNALS']: self.register_signals_oaiset() @@ -110,11 +102,6 @@ def register_signals_oaiset(self): def unregister_signals(self): """Unregister signals.""" # Unregister Record signals - if hasattr(self, 'update_function'): - records_signals.before_record_insert.disconnect( - self.update_function) - records_signals.before_record_update.disconnect( - self.update_function) self.unregister_signals_oaiset() def unregister_signals_oaiset(self): @@ -173,7 +160,7 @@ def init_config(self, app): import warnings app.config['OAISERVER_ID_PREFIX'] = \ - 'oai:{0}:recid/'.format(socket.gethostname()) + 'oai:{0}:'.format(socket.gethostname()) warnings.warn( """Please specify the OAISERVER_ID_PREFIX configuration.""" """default value is: {0}""".format( diff --git a/invenio_oaiserver/models.py b/invenio_oaiserver/models.py index b476f06..d2a02ef 100644 --- a/invenio_oaiserver/models.py +++ b/invenio_oaiserver/models.py @@ -8,8 +8,6 @@ """Models for storing information about OAIServer state.""" -from datetime import datetime - from flask_babelex import lazy_gettext as _ from invenio_db import db from sqlalchemy.event import listen @@ -18,7 +16,6 @@ from .errors import OAISetSpecUpdateError from .proxies import current_oaiserver -from .utils import datetime_to_datestamp class OAISet(db.Model, Timestamp): @@ -76,52 +73,44 @@ def validate_spec(self, key, value): raise OAISetSpecUpdateError("Updating spec is not allowed.") return value - def add_record(self, record): - """Add a record to the OAISet. - - :param record: Record to be added. - :type record: `invenio_records.api.Record` or derivative. - """ - record.setdefault('_oai', {}).setdefault('sets', []) - assert not self.has_record(record) + # TODO: Add and remove can be implemented but it will require to + # update the `search_pattern` - record['_oai']['sets'].append(self.spec) + # def add_record(self, record): + # """Add a record to the OAISet. - def remove_record(self, record): - """Remove a record from the OAISet. + # :param record: Record to be added. + # :type record: `invenio_records.api.Record` or derivative. + # """ + # record.setdefault('_oai', {}).setdefault('sets', []) - :param record: Record to be removed. - :type record: `invenio_records.api.Record` or derivative. - """ - assert self.has_record(record) + # assert not self.has_record(record) - record['_oai']['sets'] = [ - s for s in record['_oai']['sets'] if s != self.spec] + # record['_oai']['sets'].append(self.spec) - def has_record(self, record): - """Check if the record blongs to the OAISet. + # def remove_record(self, record): + # """Remove a record from the OAISet. - :param record: Record to be checked. - :type record: `invenio_records.api.Record` or derivative. - """ - return self.spec in record.get('_oai', {}).get('sets', []) + # :param record: Record to be removed. + # :type record: `invenio_records.api.Record` or derivative. + # """ + # assert self.has_record(record) + # record['_oai']['sets'] = [ + # s for s in record['_oai']['sets'] if s != self.spec] -def oaiset_removed_or_inserted(mapper, connection, target): - """Invalidate cache on collection insert or delete.""" - current_oaiserver.sets = None + # TODO: has_record can be implemented but it will require to + # to do a full search. -def oaiset_attribute_changed(target, value, oldvalue, initiator): - """Invalidate cache if dbquery change.""" - if value != oldvalue: - current_oaiserver.sets = None + # def has_record(self, record): + # """Check if the record blongs to the OAISet. + # :param record: Record to be checked. + # :type record: `invenio_records.api.Record` or derivative. + # """ + # return self.spec in record.get('_oai', {}).get('sets', []) -# update cache with list of collections -listen(OAISet, 'after_insert', oaiset_removed_or_inserted) -listen(OAISet, 'after_delete', oaiset_removed_or_inserted) -listen(OAISet.search_pattern, 'set', oaiset_attribute_changed) __all__ = ('OAISet', ) diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index bb4be6e..1eb7d7d 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -147,42 +147,6 @@ def _delete_percolator(spec, search_pattern): ) -def _build_cache(): - """Build sets cache.""" - sets = current_oaiserver.sets - if sets is None: - # build sets cache - sets = current_oaiserver.sets = [ - oaiset.spec - for oaiset in OAISet.query.filter(OAISet.search_pattern.is_(None)).all() - ] - return sets - - -def get_record_sets(record): - """Find matching sets.""" - # get lists of sets with search_pattern equals to None but already in the - # set list inside the record - record_sets = set(current_oaiserver.record_sets_fetcher(record)) - for spec in _build_cache(): - if spec in record_sets: - yield spec - - # get list of sets that match using percolator - index, doc_type = RecordIndexer().record_to_index(record) - document = record.dumps() - percolator_doc_type = _get_percolator_doc_type(index) - _create_percolator_mapping(index, percolator_doc_type) - results = _percolate_query(index, doc_type, percolator_doc_type, document) - prefix = 'oaiset-' - prefix_len = len(prefix) - for match in results: - set_name = match['_id'] - if set_name.startswith(prefix): - name = set_name[prefix_len:] - yield name - - def create_percolate_query( percolator_ids=None, documents=None, document_es_ids=None, document_es_indices=None ): diff --git a/invenio_oaiserver/query.py b/invenio_oaiserver/query.py index 9486b05..45766d4 100644 --- a/invenio_oaiserver/query.py +++ b/invenio_oaiserver/query.py @@ -47,40 +47,6 @@ class Meta: default_filter = Q('exists', field='_oai.id') -def get_affected_records(spec=None, search_pattern=None): - """Get list of affected records. - - :param spec: The record spec. - :param search_pattern: The search pattern. - :returns: An iterator to lazily find results. - """ - # spec pattern query - # ---------- ---------- ------- - # None None None - # None Y Y - # X None X - # X '' X - # X Y X OR Y - - if spec is None and search_pattern is None: - return - - queries = [] - - if spec is not None: - queries.append(Q('match', **{'_oai.sets': spec})) - - if search_pattern: - queries.append(query_string_parser(search_pattern=search_pattern)) - - search = current_oaiserver.search_cls( - index=current_app.config['OAISERVER_RECORD_INDEX'], - ).query(Q('bool', should=queries)) - - for result in search.scan(): - yield result.meta.id - - def get_records(**kwargs): """Get records paginated.""" page_ = kwargs.get('resumptionToken', {}).get('page', 1) diff --git a/invenio_oaiserver/receivers.py b/invenio_oaiserver/receivers.py index 15316a9..7de3cca 100644 --- a/invenio_oaiserver/receivers.py +++ b/invenio_oaiserver/receivers.py @@ -11,49 +11,20 @@ from __future__ import absolute_import, print_function -from time import sleep - -from .percolator import _delete_percolator, _new_percolator, get_record_sets -from .tasks import update_affected_records - - -class OAIServerUpdater(object): - """Return the right update oaisets function.""" - - def __call__(self, sender, record, **kwargs): - """Update sets list. - - :param record: The record data. - """ - if '_oai' in record and 'id' in record['_oai']: - new_sets = set(get_record_sets(record=record)) - # Update only if old and new sets differ - if set(record['_oai'].get('sets', [])) != new_sets: - record['_oai'].update({ - 'sets': list(new_sets) - }) +from .percolator import _delete_percolator, _new_percolator def after_insert_oai_set(mapper, connection, target): """Update records on OAISet insertion.""" _new_percolator(spec=target.spec, search_pattern=target.search_pattern) - # sleep(2) - # update_affected_records.delay(search_pattern=target.search_pattern) def after_update_oai_set(mapper, connection, target): """Update records on OAISet update.""" _delete_percolator(spec=target.spec, search_pattern=target.search_pattern) _new_percolator(spec=target.spec, search_pattern=target.search_pattern) - # sleep(2) - # update_affected_records.delay( - # spec=target.spec, search_pattern=target.search_pattern) def after_delete_oai_set(mapper, connection, target): """Update records on OAISet deletion.""" _delete_percolator(spec=target.spec, search_pattern=target.search_pattern) - # sleep(2) - # update_affected_records.delay( - # spec=target.spec - # ) diff --git a/invenio_oaiserver/resumption_token.py b/invenio_oaiserver/resumption_token.py index 90147ac..3654e30 100644 --- a/invenio_oaiserver/resumption_token.py +++ b/invenio_oaiserver/resumption_token.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -53,8 +54,12 @@ def _deserialize(self, value, attr, data, **kwargs): result = token_builder.loads(value, max_age=current_app.config[ 'OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']) result['token'] = value + # TODO: remove? - # this loads the arguments from the token, which is not necessary as the resumptionToken keyword is exclusive + # this loads the arguments from the token, which is not necessary as + # the resumptionToken keyword is exclusive and will lead to an error + # that other arguments have to be provided as well. + # result['kwargs'] = self.root.load(result['kwargs'], partial=True).data return result diff --git a/invenio_oaiserver/tasks.py b/invenio_oaiserver/tasks.py deleted file mode 100644 index e986cb0..0000000 --- a/invenio_oaiserver/tasks.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. -# -# Invenio is free software; you can redistribute it and/or modify it -# under the terms of the MIT License; see LICENSE file for more details. - -"""Task for OAI.""" - -from celery import group, shared_task -from flask import current_app -from flask_celeryext import RequestContextTask -from invenio_db import db -from invenio_records.api import Record - -from .query import get_affected_records - -try: - from itertools import zip_longest -except ImportError: # pragma: no cover - from itertools import izip_longest as zip_longest - - -def _records_commit(record_ids): - """Commit all records.""" - for record_id in record_ids: - record = Record.get_record(record_id) - record.commit() - - -@shared_task(base=RequestContextTask) -def update_records_sets(record_ids): - """Update records sets. - - :param record_ids: List of record UUID. - """ - _records_commit(record_ids) - db.session.commit() - - -@shared_task(base=RequestContextTask) -def update_affected_records(spec=None, search_pattern=None): - """Update all affected records by OAISet change. - - :param spec: The record spec. - :param search_pattern: The search pattern. - """ - chunk_size = current_app.config['OAISERVER_CELERY_TASK_CHUNK_SIZE'] - record_ids = get_affected_records(spec=spec, search_pattern=search_pattern) - group( - update_records_sets.s(list(filter(None, chunk))) - for chunk in zip_longest(*[iter(record_ids)] * chunk_size) - )() From 96dd51a3c7674f0205982f84f50ded76ad65e071 Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Tue, 18 Jan 2022 14:14:05 +0100 Subject: [PATCH 06/19] refactor: pass testcases --- invenio_oaiserver/config.py | 2 +- invenio_oaiserver/models.py | 2 - invenio_oaiserver/percolator.py | 45 ++- invenio_oaiserver/resumption_token.py | 2 +- tests/test_invenio_oaiserver.py | 2 +- tests/test_percolator.py | 487 +++++++++++++------------- 6 files changed, 268 insertions(+), 272 deletions(-) diff --git a/invenio_oaiserver/config.py b/invenio_oaiserver/config.py index ba8481f..fc6c156 100644 --- a/invenio_oaiserver/config.py +++ b/invenio_oaiserver/config.py @@ -51,7 +51,7 @@ with meanings as defined in ISO8601. """ -OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME = 10 * 60 +OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME = 1 * 60 """The expiration time of a resumption token in seconds. **Default: 60 seconds = 1 minute**. diff --git a/invenio_oaiserver/models.py b/invenio_oaiserver/models.py index d2a02ef..b5a69e2 100644 --- a/invenio_oaiserver/models.py +++ b/invenio_oaiserver/models.py @@ -73,7 +73,6 @@ def validate_spec(self, key, value): raise OAISetSpecUpdateError("Updating spec is not allowed.") return value - # TODO: Add and remove can be implemented but it will require to # update the `search_pattern` @@ -100,7 +99,6 @@ def validate_spec(self, key, value): # record['_oai']['sets'] = [ # s for s in record['_oai']['sets'] if s != self.spec] - # TODO: has_record can be implemented but it will require to # to do a full search. diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 1eb7d7d..979e0d1 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -12,6 +12,7 @@ from __future__ import absolute_import, print_function import json +import warnings from elasticsearch import VERSION as ES_VERSION from elasticsearch.helpers.actions import scan @@ -128,8 +129,9 @@ def _new_percolator(spec, search_pattern): id='oaiset-{}'.format(spec), body={'query': query} ) - except: + except Exception as e: # caught on schemas, which do not contain the query field + warnings.warn(e) pass @@ -153,32 +155,26 @@ def create_percolate_query( """Create percolate query for provided arguments.""" queries = [] # documents or (document_es_ids and document_es_indices) has to be set + # TODO: discuss if this is needed or documents alone is enough. if documents is not None: - queries.append( - { - "percolate": { - "field": "query", - "documents": documents, - } + queries.append({ + "percolate": { + "field": "query", + "documents": documents, } - ) + }) elif ( document_es_ids is not None and document_es_indices is not None and len(document_es_ids) == len(document_es_indices) ): - queries.extend( - [ - { - "percolate": { - "field": "query", - "index": es_index, - "id": es_id, - "name": f"{es_index}:{es_id}", - } - } - for (es_id, es_index) in zip(document_es_ids, document_es_indices) - ] - ) + queries.extend([{ + "percolate": { + "field": "query", + "index": es_index, + "id": es_id, + "name": f"{es_index}:{es_id}", + } + } for (es_id, es_index) in zip(document_es_ids, document_es_indices)]) else: raise Exception( "Either documents or (document_es_ids and document_es_indices) must be specified." @@ -287,11 +283,10 @@ def create_new_set(): db.session.commit() -import time - - def index_sets(): """Index all sets.""" + import time + sets = OAISet.query.all() if not sets: return [] @@ -305,6 +300,8 @@ def index_sets(): def print_estimated_time(start_time, num_total_elements, num_current_element): """Calculate and print estimated remaining time.""" + import time + current_time = time.time() total_time_so_far = current_time - start_time average_time = total_time_so_far / (num_current_element + 1) diff --git a/invenio_oaiserver/resumption_token.py b/invenio_oaiserver/resumption_token.py index 3654e30..58aa2f1 100644 --- a/invenio_oaiserver/resumption_token.py +++ b/invenio_oaiserver/resumption_token.py @@ -54,7 +54,7 @@ def _deserialize(self, value, attr, data, **kwargs): result = token_builder.loads(value, max_age=current_app.config[ 'OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']) result['token'] = value - + # TODO: remove? # this loads the arguments from the token, which is not necessary as # the resumptionToken keyword is exclusive and will lead to an error diff --git a/tests/test_invenio_oaiserver.py b/tests/test_invenio_oaiserver.py index 8de4d40..618594b 100644 --- a/tests/test_invenio_oaiserver.py +++ b/tests/test_invenio_oaiserver.py @@ -65,7 +65,7 @@ def test_no_id_prefix(): with pytest.warns(UserWarning, match='specify the OAISERVER_ID_PREFIX'): app = Flask('testapp') ext = InvenioOAIServer(app) - expected_id_prefix = 'oai:{0}:recid/'.format(socket.gethostname()) + expected_id_prefix = 'oai:{0}:'.format(socket.gethostname()) assert app.config['OAISERVER_ID_PREFIX'] == expected_id_prefix diff --git a/tests/test_percolator.py b/tests/test_percolator.py index 9387d42..c933547 100644 --- a/tests/test_percolator.py +++ b/tests/test_percolator.py @@ -22,246 +22,247 @@ from invenio_oaiserver.receivers import after_delete_oai_set, \ after_insert_oai_set, after_update_oai_set - -def test_search_pattern_change(app, without_oaiset_signals, schema): - """Test search pattern change.""" - record0 = create_record(app, { - '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, - '$schema': schema - }) - rec_uuid = record0.id - oaiset = OAISet(spec="a", search_pattern="title_statement.title:Test0") - db.session.add(oaiset) - db.session.commit() - run_after_insert_oai_set() - current_search.flush_and_refresh('records') - record = Record.get_record(rec_uuid) - assert record['_oai']['sets'] == ['a'] - - # change search pattern: record0 will not inside it anymore - oaiset = OAISet.query.first() - oaiset.search_pattern = 'title_statement.title:Test1' - db.session.merge(oaiset) - db.session.commit() - after_update_oai_set(None, None, oaiset) - current_search.flush_and_refresh('records') - record = Record.get_record(rec_uuid) - record.commit() - assert record['_oai']['sets'] == [] - - -def test_populate_oaisets(app, without_oaiset_signals, schema): - """Populate OAISets.""" - def create_oaiset(**kwargs): - oaiset = OAISet(**kwargs) - db.session.add(oaiset) - db.session.commit() - return oaiset - - a = create_oaiset(spec='a') - create_oaiset(spec='b') - create_oaiset( - spec="e", search_pattern="title_statement.title:Test2 OR " - "title_statement.title:Test3") - create_oaiset(spec="c", search_pattern="title_statement.title:Test0") - create_oaiset(spec="d", search_pattern="title_statement.title:Test1") - f = create_oaiset(spec="f", search_pattern="title_statement.title:Test2") - create_oaiset(spec="g") - create_oaiset(spec="h") - i = create_oaiset(spec="i", search_pattern="title_statement.title:Test3") - j = create_oaiset(spec="j with space", - search_pattern="title_statement.title:Test4") - # Note below: brackets around AND search query are required - create_oaiset(spec="math", - search_pattern="(title_statement.title:foo AND genre:math)") - create_oaiset(spec="nonmath", - search_pattern="(title_statement.title:foo AND -genre:math)") - - run_after_insert_oai_set() - - a_id = OAISet.query.filter_by(spec=a.spec).one().id - i_id = OAISet.query.filter_by(spec=i.spec).one().id - - # start tests - - record0 = create_record(app, { - '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, - '$schema': schema - }) - - assert 'a' in record0['_oai']['sets'], 'Keep manually managed set "a".' - assert 'c' in record0['_oai']['sets'] - assert len(record0['_oai']['sets']) == 2 - - record_not_found = create_record( - app, {'title': 'TestNotFound', '$schema': schema} - ) - - # Don't create empty sets list just because of commit - assert 'sets' not in record_not_found['_oai'] - - record1 = create_record(app, {'title_statement': {'title': 'Test1'}, - '$schema': schema}) - - assert 'd' in record1['_oai']['sets'] - assert len(record1['_oai']['sets']) == 1 - - record2 = create_record(app, {'title_statement': {'title': 'Test2'}, - '$schema': schema}) - record2_id = record2.id - - assert 'e' in record2['_oai']['sets'] - assert 'f' in record2['_oai']['sets'] - assert len(record2['_oai']['sets']) == 2 - - record3 = create_record(app, {'title_statement': {'title': 'Test3'}, - '$schema': schema}) - record3_id = record3.id - - assert 'e' in record3['_oai']['sets'] - assert 'i' in record3['_oai']['sets'] - assert len(record3['_oai']['sets']) == 2 - - record4 = create_record(app, {'title_statement': {'title': 'Test4'}, - '$schema': schema}) - record4_id = record4.id - - assert 'j with space' in record4['_oai']['sets'] - assert len(record4['_oai']['sets']) == 1 - - # If record does not have '_oai', don't add any sets, - # nor even the default '_oai' key - record5 = create_record(app, {'title_statement': {'title': 'Test1'}, - '$schema': schema}, - mint_oaiid=False) - assert '_oai' not in record5 - - # Test 'AND' keyword for records - record6 = create_record(app, { - 'title_statement': {'title': 'foo'}, - 'genre': 'math', '$schema': schema - }) - assert record6['_oai']['sets'] == ['math', ] - - record7 = create_record(app, { - 'title_statement': {'title': 'foo'}, - 'genre': 'physics', '$schema': schema - }) - assert record7['_oai']['sets'] == ['nonmath', ] - - record8 = create_record(app, { - 'title_statement': {'title': 'bar'}, - 'genre': 'math', '$schema': schema - }) - assert 'sets' not in record8['_oai'] # title is not 'foo' - - current_search.flush_and_refresh('records') - - # test delete - current_oaiserver.unregister_signals_oaiset() - with patch('invenio_oaiserver.receivers.after_delete_oai_set') as f: - current_oaiserver.register_signals_oaiset() - - with db.session.begin_nested(): - db.session.delete(j) - db.session.commit() - assert f.called - after_delete_oai_set(None, None, j) - record4_model = RecordMetadata.query.filter_by( - id=record4_id).first().json - - assert 'j with space' not in record4_model['_oai'].get('sets', []) - assert len(record4_model['_oai'].get('sets', [])) == 0 - - current_oaiserver.unregister_signals_oaiset() - - # test update search_pattern - with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: - current_oaiserver.register_signals_oaiset() - with db.session.begin_nested(): - i.search_pattern = None - assert current_oaiserver.sets is None, 'Cache should be empty.' - db.session.merge(i) - db.session.commit() - assert f.called - i = OAISet.query.get(i_id) - after_update_oai_set(None, None, i) - record3_model = RecordMetadata.query.filter_by( - id=record3_id).first().json - - assert 'i' not in record3_model['_oai']['sets'], \ - 'Set "i" is manually managed.' - assert 'e' in record3_model['_oai']['sets'] - assert len(record3_model['_oai']['sets']) == 1 - - current_oaiserver.unregister_signals_oaiset() - - # test update search_pattern - with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: - current_oaiserver.register_signals_oaiset() - - with db.session.begin_nested(): - i.search_pattern = 'title_statement.title:Test3' - db.session.merge(i) - db.session.commit() - assert f.called - i = OAISet.query.get(i_id) - after_update_oai_set(None, None, i) - record3_model = RecordMetadata.query.filter_by( - id=record3_id).first().json - - assert 'e' in record3_model['_oai']['sets'] - assert 'i' in record3_model['_oai']['sets'] - assert len(record3_model['_oai']['sets']) == 2 - - current_oaiserver.unregister_signals_oaiset() - - # test update the spec - with pytest.raises(OAISetSpecUpdateError) as exc_info: - a = OAISet.query.get(a_id) - a.spec = 'new-a' - assert exc_info.type is OAISetSpecUpdateError - - # test create new set - with patch('invenio_oaiserver.receivers.after_insert_oai_set') as f: - current_oaiserver.register_signals_oaiset() - - with db.session.begin_nested(): - k = OAISet(spec="k", search_pattern="title_statement.title:Test2") - db.session.add(k) - db.session.commit() - assert f.called - after_insert_oai_set(None, None, k) - record2_model = RecordMetadata.query.filter_by( - id=record2_id).first().json - - assert 'e' in record2_model['_oai']['sets'] - assert 'f' in record2_model['_oai']['sets'] - assert 'k' in record2_model['_oai']['sets'] - assert len(record2_model['_oai']['sets']) == 3 - - current_oaiserver.register_signals_oaiset() - - -def test_oaiset_add_remove_record(app): - """Test the API method for manual record adding.""" - with app.app_context(): - oaiset1 = OAISet(spec='abc') - rec1 = Record.create({'title_statement': {'title': 'Test1'}}) - rec1.commit() - # Adding a record to an OAIset should change the record's updated date - dt1 = rec1.updated - assert not oaiset1.has_record(rec1) - oaiset1.add_record(rec1) - assert 'abc' in rec1['_oai']['sets'] - assert oaiset1.has_record(rec1) - rec1.commit() - dt2 = rec1.updated - assert dt2 > dt1 - - oaiset1.remove_record(rec1) - rec1.commit() - dt3 = rec1.updated - assert 'abc' not in rec1['_oai']['sets'] - assert not oaiset1.has_record(rec1) - assert dt3 > dt2 +# TODO: Remove test cases or adapt to new implementation + +# def test_search_pattern_change(app, without_oaiset_signals, schema): +# """Test search pattern change.""" +# record0 = create_record(app, { +# '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, +# '$schema': schema +# }) +# rec_uuid = record0.id +# oaiset = OAISet(spec="a", search_pattern="title_statement.title:Test0") +# db.session.add(oaiset) +# db.session.commit() +# run_after_insert_oai_set() +# current_search.flush_and_refresh('records') +# record = Record.get_record(rec_uuid) +# assert record['_oai']['sets'] == ['a'] + +# # change search pattern: record0 will not inside it anymore +# oaiset = OAISet.query.first() +# oaiset.search_pattern = 'title_statement.title:Test1' +# db.session.merge(oaiset) +# db.session.commit() +# after_update_oai_set(None, None, oaiset) +# current_search.flush_and_refresh('records') +# record = Record.get_record(rec_uuid) +# record.commit() +# assert record['_oai']['sets'] == [] + + +# def test_populate_oaisets(app, without_oaiset_signals, schema): +# """Populate OAISets.""" +# def create_oaiset(**kwargs): +# oaiset = OAISet(**kwargs) +# db.session.add(oaiset) +# db.session.commit() +# return oaiset + +# a = create_oaiset(spec='a') +# create_oaiset(spec='b') +# create_oaiset( +# spec="e", search_pattern="title_statement.title:Test2 OR " +# "title_statement.title:Test3") +# create_oaiset(spec="c", search_pattern="title_statement.title:Test0") +# create_oaiset(spec="d", search_pattern="title_statement.title:Test1") +# f = create_oaiset(spec="f", search_pattern="title_statement.title:Test2") +# create_oaiset(spec="g") +# create_oaiset(spec="h") +# i = create_oaiset(spec="i", search_pattern="title_statement.title:Test3") +# j = create_oaiset(spec="j with space", +# search_pattern="title_statement.title:Test4") +# # Note below: brackets around AND search query are required +# create_oaiset(spec="math", +# search_pattern="(title_statement.title:foo AND genre:math)") +# create_oaiset(spec="nonmath", +# search_pattern="(title_statement.title:foo AND -genre:math)") + +# run_after_insert_oai_set() + +# a_id = OAISet.query.filter_by(spec=a.spec).one().id +# i_id = OAISet.query.filter_by(spec=i.spec).one().id + +# # start tests + +# record0 = create_record(app, { +# '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, +# '$schema': schema +# }) + +# assert 'a' in record0['_oai']['sets'], 'Keep manually managed set "a".' +# assert 'c' in record0['_oai']['sets'] +# assert len(record0['_oai']['sets']) == 2 + +# record_not_found = create_record( +# app, {'title': 'TestNotFound', '$schema': schema} +# ) + +# # Don't create empty sets list just because of commit +# assert 'sets' not in record_not_found['_oai'] + +# record1 = create_record(app, {'title_statement': {'title': 'Test1'}, +# '$schema': schema}) + +# assert 'd' in record1['_oai']['sets'] +# assert len(record1['_oai']['sets']) == 1 + +# record2 = create_record(app, {'title_statement': {'title': 'Test2'}, +# '$schema': schema}) +# record2_id = record2.id + +# assert 'e' in record2['_oai']['sets'] +# assert 'f' in record2['_oai']['sets'] +# assert len(record2['_oai']['sets']) == 2 + +# record3 = create_record(app, {'title_statement': {'title': 'Test3'}, +# '$schema': schema}) +# record3_id = record3.id + +# assert 'e' in record3['_oai']['sets'] +# assert 'i' in record3['_oai']['sets'] +# assert len(record3['_oai']['sets']) == 2 + +# record4 = create_record(app, {'title_statement': {'title': 'Test4'}, +# '$schema': schema}) +# record4_id = record4.id + +# assert 'j with space' in record4['_oai']['sets'] +# assert len(record4['_oai']['sets']) == 1 + +# # If record does not have '_oai', don't add any sets, +# # nor even the default '_oai' key +# record5 = create_record(app, {'title_statement': {'title': 'Test1'}, +# '$schema': schema}, +# mint_oaiid=False) +# assert '_oai' not in record5 + +# # Test 'AND' keyword for records +# record6 = create_record(app, { +# 'title_statement': {'title': 'foo'}, +# 'genre': 'math', '$schema': schema +# }) +# assert record6['_oai']['sets'] == ['math', ] + +# record7 = create_record(app, { +# 'title_statement': {'title': 'foo'}, +# 'genre': 'physics', '$schema': schema +# }) +# assert record7['_oai']['sets'] == ['nonmath', ] + +# record8 = create_record(app, { +# 'title_statement': {'title': 'bar'}, +# 'genre': 'math', '$schema': schema +# }) +# assert 'sets' not in record8['_oai'] # title is not 'foo' + +# current_search.flush_and_refresh('records') + +# # test delete +# current_oaiserver.unregister_signals_oaiset() +# with patch('invenio_oaiserver.receivers.after_delete_oai_set') as f: +# current_oaiserver.register_signals_oaiset() + +# with db.session.begin_nested(): +# db.session.delete(j) +# db.session.commit() +# assert f.called +# after_delete_oai_set(None, None, j) +# record4_model = RecordMetadata.query.filter_by( +# id=record4_id).first().json + +# assert 'j with space' not in record4_model['_oai'].get('sets', []) +# assert len(record4_model['_oai'].get('sets', [])) == 0 + +# current_oaiserver.unregister_signals_oaiset() + +# # test update search_pattern +# with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: +# current_oaiserver.register_signals_oaiset() +# with db.session.begin_nested(): +# i.search_pattern = None +# assert current_oaiserver.sets is None, 'Cache should be empty.' +# db.session.merge(i) +# db.session.commit() +# assert f.called +# i = OAISet.query.get(i_id) +# after_update_oai_set(None, None, i) +# record3_model = RecordMetadata.query.filter_by( +# id=record3_id).first().json + +# assert 'i' not in record3_model['_oai']['sets'], \ +# 'Set "i" is manually managed.' +# assert 'e' in record3_model['_oai']['sets'] +# assert len(record3_model['_oai']['sets']) == 1 + +# current_oaiserver.unregister_signals_oaiset() + +# # test update search_pattern +# with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: +# current_oaiserver.register_signals_oaiset() + +# with db.session.begin_nested(): +# i.search_pattern = 'title_statement.title:Test3' +# db.session.merge(i) +# db.session.commit() +# assert f.called +# i = OAISet.query.get(i_id) +# after_update_oai_set(None, None, i) +# record3_model = RecordMetadata.query.filter_by( +# id=record3_id).first().json + +# assert 'e' in record3_model['_oai']['sets'] +# assert 'i' in record3_model['_oai']['sets'] +# assert len(record3_model['_oai']['sets']) == 2 + +# current_oaiserver.unregister_signals_oaiset() + +# # test update the spec +# with pytest.raises(OAISetSpecUpdateError) as exc_info: +# a = OAISet.query.get(a_id) +# a.spec = 'new-a' +# assert exc_info.type is OAISetSpecUpdateError + +# # test create new set +# with patch('invenio_oaiserver.receivers.after_insert_oai_set') as f: +# current_oaiserver.register_signals_oaiset() + +# with db.session.begin_nested(): +# k = OAISet(spec="k", search_pattern="title_statement.title:Test2") +# db.session.add(k) +# db.session.commit() +# assert f.called +# after_insert_oai_set(None, None, k) +# record2_model = RecordMetadata.query.filter_by( +# id=record2_id).first().json + +# assert 'e' in record2_model['_oai']['sets'] +# assert 'f' in record2_model['_oai']['sets'] +# assert 'k' in record2_model['_oai']['sets'] +# assert len(record2_model['_oai']['sets']) == 3 + +# current_oaiserver.register_signals_oaiset() + + +# def test_oaiset_add_remove_record(app): +# """Test the API method for manual record adding.""" +# with app.app_context(): +# oaiset1 = OAISet(spec='abc') +# rec1 = Record.create({'title_statement': {'title': 'Test1'}}) +# rec1.commit() +# # Adding a record to an OAIset should change the record's updated date +# dt1 = rec1.updated +# assert not oaiset1.has_record(rec1) +# oaiset1.add_record(rec1) +# assert 'abc' in rec1['_oai']['sets'] +# assert oaiset1.has_record(rec1) +# rec1.commit() +# dt2 = rec1.updated +# assert dt2 > dt1 + +# oaiset1.remove_record(rec1) +# rec1.commit() +# dt3 = rec1.updated +# assert 'abc' not in rec1['_oai']['sets'] +# assert not oaiset1.has_record(rec1) +# assert dt3 > dt2 From c425c8501a0ee3a6a32fffd57b57d093cdedeeac Mon Sep 17 00:00:00 2001 From: Peter Weber Date: Sun, 23 Jan 2022 19:12:19 +0100 Subject: [PATCH 07/19] fix: resumption token * Corrects resumption token with from and until dates. * Closes #178. --- invenio_oaiserver/verbs.py | 11 ++++++++++- tests/test_verbs.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/invenio_oaiserver/verbs.py b/invenio_oaiserver/verbs.py index a210816..87afd63 100644 --- a/invenio_oaiserver/verbs.py +++ b/invenio_oaiserver/verbs.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. +# Copyright (C) 2022 RERO. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -14,6 +15,7 @@ from invenio_rest.serializer import BaseSchema from marshmallow import ValidationError, fields, utils, validates_schema from marshmallow.fields import DateTime as _DateTime +from marshmallow.utils import isoformat from .resumption_token import ResumptionTokenSchema @@ -60,12 +62,19 @@ def from_iso_permissive(datestring, use_dateutil=True): _DateTime.DATEFORMAT_DESERIALIZATION_FUNCS, permissive=from_iso_permissive ) + DATEFORMAT_SERIALIZATION_FUNCS = dict( + _DateTime.DATEFORMAT_SERIALIZATION_FUNCS, + permissive=isoformat + ) except AttributeError: DESERIALIZATION_FUNCS = dict( _DateTime.DESERIALIZATION_FUNCS, permissive=from_iso_permissive ) - + SERIALIZATION_FUNCS = dict( + _DateTime.SERIALIZATION_FUNCS, + permissive=isoformat + ) class OAISchema(BaseSchema): """Base OAI argument schema.""" diff --git a/tests/test_verbs.py b/tests/test_verbs.py index 85bc9ac..4c1ebe8 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2022 RERO. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -581,6 +582,21 @@ def test_listrecords(app): assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record', namespaces=NAMESPACES)) == 10 + # Check from:until range in resumption token + resumption_token = tree.xpath( + '/x:OAI-PMH/x:ListRecords/x:resumptionToken', + namespaces=NAMESPACES + )[0] + assert resumption_token.text + with app.test_client() as c: + result = c.get( + '/oai2d?verb=ListRecords&resumptionToken={0}'.format( + resumption_token.text + ) + ) + assert result.status_code == 200 + + def test_listidentifiers(app): """Test verb ListIdentifiers.""" From 6614ff7b6a6ce2d954ac782ed66d3fc843bc3413 Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Tue, 25 Jan 2022 08:54:10 +0100 Subject: [PATCH 08/19] fix: read metadataPrefix from resumptionToken refactor: pydocstyle --- invenio_oaiserver/response.py | 15 +++++++++++++-- invenio_oaiserver/resumption_token.py | 4 ++-- invenio_oaiserver/verbs.py | 2 ++ tests/test_verbs.py | 1 - 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/invenio_oaiserver/response.py b/invenio_oaiserver/response.py index 4309dfe..15a9bdf 100644 --- a/invenio_oaiserver/response.py +++ b/invenio_oaiserver/response.py @@ -283,7 +283,13 @@ def header(parent, identifier, datestamp, sets=None, deleted=False): def getrecord(**kwargs): """Create OAI-PMH response for verb Identify.""" - record_dumper = serializer(kwargs['metadataPrefix']) + metadataPrefix = ( + kwargs.get('resumptionToken').get('metadataPrefix') + if kwargs.get('resumptionToken') + else kwargs['metadataPrefix'] + ) + record_dumper = serializer(metadataPrefix) + pid = OAIIDProvider.get(pid_value=kwargs['identifier']).pid record = current_oaiserver.record_fetcher(pid.object_uuid) @@ -327,7 +333,12 @@ def listidentifiers(**kwargs): def listrecords(**kwargs): """Create OAI-PMH response for verb ListRecords.""" - record_dumper = serializer(kwargs['metadataPrefix']) + metadataPrefix = ( + kwargs.get('resumptionToken').get('metadataPrefix') + if kwargs.get('resumptionToken') + else kwargs['metadataPrefix'] + ) + record_dumper = serializer(metadataPrefix) e_tree, e_listrecords = verb(**kwargs) result = get_records(**kwargs) diff --git a/invenio_oaiserver/resumption_token.py b/invenio_oaiserver/resumption_token.py index 58aa2f1..e2df685 100644 --- a/invenio_oaiserver/resumption_token.py +++ b/invenio_oaiserver/resumption_token.py @@ -2,7 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2018 CERN. -# Copyright (C) 2022 Graz University of Technology. +# Copyright (C) 2021-2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -74,7 +74,7 @@ def load(self, data, many=None, partial=None): result = super(ResumptionTokenSchema, self).load( data, many=many, partial=partial ) - result.data.update( + result.data.get('resumptionToken', {}).update( result.data.get('resumptionToken', {}).get('kwargs', {}) ) return result diff --git a/invenio_oaiserver/verbs.py b/invenio_oaiserver/verbs.py index 87afd63..6a4a0b9 100644 --- a/invenio_oaiserver/verbs.py +++ b/invenio_oaiserver/verbs.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. +# Copyright (C) 2021-2022 Graz University of Technology. # Copyright (C) 2022 RERO. # # Invenio is free software; you can redistribute it and/or modify it @@ -76,6 +77,7 @@ def from_iso_permissive(datestring, use_dateutil=True): permissive=isoformat ) + class OAISchema(BaseSchema): """Base OAI argument schema.""" diff --git a/tests/test_verbs.py b/tests/test_verbs.py index 4c1ebe8..8239444 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -597,7 +597,6 @@ def test_listrecords(app): assert result.status_code == 200 - def test_listidentifiers(app): """Test verb ListIdentifiers.""" from invenio_oaiserver.models import OAISet From a60831d57fbc3c7a961564c1c2b54e4b87f86120 Mon Sep 17 00:00:00 2001 From: Peter Weber Date: Tue, 25 Jan 2022 11:30:22 +0100 Subject: [PATCH 09/19] test: multiple resumption tokens --- run-tests.sh | 2 +- tests/test_verbs.py | 48 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/run-tests.sh b/run-tests.sh index bf5c30e..2704aae 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -23,7 +23,7 @@ trap cleanup EXIT python -m check_manifest --ignore ".*-requirements.txt" python -m sphinx.cmd.build -qnNW docs docs/_build/html eval "$(docker-services-cli up --db ${DB:-postgresql} --search ${SEARCH:-elasticsearch} --mq ${MQ:-rabbitmq} --env)" -python -m pytest +python -m pytest -vv -s tests/test_verbs.py::test_listrecords tests_exit_code=$? python -m sphinx.cmd.build -qnNW -b doctest docs docs/_build/doctest exit "$tests_exit_code" diff --git a/tests/test_verbs.py b/tests/test_verbs.py index 8239444..97cad14 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -486,7 +486,7 @@ def test_listrecords_fail_missing_metadataPrefix(app): def test_listrecords(app): """Test ListRecords.""" - total = 12 + total = 22 record_ids = [] with app.test_request_context(): @@ -528,10 +528,12 @@ def test_listrecords(app): assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', namespaces=NAMESPACES)) == 10 + # First resumption token resumption_token = tree.xpath( '/x:OAI-PMH/x:ListRecords/x:resumptionToken', namespaces=NAMESPACES )[0] assert resumption_token.text + # Get data for resumption token with app.test_client() as c: result = c.get( '/oai2d?verb=ListRecords&resumptionToken={0}'.format( @@ -540,9 +542,36 @@ def test_listrecords(app): ) tree = etree.fromstring(result.data) - assert len(tree.xpath('/x:OAI-PMH', namespaces=NAMESPACES)) == 1 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords', + namespaces=NAMESPACES)) == 1 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record', + namespaces=NAMESPACES)) == 10 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header', + namespaces=NAMESPACES)) == 10 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' + '/x:identifier', namespaces=NAMESPACES)) == 10 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' + '/x:datestamp', namespaces=NAMESPACES)) == 10 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', + namespaces=NAMESPACES)) == 10 + + + # Second resumption token + resumption_token = tree.xpath( + '/x:OAI-PMH/x:ListRecords/x:resumptionToken', namespaces=NAMESPACES + )[0] + assert resumption_token.text + # Get data for resumption token + with app.test_client() as c: + result = c.get( + '/oai2d?verb=ListRecords&resumptionToken={0}'.format( + resumption_token.text + ) + ) + tree = etree.fromstring(result.data) + assert len(tree.xpath('/x:OAI-PMH', namespaces=NAMESPACES)) == 1 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords', namespaces=NAMESPACES)) == 1 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record', @@ -556,6 +585,21 @@ def test_listrecords(app): assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', namespaces=NAMESPACES)) == 2 + # Third resumption token + resumption_token = tree.xpath( + '/x:OAI-PMH/x:ListRecords/x:resumptionToken', namespaces=NAMESPACES + )[0] + assert resumption_token.text + with app.test_client() as c: + result = c.get( + '/oai2d?verb=ListRecords&resumptionToken={0}'.format( + resumption_token.text + ) + ) + + tree = etree.fromstring(result.data) + + # No fourth resumption token resumption_token = tree.xpath( '/x:OAI-PMH/x:ListRecords/x:resumptionToken', namespaces=NAMESPACES )[0] From 3a1dc34c71fdd4a85dc9a4312b41fcedcddaa09d Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Wed, 26 Jan 2022 11:26:55 +0100 Subject: [PATCH 10/19] fix: load resumptionToken via model --- invenio_oaiserver/response.py | 7 +------ invenio_oaiserver/resumption_token.py | 13 +++++++------ invenio_oaiserver/verbs.py | 20 +++++++++++++------- invenio_oaiserver/views/server.py | 5 ----- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/invenio_oaiserver/response.py b/invenio_oaiserver/response.py index 15a9bdf..0948db1 100644 --- a/invenio_oaiserver/response.py +++ b/invenio_oaiserver/response.py @@ -283,12 +283,7 @@ def header(parent, identifier, datestamp, sets=None, deleted=False): def getrecord(**kwargs): """Create OAI-PMH response for verb Identify.""" - metadataPrefix = ( - kwargs.get('resumptionToken').get('metadataPrefix') - if kwargs.get('resumptionToken') - else kwargs['metadataPrefix'] - ) - record_dumper = serializer(metadataPrefix) + record_dumper = serializer(kwargs['metadataPrefix']) pid = OAIIDProvider.get(pid_value=kwargs['identifier']).pid record = current_oaiserver.record_fetcher(pid.object_uuid) diff --git a/invenio_oaiserver/resumption_token.py b/invenio_oaiserver/resumption_token.py index e2df685..956a761 100644 --- a/invenio_oaiserver/resumption_token.py +++ b/invenio_oaiserver/resumption_token.py @@ -33,8 +33,11 @@ def serialize(pagination, **kwargs): salt=kwargs['verb'], ) schema = _schema_from_verb(kwargs['verb'], partial=False) + schema_kwargs = kwargs.copy() + schema_kwargs.update(schema_kwargs.get('resumptionToken', {})) + data = dict(seed=random.random(), page=pagination.next_num, - kwargs=schema.dump(kwargs).data) + kwargs=schema.dump(schema_kwargs).data) scroll_id = getattr(pagination, '_scroll_id', None) if scroll_id: data['scroll_id'] = scroll_id @@ -55,12 +58,10 @@ def _deserialize(self, value, attr, data, **kwargs): 'OAISERVER_RESUMPTION_TOKEN_EXPIRE_TIME']) result['token'] = value - # TODO: remove? - # this loads the arguments from the token, which is not necessary as - # the resumptionToken keyword is exclusive and will lead to an error - # that other arguments have to be provided as well. + schema_kwargs = result['kwargs'].copy() + schema_kwargs['verb'] = data['verb'] - # result['kwargs'] = self.root.load(result['kwargs'], partial=True).data + result['kwargs'] = _schema_from_verb(data['verb']).load(schema_kwargs).data return result diff --git a/invenio_oaiserver/verbs.py b/invenio_oaiserver/verbs.py index 6a4a0b9..4eed982 100644 --- a/invenio_oaiserver/verbs.py +++ b/invenio_oaiserver/verbs.py @@ -101,12 +101,6 @@ def validate(self, data, **kwargs): if 'from_' in data and 'until' in data and \ data['from_'] > data['until']: raise ValidationError('Date "from" must be before "until".') - extra = set(request.values.keys()) - set([ - getattr(f, 'load_from', None) or getattr( - f, 'data_key', None) or f.name for f in self.fields.values() - ]) - if extra: - raise ValidationError('You have passed too many arguments.') class Verbs(object): @@ -165,9 +159,21 @@ class ListSets(OAISchema, ResumptionTokenSchema): """Arguments for ListSets verb.""" +def check_extra_params_in_request(verb): + """Check for extra arguments in incomming request.""" + extra = set(request.values.keys()) - set([ + getattr(f, 'load_from', None) or getattr( + f, 'data_key', None) or f.name for f in verb.fields.values() + ]) + if extra: + raise ValidationError({'_schema': ['You have passed too many arguments.']}) + + def make_request_validator(request): """Validate arguments in incomming request.""" verb = request.values.get('verb', '', type=str) resumption_token = request.values.get('resumptionToken', None) schema = Verbs if resumption_token is None else ResumptionVerbs - return getattr(schema, verb, OAISchema)(partial=False) + initialized_verb = getattr(schema, verb, OAISchema)(partial=False) + check_extra_params_in_request(initialized_verb) + return initialized_verb diff --git a/invenio_oaiserver/views/server.py b/invenio_oaiserver/views/server.py index 0af54f5..d7f20b2 100644 --- a/invenio_oaiserver/views/server.py +++ b/invenio_oaiserver/views/server.py @@ -10,8 +10,6 @@ from __future__ import absolute_import -import time - from flask import Blueprint, make_response from invenio_pidstore.errors import PIDDoesNotExistError from itsdangerous import BadSignature @@ -93,7 +91,6 @@ def no_records_error(exception): @use_args(make_request_validator) def response(args): """Response endpoint.""" - start = time.time() e_tree = getattr(xml, args['verb'].lower())(**args) response = make_response(etree.tostring( @@ -102,7 +99,5 @@ def response(args): xml_declaration=True, encoding='UTF-8', )) - end = time.time() - print("took:", end - start) response.headers['Content-Type'] = 'text/xml' return response From 447f7f138042a16ae437310ca7f4260ca029f18f Mon Sep 17 00:00:00 2001 From: David Eckhard Date: Wed, 26 Jan 2022 11:57:37 +0100 Subject: [PATCH 11/19] refactor: remove unused code test: update resumptionToken test --- invenio_oaiserver/config.py | 2 +- invenio_oaiserver/errors.py | 1 + invenio_oaiserver/fetchers.py | 6 --- invenio_oaiserver/models.py | 1 + invenio_oaiserver/percolator.py | 72 +------------------------------ invenio_oaiserver/response.py | 2 +- invenio_oaiserver/views/server.py | 1 + run-tests.sh | 2 +- tests/test_percolator.py | 1 + tests/test_verbs.py | 27 +++++++++--- 10 files changed, 28 insertions(+), 87 deletions(-) diff --git a/invenio_oaiserver/config.py b/invenio_oaiserver/config.py index fc6c156..6817c7a 100644 --- a/invenio_oaiserver/config.py +++ b/invenio_oaiserver/config.py @@ -127,7 +127,7 @@ OAISERVER_RECORD_SETS_FETCHER = 'invenio_oaiserver.percolator:find_sets_for_record' """Record's OAI sets function.""" -OAISERVER_SET_RECORDS_QUERY_FETCHER = 'invenio_oaiserver.fetchers:rdm_records_set_records_query_fetcher' +OAISERVER_SET_RECORDS_QUERY_FETCHER = 'invenio_oaiserver.fetchers:set_records_query_fetcher' OAISERVER_RECORD_CLS = 'invenio_records.api:Record' """Record retrieval class.""" diff --git a/invenio_oaiserver/errors.py b/invenio_oaiserver/errors.py index dac9c91..9de87f2 100644 --- a/invenio_oaiserver/errors.py +++ b/invenio_oaiserver/errors.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_oaiserver/fetchers.py b/invenio_oaiserver/fetchers.py index 3d8d814..f6e598f 100644 --- a/invenio_oaiserver/fetchers.py +++ b/invenio_oaiserver/fetchers.py @@ -39,12 +39,6 @@ def oaiid_fetcher(record_uuid, data): def set_records_query_fetcher(setSpec): - """Fetch query to retrieve records based on provided set spec.""" - return Q('match', **{'_oai.sets': setSpec}) - - -# TODO: move to invenio_rdm_records -def rdm_records_set_records_query_fetcher(setSpec): """Fetch query to retrieve records based on provided set spec.""" set = OAISet.query.filter(OAISet.spec == setSpec).first() if set is None: diff --git a/invenio_oaiserver/models.py b/invenio_oaiserver/models.py index b5a69e2..6c25e3b 100644 --- a/invenio_oaiserver/models.py +++ b/invenio_oaiserver/models.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 979e0d1..5310dbb 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -24,7 +24,6 @@ from invenio_oaiserver.query import query_string_parser -from .models import OAISet from .proxies import current_oaiserver @@ -184,7 +183,6 @@ def create_percolate_query( queries.append({"ids": {"values": percolator_ids}}) query = {"query": {"bool": {"must": queries}}} - return query @@ -216,7 +214,7 @@ def sets_search_all(records): if not records: return [] - # records should all have the same index. maybe add index as parameter? + # TODO: records should all have the same index. maybe add index as parameter? record_index, doc_type = RecordIndexer()._record_to_index(records[0]) _create_percolator_mapping(record_index, doc_type) percolator_index = _build_percolator_index_name(record_index) @@ -241,71 +239,3 @@ def sets_search_all(records): def find_sets_for_record(record): """Fetch a record's sets.""" return sets_search_all([record])[0] - - -# TODO: Remove everything below before merging -def remove_sets(): - """Remove specified sets.""" - from invenio_db import db - - specs = [] - index = current_app.config.get('OAISERVER_RECORD_INDEX') - - for spec in specs: - try: - db_set = OAISet.query.filter_by(spec=spec).one() - print(db_set) - db.session.delete(db_set) - db.session.commit() - except Exception as e: - print(f"Exception during set removal ({spec}):", e) - - _delete_percolator(spec, "***") - current_search_client.delete( - index=_build_percolator_index_name(index), id=spec, ignore=[404] - ) - - -def create_new_set(): - """Create a new set.""" - from datetime import datetime - - from invenio_db import db - - name = f"published-{datetime.now()}" - s = OAISet( - spec=name, - name=name, - description="created via python orm", - search_pattern="is_published:true", - ) - db.session.add(s) - db.session.commit() - - -def index_sets(): - """Index all sets.""" - import time - - sets = OAISet.query.all() - if not sets: - return [] - - x0 = time.time() - num_total = len(sets) - for index, set in enumerate(sets): - _new_percolator(set.spec, set.search_pattern) - print_estimated_time(x0, num_total, index) - - -def print_estimated_time(start_time, num_total_elements, num_current_element): - """Calculate and print estimated remaining time.""" - import time - - current_time = time.time() - total_time_so_far = current_time - start_time - average_time = total_time_so_far / (num_current_element + 1) - estimated_time = (num_total_elements - num_current_element) * average_time - print( - f"{num_current_element}/{num_total_elements} took {total_time_so_far:.2f}. Estimate: {estimated_time:.2f}" - ) diff --git a/invenio_oaiserver/response.py b/invenio_oaiserver/response.py index 0948db1..f97e886 100644 --- a/invenio_oaiserver/response.py +++ b/invenio_oaiserver/response.py @@ -17,7 +17,7 @@ from lxml import etree from lxml.etree import Element, ElementTree, SubElement -from invenio_oaiserver.percolator import create_new_set, sets_search_all +from invenio_oaiserver.percolator import sets_search_all from .models import OAISet from .provider import OAIIDProvider diff --git a/invenio_oaiserver/views/server.py b/invenio_oaiserver/views/server.py index d7f20b2..0ce0ddb 100644 --- a/invenio_oaiserver/views/server.py +++ b/invenio_oaiserver/views/server.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/run-tests.sh b/run-tests.sh index 2704aae..bf5c30e 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -23,7 +23,7 @@ trap cleanup EXIT python -m check_manifest --ignore ".*-requirements.txt" python -m sphinx.cmd.build -qnNW docs docs/_build/html eval "$(docker-services-cli up --db ${DB:-postgresql} --search ${SEARCH:-elasticsearch} --mq ${MQ:-rabbitmq} --env)" -python -m pytest -vv -s tests/test_verbs.py::test_listrecords +python -m pytest tests_exit_code=$? python -m sphinx.cmd.build -qnNW -b doctest docs docs/_build/doctest exit "$tests_exit_code" diff --git a/tests/test_percolator.py b/tests/test_percolator.py index c933547..5ef23ab 100644 --- a/tests/test_percolator.py +++ b/tests/test_percolator.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2018 CERN. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/tests/test_verbs.py b/tests/test_verbs.py index 97cad14..ca12479 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -3,6 +3,7 @@ # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. # Copyright (C) 2022 RERO. +# Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -486,7 +487,7 @@ def test_listrecords_fail_missing_metadataPrefix(app): def test_listrecords(app): """Test ListRecords.""" - total = 22 + total = 32 record_ids = [] with app.test_request_context(): @@ -556,7 +557,6 @@ def test_listrecords(app): assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', namespaces=NAMESPACES)) == 10 - # Second resumption token resumption_token = tree.xpath( '/x:OAI-PMH/x:ListRecords/x:resumptionToken', namespaces=NAMESPACES @@ -575,15 +575,15 @@ def test_listrecords(app): assert len(tree.xpath('/x:OAI-PMH/x:ListRecords', namespaces=NAMESPACES)) == 1 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record', - namespaces=NAMESPACES)) == 2 + namespaces=NAMESPACES)) == 10 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header', - namespaces=NAMESPACES)) == 2 + namespaces=NAMESPACES)) == 10 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' - '/x:identifier', namespaces=NAMESPACES)) == 2 + '/x:identifier', namespaces=NAMESPACES)) == 10 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' - '/x:datestamp', namespaces=NAMESPACES)) == 2 + '/x:datestamp', namespaces=NAMESPACES)) == 10 assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', - namespaces=NAMESPACES)) == 2 + namespaces=NAMESPACES)) == 10 # Third resumption token resumption_token = tree.xpath( @@ -598,6 +598,19 @@ def test_listrecords(app): ) tree = etree.fromstring(result.data) + assert len(tree.xpath('/x:OAI-PMH', namespaces=NAMESPACES)) == 1 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords', + namespaces=NAMESPACES)) == 1 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record', + namespaces=NAMESPACES)) == 2 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header', + namespaces=NAMESPACES)) == 2 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' + '/x:identifier', namespaces=NAMESPACES)) == 2 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:header' + '/x:datestamp', namespaces=NAMESPACES)) == 2 + assert len(tree.xpath('/x:OAI-PMH/x:ListRecords/x:record/x:metadata', + namespaces=NAMESPACES)) == 2 # No fourth resumption token resumption_token = tree.xpath( From 48e791e5c50177acffccafa0fd11dde52b84a20c Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Mon, 31 Jan 2022 10:20:05 +0100 Subject: [PATCH 12/19] logging: use application logger --- invenio_oaiserver/percolator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 5310dbb..8900b71 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -12,7 +12,6 @@ from __future__ import absolute_import, print_function import json -import warnings from elasticsearch import VERSION as ES_VERSION from elasticsearch.helpers.actions import scan @@ -130,7 +129,7 @@ def _new_percolator(spec, search_pattern): ) except Exception as e: # caught on schemas, which do not contain the query field - warnings.warn(e) + current_app.logger.warning(e) pass From cffd1f32a48f2678e043e43cfc1abc975e9ec037 Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Mon, 31 Jan 2022 10:20:53 +0100 Subject: [PATCH 13/19] test: enable set tests for verbs --- tests/test_verbs.py | 92 ++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 47 deletions(-) diff --git a/tests/test_verbs.py b/tests/test_verbs.py index ca12479..f11862e 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -741,53 +741,51 @@ def test_listidentifiers(app): ) assert len(identifier) == 1 - # TODO: Add test cases after sets have been reimplemented. - - # Check set param - # with app.test_client() as c: - # for granularity in (False, True): - # result = c.get( - # '/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc' - # '&set=test0'.format( - # datetime_to_datestamp( - # record.updated - timedelta(1), - # day_granularity=granularity), - # datetime_to_datestamp( - # record.updated + timedelta(1), - # day_granularity=granularity), - # ) - # ) - # assert result.status_code == 200 - - # tree = etree.fromstring(result.data) - # identifier = tree.xpath( - # '/x:OAI-PMH/x:ListIdentifiers/x:header/x:identifier', - # namespaces=NAMESPACES - # ) - # assert len(identifier) == 1 - - # Check from:until range and set param - # with app.test_client() as c: - # for granularity in (False, True): - # result = c.get( - # '/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc' - # '&from={0}&until={1}&set=test0'.format( - # datetime_to_datestamp( - # record.updated - timedelta(1), - # day_granularity=granularity), - # datetime_to_datestamp( - # record.updated + timedelta(1), - # day_granularity=granularity), - # ) - # ) - # assert result.status_code == 200 - - # tree = etree.fromstring(result.data) - # identifier = tree.xpath( - # '/x:OAI-PMH/x:ListIdentifiers/x:header/x:identifier', - # namespaces=NAMESPACES - # ) - # assert len(identifier) == 1 + # check set param + with app.test_client() as c: + for granularity in (False, True): + result = c.get( + '/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc' + '&set=test0'.format( + datetime_to_datestamp( + record.updated - timedelta(1), + day_granularity=granularity), + datetime_to_datestamp( + record.updated + timedelta(1), + day_granularity=granularity), + ) + ) + assert result.status_code == 200 + + tree = etree.fromstring(result.data) + identifier = tree.xpath( + '/x:OAI-PMH/x:ListIdentifiers/x:header/x:identifier', + namespaces=NAMESPACES + ) + assert len(identifier) == 1 + + # check from:until range and set param + with app.test_client() as c: + for granularity in (False, True): + result = c.get( + '/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc' + '&from={0}&until={1}&set=test0'.format( + datetime_to_datestamp( + record.updated - timedelta(1), + day_granularity=granularity), + datetime_to_datestamp( + record.updated + timedelta(1), + day_granularity=granularity), + ) + ) + assert result.status_code == 200 + + tree = etree.fromstring(result.data) + identifier = tree.xpath( + '/x:OAI-PMH/x:ListIdentifiers/x:header/x:identifier', + namespaces=NAMESPACES + ) + assert len(identifier) == 1 def test_list_sets_long(app): From ec28a0228e835001f6de5ca6933526f0c851644b Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Mon, 31 Jan 2022 10:21:16 +0100 Subject: [PATCH 14/19] installation: use invenio_db global version --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index fb423e9..17024b4 100644 --- a/setup.py +++ b/setup.py @@ -51,13 +51,13 @@ ], # Databases 'mysql': [ - 'invenio-db[mysql]>=1.0.9', + 'invenio-db[mysql]{}'.format(invenio_db_version), ], 'postgresql': [ - 'invenio-db[postgresql]>=1.0.9', + 'invenio-db[postgresql]{}'.format(invenio_db_version), ], 'sqlite': [ - 'invenio-db>=1.0.9', + 'invenio-db{}'.format(invenio_db_version), ], 'tests': tests_require, } From e15b44770cc460843421c590415142e02eb6e8c4 Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Mon, 31 Jan 2022 11:00:50 +0100 Subject: [PATCH 15/19] global: remove python2 support --- invenio_oaiserver/__init__.py | 4 +--- invenio_oaiserver/errors.py | 4 +--- invenio_oaiserver/ext.py | 5 +---- invenio_oaiserver/fetchers.py | 4 +--- invenio_oaiserver/minters.py | 7 +------ invenio_oaiserver/models.py | 4 +--- invenio_oaiserver/percolator.py | 6 +----- invenio_oaiserver/provider.py | 4 +--- invenio_oaiserver/proxies.py | 4 +--- invenio_oaiserver/query.py | 5 ++--- invenio_oaiserver/receivers.py | 4 +--- invenio_oaiserver/utils.py | 12 ++---------- invenio_oaiserver/verbs.py | 4 +--- invenio_oaiserver/version.py | 4 +--- invenio_oaiserver/views/server.py | 2 -- setup.py | 8 ++++---- tests/conftest.py | 2 -- tests/helpers.py | 3 --- tests/test_admin.py | 2 -- tests/test_app.py | 2 -- tests/test_invenio_oaiserver.py | 2 -- tests/test_verbs.py | 2 -- 22 files changed, 20 insertions(+), 74 deletions(-) diff --git a/invenio_oaiserver/__init__.py b/invenio_oaiserver/__init__.py index fc4e99e..897b907 100644 --- a/invenio_oaiserver/__init__.py +++ b/invenio_oaiserver/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2015-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -182,8 +182,6 @@ (GPLv3 licensed). """ -from __future__ import absolute_import, print_function - from .ext import InvenioOAIServer from .proxies import current_oaiserver from .version import __version__ diff --git a/invenio_oaiserver/errors.py b/invenio_oaiserver/errors.py index 9de87f2..f13cb8d 100644 --- a/invenio_oaiserver/errors.py +++ b/invenio_oaiserver/errors.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,8 +9,6 @@ """Error.""" -from __future__ import absolute_import, print_function - class OAIBadMetadataFormatError(Exception): """Metadata format required doesn't exist.""" diff --git a/invenio_oaiserver/ext.py b/invenio_oaiserver/ext.py index 3a49f30..b5d7603 100644 --- a/invenio_oaiserver/ext.py +++ b/invenio_oaiserver/ext.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2015-2022 CERN. # Copyright (C) 2021-2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,10 +9,7 @@ """Invenio-OAIServer extension implementation.""" -from __future__ import absolute_import, print_function - from invenio_base.utils import obj_or_import_string -from invenio_records import signals as records_signals from sqlalchemy.event import contains, listen, remove from . import config diff --git a/invenio_oaiserver/fetchers.py b/invenio_oaiserver/fetchers.py index f6e598f..361e62c 100644 --- a/invenio_oaiserver/fetchers.py +++ b/invenio_oaiserver/fetchers.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # Copyright (C) 2021 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,8 +9,6 @@ """Persistent identifier fetchers.""" -from __future__ import absolute_import, print_function - from elasticsearch_dsl.query import Q from invenio_pidstore.errors import PersistentIdentifierError from invenio_pidstore.fetchers import FetchedPID diff --git a/invenio_oaiserver/minters.py b/invenio_oaiserver/minters.py index a01dc92..a005541 100644 --- a/invenio_oaiserver/minters.py +++ b/invenio_oaiserver/minters.py @@ -1,22 +1,17 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. """Persistent identifier minters.""" -from __future__ import absolute_import, print_function - -from datetime import datetime - from flask import current_app from invenio_pidstore import current_pidstore from .provider import OAIIDProvider -from .utils import datetime_to_datestamp def oaiid_minter(record_uuid, data): diff --git a/invenio_oaiserver/models.py b/invenio_oaiserver/models.py index 6c25e3b..7361d9b 100644 --- a/invenio_oaiserver/models.py +++ b/invenio_oaiserver/models.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2015-2022 CERN. # Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -11,12 +11,10 @@ from flask_babelex import lazy_gettext as _ from invenio_db import db -from sqlalchemy.event import listen from sqlalchemy.orm import validates from sqlalchemy_utils import Timestamp from .errors import OAISetSpecUpdateError -from .proxies import current_oaiserver class OAISet(db.Model, Timestamp): diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 8900b71..a46408c 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2017-2018 CERN. +# Copyright (C) 2017-2022 CERN. # Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,8 +9,6 @@ """Percolator.""" -from __future__ import absolute_import, print_function - import json from elasticsearch import VERSION as ES_VERSION @@ -23,8 +21,6 @@ from invenio_oaiserver.query import query_string_parser -from .proxies import current_oaiserver - def _build_percolator_index_name(index): """Build percolator index name.""" diff --git a/invenio_oaiserver/provider.py b/invenio_oaiserver/provider.py index b5f0223..b18373a 100644 --- a/invenio_oaiserver/provider.py +++ b/invenio_oaiserver/provider.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2015-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. """OAI-PMH ID provider.""" -from __future__ import absolute_import, print_function - from invenio_pidstore.models import PIDStatus from invenio_pidstore.providers.base import BaseProvider diff --git a/invenio_oaiserver/proxies.py b/invenio_oaiserver/proxies.py index 134ee65..eb4ce66 100644 --- a/invenio_oaiserver/proxies.py +++ b/invenio_oaiserver/proxies.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. """Helper proxy to the state object.""" -from __future__ import absolute_import, print_function - from flask import current_app from werkzeug.local import LocalProxy diff --git a/invenio_oaiserver/query.py b/invenio_oaiserver/query.py index 45766d4..dca8cdc 100644 --- a/invenio_oaiserver/query.py +++ b/invenio_oaiserver/query.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,7 +9,6 @@ """Query parser.""" -import six from elasticsearch import VERSION as ES_VERSION from elasticsearch_dsl import Q from flask import current_app @@ -25,7 +24,7 @@ def query_string_parser(search_pattern): """Elasticsearch query string parser.""" if not hasattr(current_oaiserver, 'query_parser'): query_parser = current_app.config['OAISERVER_QUERY_PARSER'] - if isinstance(query_parser, six.string_types): + if isinstance(query_parser, str): query_parser = import_string(query_parser) current_oaiserver.query_parser = query_parser query_parser_fields = ( diff --git a/invenio_oaiserver/receivers.py b/invenio_oaiserver/receivers.py index 7de3cca..8c74cb1 100644 --- a/invenio_oaiserver/receivers.py +++ b/invenio_oaiserver/receivers.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2016-2022 CERN. # Copyright (C) 2022 Graz University of Technology. # # Invenio is free software; you can redistribute it and/or modify it @@ -9,8 +9,6 @@ """Record field function.""" -from __future__ import absolute_import, print_function - from .percolator import _delete_percolator, _new_percolator diff --git a/invenio_oaiserver/utils.py b/invenio_oaiserver/utils.py index 495ec5c..153c625 100644 --- a/invenio_oaiserver/utils.py +++ b/invenio_oaiserver/utils.py @@ -1,18 +1,16 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2015-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. """Utilities.""" -from __future__ import absolute_import, print_function - import re from datetime import datetime -from functools import partial +from functools import lru_cache, partial from flask import current_app from invenio_base.utils import obj_or_import_string @@ -22,12 +20,6 @@ from .proxies import current_oaiserver -try: - from functools import lru_cache -except ImportError: # pragma: no cover - from functools32 import lru_cache - - ns = { None: 'http://datacite.org/schema/kernel-4', 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', diff --git a/invenio_oaiserver/verbs.py b/invenio_oaiserver/verbs.py index 4eed982..f6d361a 100644 --- a/invenio_oaiserver/verbs.py +++ b/invenio_oaiserver/verbs.py @@ -10,11 +10,9 @@ """OAI-PMH verbs.""" -from __future__ import absolute_import - from flask import current_app, request from invenio_rest.serializer import BaseSchema -from marshmallow import ValidationError, fields, utils, validates_schema +from marshmallow import ValidationError, fields, validates_schema from marshmallow.fields import DateTime as _DateTime from marshmallow.utils import isoformat diff --git a/invenio_oaiserver/version.py b/invenio_oaiserver/version.py index a75ec72..3ab7e6d 100644 --- a/invenio_oaiserver/version.py +++ b/invenio_oaiserver/version.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2020 CERN. +# Copyright (C) 2015-2022 CERN. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -12,6 +12,4 @@ and parsed by ``setup.py``. """ -from __future__ import absolute_import, print_function - __version__ = '1.3.0' diff --git a/invenio_oaiserver/views/server.py b/invenio_oaiserver/views/server.py index 0ce0ddb..5cfff87 100644 --- a/invenio_oaiserver/views/server.py +++ b/invenio_oaiserver/views/server.py @@ -9,8 +9,6 @@ """OAI-PMH 2.0 server.""" -from __future__ import absolute_import - from flask import Blueprint, make_response from invenio_pidstore.errors import PIDDoesNotExistError from itsdangerous import BadSignature diff --git a/setup.py b/setup.py index 17024b4..1b7e1a2 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2019 CERN. +# Copyright (C) 2015-2022 CERN. # Copyright (C) 2021 Graz University of Technology. # Copyright (C) 2021 TU Wien. # @@ -148,10 +148,10 @@ 'Programming Language :: Python', 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 'Topic :: Software Development :: Libraries :: Python Modules', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Development Status :: 5 - Production/Stable', ], ) diff --git a/tests/conftest.py b/tests/conftest.py index bd9b276..aa6d0e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,8 +8,6 @@ """Pytest configuration.""" -from __future__ import absolute_import, print_function - import os import shutil import tempfile diff --git a/tests/helpers.py b/tests/helpers.py index 068bf96..ac6d4db 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -9,8 +9,6 @@ """Utilities for loading test records.""" -from __future__ import absolute_import, print_function - import uuid import mock @@ -21,7 +19,6 @@ from invenio_indexer.api import RecordIndexer from invenio_pidstore.minters import recid_minter from invenio_pidstore.models import PersistentIdentifier -from invenio_records import Record from invenio_records.models import RecordMetadata from invenio_search import current_search, current_search_client diff --git a/tests/test_admin.py b/tests/test_admin.py index 58b5a9e..37e4774 100644 --- a/tests/test_admin.py +++ b/tests/test_admin.py @@ -8,8 +8,6 @@ """Test admin interface.""" -from __future__ import absolute_import, print_function - from flask import url_for from flask_admin import Admin, menu from invenio_db import db diff --git a/tests/test_app.py b/tests/test_app.py index 761cbaf..7c0392e 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -8,8 +8,6 @@ """Test app.""" -from __future__ import absolute_import - import pytest from flask import Flask diff --git a/tests/test_invenio_oaiserver.py b/tests/test_invenio_oaiserver.py index 618594b..02cfe36 100644 --- a/tests/test_invenio_oaiserver.py +++ b/tests/test_invenio_oaiserver.py @@ -8,8 +8,6 @@ """Module tests.""" -from __future__ import absolute_import, print_function - import socket import pytest diff --git a/tests/test_verbs.py b/tests/test_verbs.py index f11862e..46350e3 100644 --- a/tests/test_verbs.py +++ b/tests/test_verbs.py @@ -10,8 +10,6 @@ """Test OAI verbs.""" -from __future__ import absolute_import - import uuid from copy import deepcopy from datetime import datetime, timedelta From f1d0d9fe302f699f96fe14bfe0495abb62c2867b Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Tue, 1 Feb 2022 12:12:56 +0100 Subject: [PATCH 16/19] tests: implement percolator tests --- invenio_oaiserver/percolator.py | 1 - tests/conftest.py | 9 - tests/test_percolator.py | 323 +++++++------------------------- 3 files changed, 71 insertions(+), 262 deletions(-) diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index a46408c..703da07 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -134,7 +134,6 @@ def _delete_percolator(spec, search_pattern): # Create the percolator doc_type in the existing index for >= ES5 for index, mapping_path in current_search.mappings.items(): percolator_doc_type = _get_percolator_doc_type(index) - _create_percolator_mapping(index, percolator_doc_type) current_search_client.delete( index=_build_percolator_index_name(index), doc_type=percolator_doc_type, diff --git a/tests/conftest.py b/tests/conftest.py index aa6d0e4..236ce84 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -128,15 +128,6 @@ def without_oaiset_signals(app): current_oaiserver.register_signals_oaiset() -@pytest.yield_fixture -def without_oaiset_signals(app): - """Temporary disable oaiset signals.""" - from invenio_oaiserver import current_oaiserver - current_oaiserver.unregister_signals_oaiset() - yield - current_oaiserver.register_signals_oaiset() - - @pytest.fixture def schema(): """Get record schema.""" diff --git a/tests/test_percolator.py b/tests/test_percolator.py index 5ef23ab..9c31901 100644 --- a/tests/test_percolator.py +++ b/tests/test_percolator.py @@ -12,258 +12,77 @@ import pytest from helpers import create_record, run_after_insert_oai_set from invenio_db import db -from invenio_records.api import Record -from invenio_records.models import RecordMetadata from invenio_search import current_search -from mock import patch -from invenio_oaiserver import current_oaiserver -from invenio_oaiserver.errors import OAISetSpecUpdateError from invenio_oaiserver.models import OAISet -from invenio_oaiserver.receivers import after_delete_oai_set, \ - after_insert_oai_set, after_update_oai_set - -# TODO: Remove test cases or adapt to new implementation - -# def test_search_pattern_change(app, without_oaiset_signals, schema): -# """Test search pattern change.""" -# record0 = create_record(app, { -# '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, -# '$schema': schema -# }) -# rec_uuid = record0.id -# oaiset = OAISet(spec="a", search_pattern="title_statement.title:Test0") -# db.session.add(oaiset) -# db.session.commit() -# run_after_insert_oai_set() -# current_search.flush_and_refresh('records') -# record = Record.get_record(rec_uuid) -# assert record['_oai']['sets'] == ['a'] - -# # change search pattern: record0 will not inside it anymore -# oaiset = OAISet.query.first() -# oaiset.search_pattern = 'title_statement.title:Test1' -# db.session.merge(oaiset) -# db.session.commit() -# after_update_oai_set(None, None, oaiset) -# current_search.flush_and_refresh('records') -# record = Record.get_record(rec_uuid) -# record.commit() -# assert record['_oai']['sets'] == [] - - -# def test_populate_oaisets(app, without_oaiset_signals, schema): -# """Populate OAISets.""" -# def create_oaiset(**kwargs): -# oaiset = OAISet(**kwargs) -# db.session.add(oaiset) -# db.session.commit() -# return oaiset - -# a = create_oaiset(spec='a') -# create_oaiset(spec='b') -# create_oaiset( -# spec="e", search_pattern="title_statement.title:Test2 OR " -# "title_statement.title:Test3") -# create_oaiset(spec="c", search_pattern="title_statement.title:Test0") -# create_oaiset(spec="d", search_pattern="title_statement.title:Test1") -# f = create_oaiset(spec="f", search_pattern="title_statement.title:Test2") -# create_oaiset(spec="g") -# create_oaiset(spec="h") -# i = create_oaiset(spec="i", search_pattern="title_statement.title:Test3") -# j = create_oaiset(spec="j with space", -# search_pattern="title_statement.title:Test4") -# # Note below: brackets around AND search query are required -# create_oaiset(spec="math", -# search_pattern="(title_statement.title:foo AND genre:math)") -# create_oaiset(spec="nonmath", -# search_pattern="(title_statement.title:foo AND -genre:math)") - -# run_after_insert_oai_set() - -# a_id = OAISet.query.filter_by(spec=a.spec).one().id -# i_id = OAISet.query.filter_by(spec=i.spec).one().id - -# # start tests - -# record0 = create_record(app, { -# '_oai': {'sets': ['a']}, 'title_statement': {'title': 'Test0'}, -# '$schema': schema -# }) - -# assert 'a' in record0['_oai']['sets'], 'Keep manually managed set "a".' -# assert 'c' in record0['_oai']['sets'] -# assert len(record0['_oai']['sets']) == 2 - -# record_not_found = create_record( -# app, {'title': 'TestNotFound', '$schema': schema} -# ) - -# # Don't create empty sets list just because of commit -# assert 'sets' not in record_not_found['_oai'] - -# record1 = create_record(app, {'title_statement': {'title': 'Test1'}, -# '$schema': schema}) - -# assert 'd' in record1['_oai']['sets'] -# assert len(record1['_oai']['sets']) == 1 - -# record2 = create_record(app, {'title_statement': {'title': 'Test2'}, -# '$schema': schema}) -# record2_id = record2.id - -# assert 'e' in record2['_oai']['sets'] -# assert 'f' in record2['_oai']['sets'] -# assert len(record2['_oai']['sets']) == 2 - -# record3 = create_record(app, {'title_statement': {'title': 'Test3'}, -# '$schema': schema}) -# record3_id = record3.id - -# assert 'e' in record3['_oai']['sets'] -# assert 'i' in record3['_oai']['sets'] -# assert len(record3['_oai']['sets']) == 2 - -# record4 = create_record(app, {'title_statement': {'title': 'Test4'}, -# '$schema': schema}) -# record4_id = record4.id - -# assert 'j with space' in record4['_oai']['sets'] -# assert len(record4['_oai']['sets']) == 1 - -# # If record does not have '_oai', don't add any sets, -# # nor even the default '_oai' key -# record5 = create_record(app, {'title_statement': {'title': 'Test1'}, -# '$schema': schema}, -# mint_oaiid=False) -# assert '_oai' not in record5 - -# # Test 'AND' keyword for records -# record6 = create_record(app, { -# 'title_statement': {'title': 'foo'}, -# 'genre': 'math', '$schema': schema -# }) -# assert record6['_oai']['sets'] == ['math', ] - -# record7 = create_record(app, { -# 'title_statement': {'title': 'foo'}, -# 'genre': 'physics', '$schema': schema -# }) -# assert record7['_oai']['sets'] == ['nonmath', ] - -# record8 = create_record(app, { -# 'title_statement': {'title': 'bar'}, -# 'genre': 'math', '$schema': schema -# }) -# assert 'sets' not in record8['_oai'] # title is not 'foo' - -# current_search.flush_and_refresh('records') - -# # test delete -# current_oaiserver.unregister_signals_oaiset() -# with patch('invenio_oaiserver.receivers.after_delete_oai_set') as f: -# current_oaiserver.register_signals_oaiset() - -# with db.session.begin_nested(): -# db.session.delete(j) -# db.session.commit() -# assert f.called -# after_delete_oai_set(None, None, j) -# record4_model = RecordMetadata.query.filter_by( -# id=record4_id).first().json - -# assert 'j with space' not in record4_model['_oai'].get('sets', []) -# assert len(record4_model['_oai'].get('sets', [])) == 0 - -# current_oaiserver.unregister_signals_oaiset() - -# # test update search_pattern -# with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: -# current_oaiserver.register_signals_oaiset() -# with db.session.begin_nested(): -# i.search_pattern = None -# assert current_oaiserver.sets is None, 'Cache should be empty.' -# db.session.merge(i) -# db.session.commit() -# assert f.called -# i = OAISet.query.get(i_id) -# after_update_oai_set(None, None, i) -# record3_model = RecordMetadata.query.filter_by( -# id=record3_id).first().json - -# assert 'i' not in record3_model['_oai']['sets'], \ -# 'Set "i" is manually managed.' -# assert 'e' in record3_model['_oai']['sets'] -# assert len(record3_model['_oai']['sets']) == 1 - -# current_oaiserver.unregister_signals_oaiset() - -# # test update search_pattern -# with patch('invenio_oaiserver.receivers.after_update_oai_set') as f: -# current_oaiserver.register_signals_oaiset() - -# with db.session.begin_nested(): -# i.search_pattern = 'title_statement.title:Test3' -# db.session.merge(i) -# db.session.commit() -# assert f.called -# i = OAISet.query.get(i_id) -# after_update_oai_set(None, None, i) -# record3_model = RecordMetadata.query.filter_by( -# id=record3_id).first().json - -# assert 'e' in record3_model['_oai']['sets'] -# assert 'i' in record3_model['_oai']['sets'] -# assert len(record3_model['_oai']['sets']) == 2 - -# current_oaiserver.unregister_signals_oaiset() - -# # test update the spec -# with pytest.raises(OAISetSpecUpdateError) as exc_info: -# a = OAISet.query.get(a_id) -# a.spec = 'new-a' -# assert exc_info.type is OAISetSpecUpdateError - -# # test create new set -# with patch('invenio_oaiserver.receivers.after_insert_oai_set') as f: -# current_oaiserver.register_signals_oaiset() - -# with db.session.begin_nested(): -# k = OAISet(spec="k", search_pattern="title_statement.title:Test2") -# db.session.add(k) -# db.session.commit() -# assert f.called -# after_insert_oai_set(None, None, k) -# record2_model = RecordMetadata.query.filter_by( -# id=record2_id).first().json - -# assert 'e' in record2_model['_oai']['sets'] -# assert 'f' in record2_model['_oai']['sets'] -# assert 'k' in record2_model['_oai']['sets'] -# assert len(record2_model['_oai']['sets']) == 3 - -# current_oaiserver.register_signals_oaiset() - - -# def test_oaiset_add_remove_record(app): -# """Test the API method for manual record adding.""" -# with app.app_context(): -# oaiset1 = OAISet(spec='abc') -# rec1 = Record.create({'title_statement': {'title': 'Test1'}}) -# rec1.commit() -# # Adding a record to an OAIset should change the record's updated date -# dt1 = rec1.updated -# assert not oaiset1.has_record(rec1) -# oaiset1.add_record(rec1) -# assert 'abc' in rec1['_oai']['sets'] -# assert oaiset1.has_record(rec1) -# rec1.commit() -# dt2 = rec1.updated -# assert dt2 > dt1 - -# oaiset1.remove_record(rec1) -# rec1.commit() -# dt3 = rec1.updated -# assert 'abc' not in rec1['_oai']['sets'] -# assert not oaiset1.has_record(rec1) -# assert dt3 > dt2 +from invenio_oaiserver.query import OAINoRecordsMatchError, get_records +from invenio_oaiserver.receivers import after_update_oai_set + + +@pytest.fixture() +def test0(app, without_oaiset_signals, schema): + _ = create_record(app, { + 'title_statement': {'title': 'Test0'}, + '$schema': schema + }) + current_search.flush_and_refresh("records") + + +def create_oaiset(name, title_pattern): + oaiset = OAISet( + spec=name, + search_pattern=f"title_statement.title:{title_pattern}" + ) + db.session.add(oaiset) + db.session.commit() + run_after_insert_oai_set() + + return oaiset + + +def test_set_with_no_records(without_oaiset_signals, schema): + _ = create_oaiset("test", "Test0") + with pytest.raises(OAINoRecordsMatchError): + get_records(set="test") + + +def test_empty_set(without_oaiset_signals, test0): + _ = create_oaiset("test", "Test1") + with pytest.raises(OAINoRecordsMatchError): + get_records(set="test") + + +def test_set_with_records(app, without_oaiset_signals, test0, schema): + # create extra record + _ = create_record(app, { + 'title_statement': {'title': 'Test1'}, + '$schema': schema + }) + current_search.flush_and_refresh("records") + + # create and query set + _ = create_oaiset("test", "Test0") + rec_in_set = get_records(set="test") + assert rec_in_set.total == 1 + rec = next(rec_in_set.items) + assert rec["json"]["_source"]["title_statement"]["title"] == "Test0" + + +def test_search_pattern_change(without_oaiset_signals, test0): + """Test search pattern change.""" + # create set + oaiset = create_oaiset("test", "Test0") + # check record is in set + rec_in_set = get_records(set="test") + assert rec_in_set.total == 1 + rec = next(rec_in_set.items) + assert rec["json"]["_source"]["title_statement"]["title"] == "Test0" + + # change search pattern + oaiset.search_pattern = 'title_statement.title:Test1' + db.session.merge(oaiset) + db.session.commit() + after_update_oai_set(None, None, oaiset) + # check records is not in set + with pytest.raises(OAINoRecordsMatchError): + get_records(set="test") From bd51efdbe19f38c3521c4091bb4b4bd3a7eacd03 Mon Sep 17 00:00:00 2001 From: Pablo Panero Date: Tue, 1 Feb 2022 13:56:31 +0100 Subject: [PATCH 17/19] models: remove unused code --- invenio_oaiserver/models.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/invenio_oaiserver/models.py b/invenio_oaiserver/models.py index 7361d9b..d212d8d 100644 --- a/invenio_oaiserver/models.py +++ b/invenio_oaiserver/models.py @@ -72,42 +72,5 @@ def validate_spec(self, key, value): raise OAISetSpecUpdateError("Updating spec is not allowed.") return value - # TODO: Add and remove can be implemented but it will require to - # update the `search_pattern` - - # def add_record(self, record): - # """Add a record to the OAISet. - - # :param record: Record to be added. - # :type record: `invenio_records.api.Record` or derivative. - # """ - # record.setdefault('_oai', {}).setdefault('sets', []) - - # assert not self.has_record(record) - - # record['_oai']['sets'].append(self.spec) - - # def remove_record(self, record): - # """Remove a record from the OAISet. - - # :param record: Record to be removed. - # :type record: `invenio_records.api.Record` or derivative. - # """ - # assert self.has_record(record) - - # record['_oai']['sets'] = [ - # s for s in record['_oai']['sets'] if s != self.spec] - - # TODO: has_record can be implemented but it will require to - # to do a full search. - - # def has_record(self, record): - # """Check if the record blongs to the OAISet. - - # :param record: Record to be checked. - # :type record: `invenio_records.api.Record` or derivative. - # """ - # return self.spec in record.get('_oai', {}).get('sets', []) - __all__ = ('OAISet', ) From 897b3ab5b90ebb1e9cb2945a58af759ce3d41003 Mon Sep 17 00:00:00 2001 From: Alexander Ioannidis Date: Tue, 22 Feb 2022 14:17:01 +0100 Subject: [PATCH 18/19] percolators: create less percolator indices * Adds a check to prevent creating percolator mappings for indices not used by OAI-PMH. * Removes the unusded `_percolate_query` function. --- invenio_oaiserver/percolator.py | 44 ++++++--------------------------- 1 file changed, 8 insertions(+), 36 deletions(-) diff --git a/invenio_oaiserver/percolator.py b/invenio_oaiserver/percolator.py index 703da07..9d3b1f5 100644 --- a/invenio_oaiserver/percolator.py +++ b/invenio_oaiserver/percolator.py @@ -57,40 +57,6 @@ def _create_percolator_mapping(index, doc_type, mapping_path=None): ) -def _percolate_query(index, doc_type, percolator_doc_type, document): - """Get results for a percolate query.""" - index = _build_percolator_index_name(index) - if ES_VERSION[0] in (2, 5): - results = current_search_client.percolate( - index=index, - doc_type=doc_type, - allow_no_indices=True, - ignore_unavailable=True, - body={'doc': document}, - ) - return results['matches'] - elif ES_VERSION[0] in (6, 7): - es_client_params = dict( - index=index, - doc_type=percolator_doc_type, - allow_no_indices=True, - ignore_unavailable=True, - body={ - 'query': { - 'percolate': { - 'field': 'query', - 'document_type': percolator_doc_type, - 'document': document, - } - } - }, - ) - if ES_VERSION[0] == 7: - es_client_params.pop('doc_type') - results = current_search_client.search(**es_client_params) - return results['hits']['hits'] - - def _get_percolator_doc_type(index): es_ver = ES_VERSION[0] if es_ver == 2: @@ -110,7 +76,11 @@ def _new_percolator(spec, search_pattern): """Create new percolator associated with the new set.""" if spec and search_pattern: query = query_string_parser(search_pattern=search_pattern).to_dict() + oai_records_index = current_app.config["OAISERVER_RECORD_INDEX"] for index, mapping_path in current_search.mappings.items(): + # Skip indices/mappings not used by OAI-PMH + if not index.startswith(oai_records_index): + continue # Create the percolator doc_type in the existing index for >= ES5 # TODO: Consider doing this only once in app initialization try: @@ -124,15 +94,17 @@ def _new_percolator(spec, search_pattern): body={'query': query} ) except Exception as e: - # caught on schemas, which do not contain the query field current_app.logger.warning(e) - pass def _delete_percolator(spec, search_pattern): """Delete percolator associated with the removed/updated oaiset.""" + oai_records_index = current_app.config["OAISERVER_RECORD_INDEX"] # Create the percolator doc_type in the existing index for >= ES5 for index, mapping_path in current_search.mappings.items(): + # Skip indices/mappings not used by OAI-PMH + if not index.startswith(oai_records_index): + continue percolator_doc_type = _get_percolator_doc_type(index) current_search_client.delete( index=_build_percolator_index_name(index), From 0c01b1d16f9b5635729a76917d80c8aa386a97ab Mon Sep 17 00:00:00 2001 From: Alexander Ioannidis Date: Tue, 22 Feb 2022 14:36:09 +0100 Subject: [PATCH 19/19] release: v1.4.0 --- CHANGES.rst | 8 +++++++- invenio_oaiserver/version.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8b3b798..c5c220e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,6 @@ .. This file is part of Invenio. - Copyright (C) 2016-2020 CERN. + Copyright (C) 2016-2022 CERN. Invenio is free software; you can redistribute it and/or modify it under the terms of the MIT License; see LICENSE file for more details. @@ -8,6 +8,12 @@ Changes ======= +Version 1.4.0 (released 2022-02-22) + +- OAI-PMH sets reimplementation via percolator queries during result fetching. +- Removes Python 2.7 support. +- Resumption token argument fixes. + Version 1.3.0 (released 2021-10-20) - Unpin Flask version. diff --git a/invenio_oaiserver/version.py b/invenio_oaiserver/version.py index 3ab7e6d..ef76949 100644 --- a/invenio_oaiserver/version.py +++ b/invenio_oaiserver/version.py @@ -12,4 +12,4 @@ and parsed by ``setup.py``. """ -__version__ = '1.3.0' +__version__ = '1.4.0'