From de414af48d517dccbdac37f71c50bad32506512f Mon Sep 17 00:00:00 2001 From: Guillaume Viger Date: Wed, 7 Dec 2022 09:01:39 -0500 Subject: [PATCH] providers: add validate_record to validate record according to provider [+] - in turn this allows validation of publisher by DataciteProvider - closes #1137 --- .../services/components/pids.py | 13 ++-- invenio_rdm_records/services/pids/manager.py | 62 +++++++++++++++---- .../services/pids/providers/base.py | 13 ++++ .../services/pids/providers/datacite.py | 22 +++++++ invenio_rdm_records/services/pids/service.py | 2 +- tests/conftest.py | 6 +- .../serializers/test_dublincore_serializer.py | 1 + .../components/test_pids_component.py | 4 ++ .../providers/test_datacite_pid_provider.py | 47 +++++++++++--- 9 files changed, 141 insertions(+), 29 deletions(-) diff --git a/invenio_rdm_records/services/components/pids.py b/invenio_rdm_records/services/components/pids.py index 88705ac49..f37f172f3 100644 --- a/invenio_rdm_records/services/components/pids.py +++ b/invenio_rdm_records/services/components/pids.py @@ -65,15 +65,13 @@ def publish(self, identity, draft=None, record=None): record_pids = copy(record.get("pids", {})) draft_schemes = set(draft_pids.keys()) record_schemes = set(record_pids.keys()) - - # Determine schemes which are required, but not yet created. - missing_required_schemes = ( - set(self.service.config.pids_required) - record_schemes - draft_schemes - ) + required_schemes = set(self.service.config.pids_required) # Validate the draft PIDs self.service.pids.pid_manager.validate(draft_pids, record, raise_errors=True) + self.service.pids.pid_manager.validate_record(draft, raise_errors=True) + # Detect which PIDs on a published record that has been changed. # # Example: An external DOI (i.e. DOI not managed by us) can be changed @@ -88,8 +86,9 @@ def publish(self, identity, draft=None, record=None): self.service.pids.pid_manager.discard_all(changed_pids) - # Create all PIDs specified on draft or which PIDs schemes which are - # require + # Determine schemes which are required, but not yet created. + missing_required_schemes = required_schemes - record_schemes - draft_schemes + # Create all PIDs specified on draft and all missing required PIDs pids = self.service.pids.pid_manager.create_all( draft, pids=draft_pids, diff --git a/invenio_rdm_records/services/pids/manager.py b/invenio_rdm_records/services/pids/manager.py index 2b4bd75ad..2754b2cac 100644 --- a/invenio_rdm_records/services/pids/manager.py +++ b/invenio_rdm_records/services/pids/manager.py @@ -19,9 +19,10 @@ class PIDManager: """RDM PIDs Manager.""" - def __init__(self, providers): + def __init__(self, providers, required_schemes=None): """Constructor for RecordService.""" self._providers = providers + self._required_schemes = required_schemes def _get_provider(self, scheme, provider_name=None): """Get a provider.""" @@ -47,17 +48,6 @@ def _validate_pids_schemes(self, pids): if unknown_schemes: raise PIDSchemeNotSupportedError(unknown_schemes) - def _validate_pids(self, pids, record, errors): - """Validate an iterator of PIDs. - - This function assumes all pid schemes are supported by the system. - """ - for scheme, pid in pids.items(): - provider = self._get_provider(scheme, pid.get("provider")) - success, val_errors = provider.validate(record=record, **pid) - if not success: - errors.append({"field": f"pids.{scheme}", "messages": val_errors}) - def _validate_identifiers(self, pids, errors): """Validate and normalize identifiers.""" # TODO: Refactor to get it injected instead. @@ -87,6 +77,17 @@ def _validate_identifiers(self, pids, errors): for scheme, id_ in identifiers: pids[scheme]["identifier"] = id_ + def _validate_pids(self, pids, record, errors): + """Validate an iterator of PIDs. + + This function assumes all pid schemes are supported by the system. + """ + for scheme, pid in pids.items(): + provider = self._get_provider(scheme, pid.get("provider")) + success, val_errors = provider.validate(record=record, **pid) + if not success: + errors.append({"field": f"pids.{scheme}", "messages": val_errors}) + def validate(self, pids, record, errors=None, raise_errors=False): """Validate PIDs.""" errors = [] if errors is None else errors @@ -97,6 +98,43 @@ def validate(self, pids, record, errors=None, raise_errors=False): if raise_errors and errors: raise ValidationError(message=errors) + def validate_record(self, record, raise_errors=False): + """Validate the record according to the PIDs' requirements. + + Here we check if the record is compatible from the point of + view of the pids... + - ... it contains + - ... it would contain according to configured required pids + + The responsibility lies with each provider since they are the ones + that know their criteria for a record that is complete enough to get + a PID. + """ + errors = {} + + # scheme, provider_name for record's pids + scheme_names = [ + (scheme, pid.get("provider")) + for scheme, pid in record.get("pids", {}).items() + ] + # scheme, None for required pids + scheme_names += [(scheme, None) for scheme in self._required_schemes] + providers = [ + self._get_provider(scheme, provider_name) + for scheme, provider_name in scheme_names + ] + + for provider in providers: + success, provider_errors = provider.validate_record(record) + if not success: + # This is not perfect as one provider may override the error of another + # but a proper dict merging algorithm is out-of-bounds here and as long + # as an error is raised we are good. + errors.update(provider_errors) + + if raise_errors and errors: + raise ValidationError(message=errors) + def read(self, scheme, identifier, provider_name): """Read a pid.""" provider = self._get_provider(scheme, provider_name) diff --git a/invenio_rdm_records/services/pids/providers/base.py b/invenio_rdm_records/services/pids/providers/base.py index 140d8a81d..0fecbecd5 100644 --- a/invenio_rdm_records/services/pids/providers/base.py +++ b/invenio_rdm_records/services/pids/providers/base.py @@ -164,3 +164,16 @@ def validate(self, record, identifier=None, provider=None, **kwargs): pass return True, [] + + def validate_record(self, record): + """Validate the record according to the provider's rules. + + By default it always validates. Descendants should override this method with + their own validation logic if need be. + + :param record: A record-like (draft or published record) + :returns: A tuple (success, errors). `success` is a bool that specifies + if the validation was successful. `errors` is an + error dict of the form: `{"": ["", ...], ...}`. + """ + return True, {} diff --git a/invenio_rdm_records/services/pids/providers/datacite.py b/invenio_rdm_records/services/pids/providers/datacite.py index 285b89a16..f2e1062e6 100644 --- a/invenio_rdm_records/services/pids/providers/datacite.py +++ b/invenio_rdm_records/services/pids/providers/datacite.py @@ -209,3 +209,25 @@ def validate(self, record, identifier=None, provider=None, **kwargs): errors.append(str(e)) return (True, []) if not errors else (False, errors) + + def validate_record(self, record): + """Validate the record according to DataCite rules. + + We only add check for values not already covered by the record schema. + + :returns: A tuple (success, errors). `success` is a bool that specifies + if the validation was successful. `errors` is an + error dict of the form: `{"": ["", ...], ...}`. + """ + errors = {} + + if not record["metadata"].get("publisher"): + errors.update( + { + "metadata.publisher": [ + _("Missing publisher field required for DOI registration.") + ], + } + ) + + return not bool(errors), errors diff --git a/invenio_rdm_records/services/pids/service.py b/invenio_rdm_records/services/pids/service.py index a697a0d4b..079677d50 100644 --- a/invenio_rdm_records/services/pids/service.py +++ b/invenio_rdm_records/services/pids/service.py @@ -44,7 +44,7 @@ def _manager(self): - limit side-effects (in case using pre-existing `pid_manager` would cause some.) """ - return self.manager_cls(self.config.pids_providers) + return self.manager_cls(self.config.pids_providers, self.config.pids_required) @property def pid_manager(self): diff --git a/tests/conftest.py b/tests/conftest.py index 32a6b3fb8..e93ee84cb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -533,8 +533,6 @@ def minimal_record(): "enabled": False, # Most tests don't care about files }, "metadata": { - "publication_date": "2020-06-01", - "resource_type": {"id": "image-photo"}, "creators": [ { "person_or_org": { @@ -550,6 +548,10 @@ def minimal_record(): }, }, ], + "publication_date": "2020-06-01", + # because DATACITE_ENABLED is True, this field is required + "publisher": "Acme Inc", + "resource_type": {"id": "image-photo"}, "title": "A Romans story", }, } diff --git a/tests/resources/serializers/test_dublincore_serializer.py b/tests/resources/serializers/test_dublincore_serializer.py index 884ba200c..f7fb3f9ac 100644 --- a/tests/resources/serializers/test_dublincore_serializer.py +++ b/tests/resources/serializers/test_dublincore_serializer.py @@ -74,6 +74,7 @@ def test_dublincorejson_serializer_minimal(running_app, updated_minimal_record): "creators": ["Name", "Troy Inc."], "dates": ["2020-06-01"], "rights": ["info:eu-repo/semantics/openAccess"], + "publishers": ["Acme Inc"], } serializer = DublinCoreJSONSerializer() diff --git a/tests/services/components/test_pids_component.py b/tests/services/components/test_pids_component.py index aed2da047..d88fd7b8d 100644 --- a/tests/services/components/test_pids_component.py +++ b/tests/services/components/test_pids_component.py @@ -53,6 +53,10 @@ def validate(self, record, identifier=None, provider=None, **kwargs): errors.append("Identifier must be an integer.") return (True, []) if not errors else (False, errors) + def validate_record(self, record): + """Validate the record according to PID rules i.e. always good.""" + return True, [] + # configs diff --git a/tests/services/pids/providers/test_datacite_pid_provider.py b/tests/services/pids/providers/test_datacite_pid_provider.py index 47a48fde5..17d0ab406 100644 --- a/tests/services/pids/providers/test_datacite_pid_provider.py +++ b/tests/services/pids/providers/test_datacite_pid_provider.py @@ -195,24 +195,57 @@ def custom_format_func(*args): assert datacite_provider.create(record).pid_value == expected_result -def test_datacite_provider_validation(record, mocker): - client = DataCiteClient("datacite") - - # check with default func +def test_datacite_provider_validate(record): current_app.config["DATACITE_PREFIX"] = "10.1000" + client = DataCiteClient("datacite") datacite_provider = DataCitePIDProvider("datacite", client=client) + + # Case - Valid identifier (doi) success, errors = datacite_provider.validate( record=record, identifier="10.1000/valid.1234", provider="datacite" ) assert success - assert errors == [] + assert [] == errors + # Case - Invalid identifier (doi) success, errors = datacite_provider.validate( record=record, identifier="10.2000/invalid.1234", provider="datacite" ) - assert not success - assert errors == [ + expected = [ "Wrong DOI 10.2000 prefix provided, " + "it should be 10.1000 as defined in the rest client" ] + assert expected == errors + + +def test_datacite_provider_validate_record(record): + record["metadata"] = {"publisher": "Acme Inc"} + current_app.config["DATACITE_PREFIX"] = "10.1000" + client = DataCiteClient("datacite") + datacite_provider = DataCitePIDProvider("datacite", client=client) + + # Case - valid new record without pids.doi + success, errors = datacite_provider.validate_record(record) + assert {} == errors + assert success + + # Case - valid record with pre-existing pids.doi + record["pids"] = { + "doi": {"provider": "datacite", "identifier": "10.1000/pre-existing.1234"} + } + success, errors = datacite_provider.validate_record(record) + assert {} == errors + assert success + + # Case - invalid record + del record["metadata"]["publisher"] + success, errors = datacite_provider.validate_record(record) + expected = { + "metadata.publisher": [ + "Missing publisher field required for DOI registration." + ], + } + + assert expected == errors + assert not success