Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

providers: add validate_record to validate record according to provider [+] #1140

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions invenio_rdm_records/services/components/pids.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,13 @@ def publish(self, identity, draft=None, record=None):
record_pids = copy(record.get("pids", {}))
draft_schemes = set(draft_pids.keys())
record_schemes = set(record_pids.keys())

# Determine schemes which are required, but not yet created.
missing_required_schemes = (
set(self.service.config.pids_required) - record_schemes - draft_schemes
)
required_schemes = set(self.service.config.pids_required)

# Validate the draft PIDs
self.service.pids.pid_manager.validate(draft_pids, record, raise_errors=True)

self.service.pids.pid_manager.validate_record(draft, raise_errors=True)

# Detect which PIDs on a published record that has been changed.
#
# Example: An external DOI (i.e. DOI not managed by us) can be changed
Expand All @@ -88,8 +86,9 @@ def publish(self, identity, draft=None, record=None):

self.service.pids.pid_manager.discard_all(changed_pids)

# Create all PIDs specified on draft or which PIDs schemes which are
# require
# Determine schemes which are required, but not yet created.
missing_required_schemes = required_schemes - record_schemes - draft_schemes
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just for readability

# Create all PIDs specified on draft and all missing required PIDs
pids = self.service.pids.pid_manager.create_all(
draft,
pids=draft_pids,
Expand Down
62 changes: 50 additions & 12 deletions invenio_rdm_records/services/pids/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
class PIDManager:
"""RDM PIDs Manager."""

def __init__(self, providers):
def __init__(self, providers, required_schemes=None):
"""Constructor for RecordService."""
self._providers = providers
self._required_schemes = required_schemes
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's legitimate for the PIDManager to know which pid schemes are required.


def _get_provider(self, scheme, provider_name=None):
"""Get a provider."""
Expand All @@ -47,17 +48,6 @@ def _validate_pids_schemes(self, pids):
if unknown_schemes:
raise PIDSchemeNotSupportedError(unknown_schemes)

def _validate_pids(self, pids, record, errors):
"""Validate an iterator of PIDs.

This function assumes all pid schemes are supported by the system.
"""
for scheme, pid in pids.items():
provider = self._get_provider(scheme, pid.get("provider"))
success, val_errors = provider.validate(record=record, **pid)
if not success:
errors.append({"field": f"pids.{scheme}", "messages": val_errors})

def _validate_identifiers(self, pids, errors):
"""Validate and normalize identifiers."""
# TODO: Refactor to get it injected instead.
Expand Down Expand Up @@ -87,6 +77,17 @@ def _validate_identifiers(self, pids, errors):
for scheme, id_ in identifiers:
pids[scheme]["identifier"] = id_

def _validate_pids(self, pids, record, errors):
"""Validate an iterator of PIDs.

This function assumes all pid schemes are supported by the system.
"""
for scheme, pid in pids.items():
provider = self._get_provider(scheme, pid.get("provider"))
success, val_errors = provider.validate(record=record, **pid)
if not success:
errors.append({"field": f"pids.{scheme}", "messages": val_errors})

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just reordering for easier reading of the file as a developer.

def validate(self, pids, record, errors=None, raise_errors=False):
"""Validate PIDs."""
errors = [] if errors is None else errors
Expand All @@ -97,6 +98,43 @@ def validate(self, pids, record, errors=None, raise_errors=False):
if raise_errors and errors:
raise ValidationError(message=errors)

def validate_record(self, record, raise_errors=False):
"""Validate the record according to the PIDs' requirements.

Here we check if the record is compatible from the point of
view of the pids...
- ... it contains
- ... it would contain according to configured required pids

The responsibility lies with each provider since they are the ones
that know their criteria for a record that is complete enough to get
a PID.
"""
errors = {}

# scheme, provider_name for record's pids
scheme_names = [
(scheme, pid.get("provider"))
for scheme, pid in record.get("pids", {}).items()
]
# scheme, None for required pids
scheme_names += [(scheme, None) for scheme in self._required_schemes]
providers = [
self._get_provider(scheme, provider_name)
for scheme, provider_name in scheme_names
]

for provider in providers:
success, provider_errors = provider.validate_record(record)
if not success:
# This is not perfect as one provider may override the error of another
# but a proper dict merging algorithm is out-of-bounds here and as long
# as an error is raised we are good.
errors.update(provider_errors)

if raise_errors and errors:
raise ValidationError(message=errors)

Copy link
Contributor Author

@fenekku fenekku Dec 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's the crux of the feature. It's slightly different than the original idea of modifying the validate() method mostly because it clarifies intent this way, removes the need to change the interface of that method (across many classes and tests) and removes the need to fiddle around to have providers know the scheme.

Note that errors are a dict, so that ValidationError end-up being processed by the error handler https://github.com/inveniosoftware/invenio-records-resources/blob/master/invenio_records_resources/resources/errors.py#L47 (really: https://github.com/inveniosoftware/invenio-records-resources/blob/a7595311b5e66e39e2c831786dbb07e6701f741c/invenio_records_resources/errors.py#L55) to return an array of {field: <field>, messages: [<msg1>, <msg2>, ...]} as any regular draft validation error would.

Further note however, that the frontend doesn't associate the error with the field despite this. The frontend will display the top-level error at least:

image

and publication is prevented which are the important aspects.
The field specific error is not shown but this is the case as well for validate() (even in local tests with this change of format - which IMHO is more correct).

The assignment of errors to fields is a wider frontend issue (e.g., inveniosoftware/invenio-app-rdm#1353 or inveniosoftware/react-invenio-deposit#403), so keeping it out-of-bounds of this PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get the point of clear intention but now I think we have split the validation mechanism into 2 independent methods thus losing the self-contained way to validate and we moved the responsibility from the PID manager to the PID component. I think I would include the validation_record as another step in the process of validate and change slightly the interface of the method to

Suggested change
self.service.pids.pid_manager.validate(draft, record, raise_errors=True)

and retrieve the draft_pids from the draft itself. That said, isn't the self._validate_pids(pids, record, errors) enough to be called with draft instead of record? That would simplify the amount of code needed I think...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is going to be long, grab some tea 😉 🍵

isn't the self._validate_pids(pids, record, errors) enough to be called with draft instead of record?

I don't think so. My understanding is that https://github.com/inveniosoftware/invenio-rdm-records/blob/master/invenio_rdm_records/services/pids/providers/base.py#L153 checks if your draft submitted for publication is trying to change the current pid under validation to point to another record than its associated published one. So the associated published record needs to be passed (not just the draft) to compare it against.

And it just so happens that for drafts with no associated published record (completely new drafts), the PID will not exist in the DB in the first place, so PIDDoesNotExistError is raised and caught, and validate()... well... validates. An astute reader will realize that one could pass an existing PID via the API when creating a draft... and that will probably cause a 500. But solving that problem is also out of bounds of this PR.

This is all pretty hard to follow. That's why I think creating a separate method with clear intent is much better here. It allows us to isolate ourselves from this.

And this is not just nice clarity, it also has to do with the interface change. I literally started by putting the change in validate() and changing the interface (specifically the errors one): a lot of things needed to change in turn (breaking change). So I backed away. Mostly those breakages were just about fixing the interfaces elsewhere, but what really made me back away was how providers would have to use their pid_type in the error dict which made me question why RDM_PERSISTENT_IDENTIFIERS and RDM_PERSISTENT_IDENTIFIERS are the way they are:

If a provider already knows their pid scheme (also called pid_type in the code base... not sure why...) why are RDM_PERSISTENT_IDENTIFIERS and RDM_PERSISTENT_IDENTIFIER_PROVIDERS separate and not just one mapping? You can't mix-and-match scheme to providers unless you dynamically change the provider's pid_type (which we don't). And all the other info (e.g., "validator": idutils.is_doi) can just be tied to the provider (because in reality they are tied to the pid_type). Hopefully, it wasn't just a case of someone having DRYinitis 😉 .

That being said, in all seriousness, as usual, I probably don't understand the real intent of the code. That means we probably can't rely on the provider.pid_type which means we can't generate the errors correctly which means we can't place that code in the same method.

So we are back to code intent and back to separate methods 😄

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is hard for me to agree that a second method validate_record, added to the provider interface, adds clarity.
On the contrary, the current validate method is a hook clear enough to allow you to implement the validations that you need. The interface that was created is passing the record as param exactly for this, otherwise draft_pids was probably enough (we added the record for deduplication/integrity check).
I am also not 100% sure why you are passing the draft and not the record: the component's method publish is called after the draft has been "converted" to a record, and what will be sent to the provider is the record. Why not doing the validation on the record then?

If you are seeking for clarity, then you would have to rename validate to something like validate_pid and remove the record param, then add the second method validate_metadata or similar.
validate(..., record, ...) and validate_record(draft, ...) is, for me, more confusing.

The only thing that I have noticed in your changes that might be a sign that we need a separate method is the different return error type, which changes from a list to a dict, one error per field. This makes sense.

To summarize:

  • I would keep one validate method only. The proposed 2 methods are for me confusing.
  • I would change the return error type as you propose. The errors triggered by the HTTP call could be converted to an Exception instead.
  • I agree with the other changes, your solution looks clean and well done :)

Let's try to involve another reviewer to have an extra opinion :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot to unpack here 😄 This is a good place to start:

The interface that was created is passing the record as param exactly for this, otherwise draft_pids was probably enough (we added the record for deduplication/integrity check). I am also not 100% sure why you are passing the draft and not the record: the component's method publish is called after the draft has been "converted" to a record, and what will be sent to the provider is the record. Why not doing the validation on the record then?

Validation on draft was favored to sidestep the introduction of a subtle dependency on MetadataComponent when it wasn't needed. To make sure we are on the same page: MetadataComponent is the one that fills record.metadata with draft.metadata. Since draft on its own is enough for validation purposes, by validating it we don't have to depend on MetadataComponent having run before. This makes the design less fragile, because it loosens the constraints on the order of placement of PIDsComponent in the components array. It can also be argued that this plays well with the rest of the code in that component since the component looks at differences between draft and record (on the pids field though) - but this argument is subjective versus loose coupling which is more concrete.

Back to the question of using a separate method validate_record or placing the change in validate. The impetus for a separate method is based on the open for addition, closed to modification principle: by adding a new separate method, there is no breaking of the previous interface and no modification of pre-existing code where I could introduce further problems. A better name for validate_record would be welcome though if it helps clarify intent. The method itself doesn't care if a draft or a record is passed though, so any name is bound to be unclear unless we decide that a Draft (or a Record if the above paragraph wasn't convincing 😉 ) is what is passed to it OR we decide to use a name like recaft or some such new word :laugh:.

That being said, I am happy to place the change in validate 😄 if:

  1. we are fine with the breaking change* (and the potential risk of me breaking other stuff 🐘 ) AND
  2. the uncertainty described in my previous comment can be addressed satisfyingly. To recap that uncertainty (I don't know that I can describe it differently than I did, but I will summarize it):

To generate correct errors, Providers need to know the pid scheme (aka pid_type) they are used for. I would like to use self.pid_type BUT then why does RDM_PERSISTENT_IDENTIFIER_PROVIDERS exist? Presumably to mix-and-match providers and scheme (although that probably wouldn't work currently anyway). In which case pid_type won't be reliable in the future, then the separate method is probably a simpler, least-breaking, forward-looking change. Bringing someone else more familiar with that code will indeed be helpful here!

If 1) and 2) can be addressed, we can place the change in validate and make it take draft and record to gain the advantage listed above (validate(draft, record, ...)).

*: When I initially started work on this at the beginning of last week, I did introduce the change in validate and I want to say 8 files or so needed additional changes due to the breaking interface.

Copy link
Contributor

@ntarocco ntarocco Dec 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validation on draft was favored to sidestep the introduction of a subtle dependency on MetadataComponent when it wasn't needed....

While I understand what you say, having components as a list, where order counts, was vastly discussed when it was designed, concerns were raised and discussed. It was decided to go ahead with this architecture.
What you are not taking into account is that there could be another component, added in-between in the future, that could change the record metadata. You cannot predict that.

The provider should validate the final metadata that will be used to register the PID, not another version of the record (aka the draft).

The impetus for a separate method is based on the open for addition, closed to modification principle...

You need to put design patterns in context. With the change you want to make, you assume that you cannot change anything else. Unfortunately, you are adding more complexity and more confusion to the code, again IMO. We maintain the datacite plugin, we can make this change and document very well what is the change if someone else already implemented the PIDProvider interface. At the end of the day, this is what semantic versioning is here for.
Adding a new method won't break things, correct, but developers might have/want to implement the new method: finding now 2 methods for validation, they will be confused on what they have to validate and where.

To generate correct errors, Providers need to know the pid scheme (aka pid_type) they are used for.

Why? A concrete instance of a provider knows what is handling. In fact, you added the error Missing publisher field required for DOI registration..
By the way, the pid_type is in the constructor of the PIDProviderclass.

change in validate and make it take draft and record to gain the advantage listed above (validate(draft, record, ...)).

I would not do that for the reason explained above. I see no need to validate metadata on the draft.

*: When I initially started work on this at the beginning of last week, I did introduce the change in validate and I want to say 8 files or so needed additional changes due to the breaking interface.

To me, the only change you need is the error handling/return type. Nothing more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok after a lot of work: it turns out we can't change the errors to be a dict because we get a list of errors from schema.load and all other components (and other code) rely on that interface. But that means we can avoid breaking interfaces.

I am deprecating this PR in favor of this new one: #1143 . It should address the points above and solve a couple of other problems. See you over there!

def read(self, scheme, identifier, provider_name):
"""Read a pid."""
provider = self._get_provider(scheme, provider_name)
Expand Down
13 changes: 13 additions & 0 deletions invenio_rdm_records/services/pids/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,16 @@ def validate(self, record, identifier=None, provider=None, **kwargs):
pass

return True, []

def validate_record(self, record):
"""Validate the record according to the provider's rules.

By default it always validates. Descendants should override this method with
their own validation logic if need be.

:param record: A record-like (draft or published record)
:returns: A tuple (success, errors). `success` is a bool that specifies
if the validation was successful. `errors` is an
error dict of the form: `{"<fieldA>": ["<msgA1>", ...], ...}`.
"""
return True, {}
22 changes: 22 additions & 0 deletions invenio_rdm_records/services/pids/providers/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,25 @@ def validate(self, record, identifier=None, provider=None, **kwargs):
errors.append(str(e))

return (True, []) if not errors else (False, errors)

def validate_record(self, record):
"""Validate the record according to DataCite rules.

We only add check for values not already covered by the record schema.

:returns: A tuple (success, errors). `success` is a bool that specifies
if the validation was successful. `errors` is an
error dict of the form: `{"<fieldA>": ["<msgA1>", ...], ...}`.
"""
errors = {}

if not record["metadata"].get("publisher"):
errors.update(
{
"metadata.publisher": [
_("Missing publisher field required for DOI registration.")
],
}
)

return not bool(errors), errors
2 changes: 1 addition & 1 deletion invenio_rdm_records/services/pids/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _manager(self):
- limit side-effects (in case using pre-existing `pid_manager` would
cause some.)
"""
return self.manager_cls(self.config.pids_providers)
return self.manager_cls(self.config.pids_providers, self.config.pids_required)

@property
def pid_manager(self):
Expand Down
6 changes: 4 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,8 +533,6 @@ def minimal_record():
"enabled": False, # Most tests don't care about files
},
"metadata": {
"publication_date": "2020-06-01",
"resource_type": {"id": "image-photo"},
"creators": [
{
"person_or_org": {
Expand All @@ -550,6 +548,10 @@ def minimal_record():
},
},
],
"publication_date": "2020-06-01",
# because DATACITE_ENABLED is True, this field is required
"publisher": "Acme Inc",
"resource_type": {"id": "image-photo"},
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding publisher and reordering alphabetically for readability.

"title": "A Romans story",
},
}
Expand Down
1 change: 1 addition & 0 deletions tests/resources/serializers/test_dublincore_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def test_dublincorejson_serializer_minimal(running_app, updated_minimal_record):
"creators": ["Name", "Troy Inc."],
"dates": ["2020-06-01"],
"rights": ["info:eu-repo/semantics/openAccess"],
"publishers": ["Acme Inc"],
}

serializer = DublinCoreJSONSerializer()
Expand Down
4 changes: 4 additions & 0 deletions tests/services/components/test_pids_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ def validate(self, record, identifier=None, provider=None, **kwargs):
errors.append("Identifier must be an integer.")
return (True, []) if not errors else (False, errors)

def validate_record(self, record):
"""Validate the record according to PID rules i.e. always good."""
return True, []


# configs

Expand Down
47 changes: 40 additions & 7 deletions tests/services/pids/providers/test_datacite_pid_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,24 +195,57 @@ def custom_format_func(*args):
assert datacite_provider.create(record).pid_value == expected_result


def test_datacite_provider_validation(record, mocker):
client = DataCiteClient("datacite")

# check with default func
def test_datacite_provider_validate(record):
current_app.config["DATACITE_PREFIX"] = "10.1000"
client = DataCiteClient("datacite")
datacite_provider = DataCitePIDProvider("datacite", client=client)

# Case - Valid identifier (doi)
success, errors = datacite_provider.validate(
record=record, identifier="10.1000/valid.1234", provider="datacite"
)
assert success
assert errors == []
assert [] == errors
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(pet peeve: pytest shows the results of comparison tests like this with left-hand side as the expected result, so we should do the same to make reading test errors easier)


# Case - Invalid identifier (doi)
success, errors = datacite_provider.validate(
record=record, identifier="10.2000/invalid.1234", provider="datacite"
)

assert not success
assert errors == [
expected = [
"Wrong DOI 10.2000 prefix provided, "
+ "it should be 10.1000 as defined in the rest client"
]
assert expected == errors


def test_datacite_provider_validate_record(record):
record["metadata"] = {"publisher": "Acme Inc"}
current_app.config["DATACITE_PREFIX"] = "10.1000"
client = DataCiteClient("datacite")
datacite_provider = DataCitePIDProvider("datacite", client=client)

# Case - valid new record without pids.doi
success, errors = datacite_provider.validate_record(record)
assert {} == errors
assert success

# Case - valid record with pre-existing pids.doi
record["pids"] = {
"doi": {"provider": "datacite", "identifier": "10.1000/pre-existing.1234"}
}
success, errors = datacite_provider.validate_record(record)
assert {} == errors
assert success

# Case - invalid record
del record["metadata"]["publisher"]
success, errors = datacite_provider.validate_record(record)
expected = {
"metadata.publisher": [
"Missing publisher field required for DOI registration."
],
}

assert expected == errors
assert not success