-
Notifications
You must be signed in to change notification settings - Fork 87
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
providers: add validate_record to validate record according to provider [+] #1140
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -19,9 +19,10 @@ | |||||
class PIDManager: | ||||||
"""RDM PIDs Manager.""" | ||||||
|
||||||
def __init__(self, providers): | ||||||
def __init__(self, providers, required_schemes=None):
    """Constructor for PIDManager.

    :param providers: the configured PID providers; stored on the manager
        unchanged.
    :param required_schemes: optional iterable of PID schemes that are
        required. ``None`` means "no required schemes".
    """
    self._providers = providers
    # Normalize ``None`` to an empty list so that code iterating over the
    # required schemes never has to special-case a missing configuration.
    self._required_schemes = [] if required_schemes is None else required_schemes
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's legitimate for the PIDManager to know which PID schemes are required. |
||||||
|
||||||
def _get_provider(self, scheme, provider_name=None): | ||||||
"""Get a provider.""" | ||||||
|
@@ -47,17 +48,6 @@ def _validate_pids_schemes(self, pids): | |||||
if unknown_schemes: | ||||||
raise PIDSchemeNotSupportedError(unknown_schemes) | ||||||
|
||||||
def _validate_pids(self, pids, record, errors):
    """Run provider-level validation on every PID of the mapping.

    Each PID is checked by the provider resolved from its scheme and its
    optional ``provider`` name. A failed check appends an entry with the
    field ``pids.<scheme>`` and the provider's messages to ``errors``.

    All PID schemes are assumed to be supported by the system.
    """
    for scheme, pid_data in pids.items():
        handler = self._get_provider(scheme, pid_data.get("provider"))
        is_valid, messages = handler.validate(record=record, **pid_data)
        if is_valid:
            continue
        errors.append({"field": f"pids.{scheme}", "messages": messages})
|
||||||
def _validate_identifiers(self, pids, errors): | ||||||
"""Validate and normalize identifiers.""" | ||||||
# TODO: Refactor to get it injected instead. | ||||||
|
@@ -87,6 +77,17 @@ def _validate_identifiers(self, pids, errors): | |||||
for scheme, id_ in identifiers: | ||||||
pids[scheme]["identifier"] = id_ | ||||||
|
||||||
def _validate_pids(self, pids, record, errors):
    """Run provider-level validation on every PID of the mapping.

    Each PID is checked by the provider resolved from its scheme and its
    optional ``provider`` name. A failed check appends an entry with the
    field ``pids.<scheme>`` and the provider's messages to ``errors``.

    All PID schemes are assumed to be supported by the system.
    """
    for scheme, pid_data in pids.items():
        handler = self._get_provider(scheme, pid_data.get("provider"))
        is_valid, messages = handler.validate(record=record, **pid_data)
        if is_valid:
            continue
        errors.append({"field": f"pids.{scheme}", "messages": messages})
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just reordering to make the file easier to read as a developer. |
||||||
def validate(self, pids, record, errors=None, raise_errors=False): | ||||||
"""Validate PIDs.""" | ||||||
errors = [] if errors is None else errors | ||||||
|
@@ -97,6 +98,43 @@ def validate(self, pids, record, errors=None, raise_errors=False): | |||||
if raise_errors and errors: | ||||||
raise ValidationError(message=errors) | ||||||
|
||||||
def validate_record(self, record, raise_errors=False):
    """Validate the record according to the PIDs' requirements.

    Checks that the record is compatible from the point of view of the
    PIDs...

    - ... it already contains
    - ... it would contain according to the configured required schemes

    The responsibility lies with each provider since they are the ones
    that know their criteria for a record that is complete enough to get
    a PID.

    :param record: the record to validate. Its ``pids`` entry, when
        present, maps each scheme to a dict that may carry a ``provider``
        name.
    :param raise_errors: when true, raise if any provider reported errors.
    :raises ValidationError: if ``raise_errors`` is true and at least one
        provider reported errors.
    """
    errors = {}

    # (scheme, provider_name) for the record's own pids; a missing or
    # ``None`` ``pids`` entry is treated as "no pids".
    scheme_names = [
        (scheme, pid.get("provider"))
        for scheme, pid in (record.get("pids") or {}).items()
    ]
    # (scheme, None) for required pids; a manager built without required
    # schemes holds ``None``, which must not break the iteration.
    scheme_names += [(scheme, None) for scheme in self._required_schemes or ()]

    # Resolve every provider up front (so lookup failures surface before
    # any validation runs) and keep each provider instance only once: a
    # scheme that is both on the record and required would otherwise be
    # validated twice by the very same provider.
    providers = []
    for scheme, provider_name in scheme_names:
        provider = self._get_provider(scheme, provider_name)
        if not any(provider is known for known in providers):
            providers.append(provider)

    for provider in providers:
        success, provider_errors = provider.validate_record(record)
        if not success:
            # This is not perfect, as one provider may override the error
            # of another, but a proper dict merging algorithm is out of
            # scope here; as long as an error is raised we are good.
            errors.update(provider_errors)

    if raise_errors and errors:
        raise ValidationError(message=errors)
|
||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's the crux of the feature. It's slightly different than the original idea of modifying the Note that Further note however, that the frontend doesn't associate the error with the field despite this. The frontend will display the top-level error at least: and publication is prevented which are the important aspects. The assignment of errors to fields is a wider frontend issue (e.g., inveniosoftware/invenio-app-rdm#1353 or inveniosoftware/react-invenio-deposit#403), so keeping it out-of-bounds of this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get the point of clear intention but now I think we have split the validation mechanism into 2 independent methods thus losing the self-contained way to validate and we moved the responsibility from the PID manager to the PID component. I think I would include the
Suggested change
and retrieve the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is going to be long, grab some tea 😉 🍵
I don't think so. My understanding is that https://github.com/inveniosoftware/invenio-rdm-records/blob/master/invenio_rdm_records/services/pids/providers/base.py#L153 checks if your draft submitted for publication is trying to change the current pid under validation to point to another record than its associated published one. So the associated published record needs to be passed (not just the draft) to compare it against. And it just so happens that for drafts with no associated published record (completely new drafts), the PID will not exist in the DB in the first place, so This is all pretty hard to follow. That's why I think creating a separate method with clear intent is much better here. It allows us to isolate ourselves from this. And this is not just nice clarity, it also has to do with the interface change. I literally started by putting the change in If a provider already knows their pid scheme (also called That being said, in all seriousness, as usual, I probably don't understand the real intent of the code. That means we probably can't rely on the So we are back to code intent and back to separate methods 😄 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is hard for me to agree that a second method If you are seeking for clarity, then you would have to rename The only thing that I have noticed in your changes that might be a sign that we need a separate method is the different return error type, which changes from a list to a dict, one error per field. This makes sense. To summarize:
Let's try to involve another reviewer to have an extra opinion :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A lot to unpack here 😄 This is a good place to start:
Validation on Back to the question of using a separate method That being said, I am happy to place the change in
To generate correct errors, Providers need to know the pid If 1) and 2) can be addressed, we can place the change in *: When I initially started work on this at the beginning of last week, I did introduce the change in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
While I understand what you say, having components as a list, where order counts, was vastly discussed when it was designed, concerns were raised and discussed. It was decided to go ahead with this architecture. The provider should validate the final metadata that will be used to register the PID, not another version of the record (aka the draft).
You need to put design patterns in context. With the change you want to make, you assume that you cannot change anything else. Unfortunately, you are adding more complexity and more confusion to the code, again IMO. We maintain the
Why? A concrete instance of a provider knows what is handling. In fact, you added the error
I would not do that for the reason explained above. I see no need to validate metadata on the draft.
To me, the only change you need is the error handling/return type. Nothing more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok after a lot of work: it turns out we can't change the errors to be a dict because we get a list of errors from I am deprecating this PR in favor of this new one: #1143 . It should address the points above and solve a couple of other problems. See you over there! |
||||||
def read(self, scheme, identifier, provider_name): | ||||||
"""Read a pid.""" | ||||||
provider = self._get_provider(scheme, provider_name) | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -533,8 +533,6 @@ def minimal_record(): | |
"enabled": False, # Most tests don't care about files | ||
}, | ||
"metadata": { | ||
"publication_date": "2020-06-01", | ||
"resource_type": {"id": "image-photo"}, | ||
"creators": [ | ||
{ | ||
"person_or_org": { | ||
|
@@ -550,6 +548,10 @@ def minimal_record(): | |
}, | ||
}, | ||
], | ||
"publication_date": "2020-06-01", | ||
# because DATACITE_ENABLED is True, this field is required | ||
"publisher": "Acme Inc", | ||
"resource_type": {"id": "image-photo"}, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Adding the publisher and reordering the keys alphabetically for readability. |
||
"title": "A Romans story", | ||
}, | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -195,24 +195,57 @@ def custom_format_func(*args): | |
assert datacite_provider.create(record).pid_value == expected_result | ||
|
||
|
||
def test_datacite_provider_validation(record, mocker): | ||
client = DataCiteClient("datacite") | ||
|
||
# check with default func | ||
def test_datacite_provider_validate(record): | ||
current_app.config["DATACITE_PREFIX"] = "10.1000" | ||
client = DataCiteClient("datacite") | ||
datacite_provider = DataCitePIDProvider("datacite", client=client) | ||
|
||
# Case - Valid identifier (doi) | ||
success, errors = datacite_provider.validate( | ||
record=record, identifier="10.1000/valid.1234", provider="datacite" | ||
) | ||
assert success | ||
assert errors == [] | ||
assert [] == errors | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (Pet peeve: pytest shows the results of comparison tests like this with the left-hand side as the expected result, so we should do the same to make reading test errors easier.) |
||
|
||
# Case - Invalid identifier (doi) | ||
success, errors = datacite_provider.validate( | ||
record=record, identifier="10.2000/invalid.1234", provider="datacite" | ||
) | ||
|
||
assert not success | ||
assert errors == [ | ||
expected = [ | ||
"Wrong DOI 10.2000 prefix provided, " | ||
+ "it should be 10.1000 as defined in the rest client" | ||
] | ||
assert expected == errors | ||
|
||
|
||
def test_datacite_provider_validate_record(record):
    """Exercise the DataCite provider's record-level validation.

    Covers a valid record without a DOI, a valid record with a
    pre-existing DOI, and a record missing the publisher.
    """
    record["metadata"] = {"publisher": "Acme Inc"}
    current_app.config["DATACITE_PREFIX"] = "10.1000"
    provider = DataCitePIDProvider("datacite", client=DataCiteClient("datacite"))

    # Case - valid new record without pids.doi
    is_valid, found = provider.validate_record(record)
    assert {} == found
    assert is_valid

    # Case - valid record with pre-existing pids.doi
    existing_doi = {
        "provider": "datacite",
        "identifier": "10.1000/pre-existing.1234",
    }
    record["pids"] = {"doi": existing_doi}
    is_valid, found = provider.validate_record(record)
    assert {} == found
    assert is_valid

    # Case - invalid record
    del record["metadata"]["publisher"]
    is_valid, found = provider.validate_record(record)
    missing_publisher = "Missing publisher field required for DOI registration."

    assert {"metadata.publisher": [missing_publisher]} == found
    assert not is_valid
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just for readability