Skip to content

Commit 1fb1ed7

Browse files
committed
Add enricher tests
1 parent 4c07362 commit 1fb1ed7

10 files changed

+632
-3
lines changed

__init__.py

Whitespace-only changes.

pipeline/enrichers/doi.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def recover_doi(body, field):
2727
if translated != doi:
2828
doi = translated
2929
update_at_path(body, path, doi)
30-
field.events.append(make_event(type="enrichment", code="unicode", initial_value=initial, value=doi, result="enriched"))
30+
field.events.append(make_event(type="enrichment", code="recovery", initial_value=initial, value=doi, result="enriched"))
3131
field.value = doi
3232

3333
if doi.startswith(VALID_STARTS):

pipeline/enrichers/isbn.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def recover_isbn(body, field):
3333
update_at_path(body, field.path, res[0])
3434
field.value = res[0]
3535
field.enrichment_status = "enriched"
36-
field.events.append(make_event(type="enrichment", code="split", value=res[0], initial_value=isbn, result="enriched"))
36+
field.events.append(make_event(type="enrichment", code="recovery", value=res[0], initial_value=isbn, result="enriched"))
3737

3838
if field.enrichment_status != 'enriched':
3939
field.enrichment_status = 'unsuccessful'

pipeline/enrichers/orcid.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def recover_orcid(body, field):
3737
orcid_list = [h.group() for h in hit]
3838
if len(orcid_list) != 0:
3939
if orcid_list[0] != orcid:
40-
field.events.append(make_event(type="enrichment", code="split", value=orcid_list[0], initial_value=orcid, result="enriched"))
40+
field.events.append(make_event(type="enrichment", code="recovery", value=orcid_list[0], initial_value=orcid, result="enriched"))
4141
update_at_path(body, path, orcid_list[0])
4242
field.enrichment_status = 'enriched'
4343
field.value = orcid_list[0]

pipeline/tests/enrichers/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
from pipeline.validate import FieldMeta
2+
from pipeline.enrichers.doi import recover_doi
3+
4+
5+
def test_doi_enrich_remove_whitespace():
    """A space embedded before the slash is expected to be dropped from the DOI."""
    _test_doi_enrichment(
        test_data="10.5468 /ogs.2010.59.10.1",
        expected_result="10.5468/ogs.2010.59.10.1",
        expected_codes=["recovery"],
    )
9+
10+
11+
def test_doi_enrich_remove_unicode_cat_cc():
    """A trailing tab (Unicode category Cc) is expected to be stripped."""
    _test_doi_enrichment(
        test_data="10.10/invalid\t",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
16+
17+
18+
def test_doi_enrich_remove_unicode_cat_cf():
    """A trailing zero width space (Unicode category Cf) is expected to be stripped."""
    _test_doi_enrichment(
        test_data="10.10/invalid\u200b",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
23+
24+
25+
def test_doi_enrich_remove_unicode_cat_zl():
    """A trailing line separator (Unicode category Zl) is expected to be stripped."""
    _test_doi_enrichment(
        test_data="10.10/invalid\u2028",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
30+
31+
32+
def test_doi_enrich_remove_unicode_cat_zp():
    """A trailing paragraph separator (Unicode category Zp) is expected to be stripped."""
    _test_doi_enrichment(
        test_data="10.10/invalid\u2029",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
37+
38+
39+
def test_doi_enrich_remove_unicode_cat_zs():
    """A trailing no-break space (Unicode category Zs) is expected to be stripped.

    The regular space character belongs to the same category.
    """
    _test_doi_enrichment(
        test_data="10.10/invalid\xa0",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
45+
46+
47+
def test_doi_enrich_replace_fraction_slash():
    """A fraction slash (U+2044) is expected to be swapped for a regular slash."""
    _test_doi_enrichment(
        test_data="10.10\u2044invalid",
        expected_result="10.10/invalid",
        expected_codes=["recovery"],
    )
52+
53+
54+
def test_doi_enrich_remove_prefix():
    """An invalid "Doi:" prefix is expected to be removed, leaving the bare DOI."""
    _test_doi_enrichment(
        test_data="Doi:10.10/valid",
        expected_result="10.10/valid",
        expected_codes=["recovery"],
    )
59+
60+
61+
def test_doi_enrich_valid_prefixes():
    """DOIs carrying an accepted URL prefix must pass through untouched.

    Both the https and the http form of the doi.org prefix are checked: after
    ``recover_doi`` runs, the field value must equal the input and no events
    may have been recorded on the field.
    """
    # The two cases differ only in the URL scheme, so they share one
    # arrange/act/assert sequence instead of a copy-pasted duplicate.
    for test_data in ("https://doi.org/10.10/valid", "http://doi.org/10.10/valid"):
        expected_result = test_data
        body = {"partOf": [{"identifiedBy": [{"value": test_data}]}]}

        field = FieldMeta(
            value=test_data,
            normalization_status="unchanged",
            validation_status="valid",
            path="partOf.[0].identifiedBy.[0].value",
        )

        recover_doi(body, field)
        assert field.value == expected_result
        assert field.events == []
92+
93+
94+
def _test_doi_enrichment(test_data, expected_result, expected_codes):
    """Run ``recover_doi`` on ``test_data`` and check the resulting field.

    The field value must equal ``expected_result``. If any events were
    recorded, the first must have type "enrichment" and every event must
    carry ``test_data`` as its initial value and a code that appears in
    ``expected_codes``. A run that records no events only has its value
    checked.
    """
    meta = FieldMeta(
        value=test_data,
        normalization_status="unchanged",
        validation_status="valid",
        path="partOf.[0].identifiedBy.[0].value",
    )
    document = {"partOf": [{"identifiedBy": [{"value": test_data}]}]}

    recover_doi(document, meta)

    assert meta.value == expected_result
    if len(meta.events) == 0:
        # Nothing recorded: there are no events to inspect.
        return
    assert meta.events[0]["type"] == "enrichment"
    for entry in meta.events:
        assert entry["initial_value"] == test_data
        assert entry["code"] in expected_codes
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
from pipeline.validate import FieldMeta
2+
from pipeline.enrichers.isbn import recover_isbn
3+
4+
5+
def test_isbn_with_prefix():
    """A hyphenated ISBN preceded by "ISBN " is expected to be extracted."""
    _test_isbn_enrichment(
        test_data="ISBN 978-91-86857-16-5",
        expected_result="978-91-86857-16-5",
        expected_codes=["recovery"],
    )
9+
10+
11+
def test_isbn_with_prefix_and_suffix():
    """An ISBN wrapped in an "ISBN: " prefix and a " (pdf)" suffix is expected to be extracted."""
    _test_isbn_enrichment(
        test_data="ISBN: 978-91-7740-107-0 (pdf)",
        expected_result="978-91-7740-107-0",
        expected_codes=["recovery"],
    )
15+
16+
17+
def test_isbn10_with_prefix_and_numeric_control():
    """A ten-digit ISBN ending in a digit is expected to be extracted from behind "ISBN "."""
    _test_isbn_enrichment(
        test_data="ISBN 0415579961",
        expected_result="0415579961",
        expected_codes=["recovery"],
    )
21+
22+
23+
def test_isbn10_with_prefix_and_lowercase_x_control():
    """A ten-character ISBN ending in a lowercase "x" is expected to be extracted as-is."""
    _test_isbn_enrichment(
        test_data="ISBN 041557996x",
        expected_result="041557996x",
        expected_codes=["recovery"],
    )
27+
28+
29+
def test_isbn10_with_prefix_and_capital_X_control():
    """A ten-character ISBN ending in a capital "X" is expected to be extracted as-is."""
    _test_isbn_enrichment(
        test_data="ISBN: 917291341X",
        expected_result="917291341X",
        expected_codes=["recovery"],
    )
33+
34+
35+
def test_isbn13_with_prefix():
    """A hyphenated ISBN ending in "X" is expected to be extracted from behind "ISBN: "."""
    # NOTE(review): the value holds nine digits plus "X", which looks like a
    # hyphenated ISBN-10 rather than the ISBN-13 the test name suggests -
    # confirm whether the name or the data is what was intended.
    _test_isbn_enrichment(
        test_data="ISBN: 9172-91-341-X",
        expected_result="9172-91-341-X",
        expected_codes=["recovery"],
    )
39+
40+
41+
def test_isbn13_with_numerical_prefix():
    """A thirteen-digit ISBN preceded by "13 " is expected to be extracted without the prefix."""
    _test_isbn_enrichment(
        test_data="13 9780853583080",
        expected_result="9780853583080",
        expected_codes=["recovery"],
    )
45+
46+
47+
# TODO: Fix split in enricher
48+
# def test_dual_isbn():
49+
# test_data = '0415579961, 9780415579964'
50+
# original_field_data = {'value': test_data, 'enrichment_status': 'pending', 'path': '<path>',
51+
# 'validation_status': 'true', 'events': []}
52+
# field, actions = isbn_enricher(FieldMetadata(original_field_data))
53+
# assert field.value == '0415579961'
54+
# action1 = actions[0]
55+
# event1 = field.events[0]
56+
# assert isinstance(action1, DuplicateFieldAction)
57+
# assert action1.field_type == 'isbn'
58+
# assert event1['new_value'] == '0415579961'
59+
# assert event1['old_value'] == '0415579961, 9780415579964'
60+
# assert event1['code'] == 'recovery'
61+
# action2 = actions[1]
62+
# event2 = field.events[1]
63+
# assert isinstance(action2, ChangeAction)
64+
# assert action2.field_type == 'isbn'
65+
# assert event2['created_value'] == '9780415579964'
66+
# assert event2['old_value'] == '0415579961, 9780415579964'
67+
# assert event2['code'] == 'split'
68+
69+
70+
def test_isbn_non_recoverable_wrong_length_and_strange_prefix():
    """A value with an odd prefix and the wrong length must be left unenriched."""
    _test_none_isbn_enrichment("2-s2.0-84863732124")
73+
74+
75+
def test_isbn_non_recoverable_issn():
    """An ISSN-shaped value must be left unenriched."""
    _test_none_isbn_enrichment("0022-2380")
78+
79+
80+
def test_isbn_non_recoverable_doi():
    """A DOI-shaped value must be left unenriched."""
    _test_none_isbn_enrichment("10.1046/j.1365-2834.1999.00138.x")
83+
84+
85+
def test_isbn_non_recoverable_to_many_digits():
    """A twenty-digit string has too many digits and must be left unenriched."""
    _test_none_isbn_enrichment("12345678901234567890")
88+
89+
90+
def test_isbn_non_recoverable_doi_containing_isbn():
    """A DOI that embeds an ISBN-like tail must be left unenriched."""
    _test_none_isbn_enrichment("10.1046/j.978-91-86857-16-5")
93+
94+
95+
def test_isbn_non_recoverable_numerical_prefix_without_delimiter():
    """A "13" prefix glued directly onto the ISBN must be left unenriched."""
    _test_none_isbn_enrichment("13978-91-975576-6-5")
98+
99+
100+
def _test_isbn_enrichment(test_data, expected_result, expected_codes):
    """Run ``recover_isbn`` on ``test_data`` and check the resulting field.

    The field starts with enrichment status "pending". Afterwards its value
    must equal ``expected_result``. If any events were recorded, the first
    must have type "enrichment" and every event must carry ``test_data`` as
    its initial value and a code that appears in ``expected_codes``.
    """
    meta = FieldMeta(
        value=test_data,
        normalization_status="unchanged",
        validation_status="valid",
        enrichment_status="pending",
        path="identifiedBy.[0].value",
    )
    document = {"identifiedBy": [{"value": test_data}]}

    recover_isbn(document, meta)

    assert meta.value == expected_result
    if len(meta.events) == 0:
        # Nothing recorded: there are no events to inspect.
        return
    assert meta.events[0]["type"] == "enrichment"
    for entry in meta.events:
        assert entry["initial_value"] == test_data
        assert entry["code"] in expected_codes
119+
120+
121+
def _test_none_isbn_enrichment(test_data):
    """Check that ``recover_isbn`` leaves ``test_data`` alone.

    After the call the field value must still equal ``test_data``, the field
    must not report itself as enriched, and no events may have been recorded.
    """
    meta = FieldMeta(
        value=test_data,
        normalization_status="unchanged",
        validation_status="valid",
        enrichment_status="pending",
        path="identifiedBy.[0].value",
    )
    document = {"identifiedBy": [{"value": test_data}]}

    recover_isbn(document, meta)

    assert meta.value == test_data
    assert not meta.is_enriched()
    assert len(meta.events) == 0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from pipeline.validate import FieldMeta
2+
from pipeline.enrichers.isi import recover_isi
3+
4+
5+
def test_correct_isi():
    """An ISI value with nothing around it must come back unchanged and unenriched."""
    _test_none_isi_enrichment("000123456789123")
8+
9+
10+
def test_correct_isi_with_prefix():
    """An ISI value preceded by "ISI: " is expected to be extracted."""
    _test_isi_enrichment(
        test_data="ISI: 000321123456789",
        expected_result="000321123456789",
        expected_codes=["recovery"],
    )
14+
15+
16+
def test_correct_isi_with_suffix():
    """An ISI value followed by " checked" is expected to be extracted."""
    _test_isi_enrichment(
        test_data="A19123456789XYZ checked",
        expected_result="A19123456789XYZ",
        expected_codes=["recovery"],
    )
20+
21+
22+
def test_correct_isi_with_incorrect_first_sign():
    """An ISI value with a stray leading colon is expected to be extracted without it."""
    _test_isi_enrichment(
        test_data=":000321123456789",
        expected_result="000321123456789",
        expected_codes=["recovery"],
    )
26+
27+
28+
def test_isi_non_recoverable_values_incorrect_prefix_001_instead_of_000():
    """A 15-character value starting with "001" instead of "000" must be left unenriched."""
    # The previous fixture was the ISBN-style string "13978-91-975576-6-5",
    # which never exercised the "001" prefix this test is named for. The value
    # below mirrors the accepted "000123456789123" form with only the prefix
    # changed.
    test_data = "001123456789123"
    _test_none_isi_enrichment(test_data)
31+
32+
33+
def test_isi_non_recoverable_values_incorrect_prefix_8_instead_of_9():
    """A value starting with "A18" instead of "A19" must be left unenriched."""
    _test_none_isi_enrichment("A18123456789123")
36+
37+
38+
def test_isi_non_recoverable_values_to_short():
    """A fourteen-character value is too short and must be left unenriched."""
    _test_none_isi_enrichment("00012345678912")
41+
42+
43+
def test_isi_non_recoverable_values_to_long():
    """A seventeen-character value is too long and must be left unenriched."""
    _test_none_isi_enrichment("00012345678912345")
46+
47+
48+
def test_isi_non_recoverable_values_invalid_characters():
    """A value containing punctuation characters must be left unenriched."""
    _test_none_isi_enrichment("A1912A456!@#$%^")
51+
52+
53+
def test_isi_non_recoverable_values_incorrect_prefix_000_with_non_numeric_characters():
    """A "000"-prefixed value containing a letter must be left unenriched."""
    _test_none_isi_enrichment("000A23456789123")
56+
57+
58+
def _test_isi_enrichment(test_data, expected_result, expected_codes):
    """Run ``recover_isi`` on ``test_data`` and check the resulting field.

    The field starts with enrichment status "pending". Afterwards its value
    must equal ``expected_result``. If any events were recorded, the first
    must have type "enrichment" and every event must carry ``test_data`` as
    its initial value and a code that appears in ``expected_codes``.
    """
    meta = FieldMeta(
        value=test_data,
        normalization_status="unchanged",
        validation_status="valid",
        enrichment_status="pending",
        path="identifiedBy.[0].value",
    )
    document = {"identifiedBy": [{"value": test_data}]}

    recover_isi(document, meta)

    assert meta.value == expected_result
    if len(meta.events) == 0:
        # Nothing recorded: there are no events to inspect.
        return
    assert meta.events[0]["type"] == "enrichment"
    for entry in meta.events:
        assert entry["initial_value"] == test_data
        assert entry["code"] in expected_codes
77+
78+
79+
def _test_none_isi_enrichment(test_data):
    """Check that ``recover_isi`` leaves ``test_data`` alone.

    After the call the field value must still equal ``test_data``, the field
    must not report itself as enriched, and no events may have been recorded.
    """
    meta = FieldMeta(
        value=test_data,
        normalization_status="unchanged",
        validation_status="valid",
        enrichment_status="pending",
        path="identifiedBy.[0].value",
    )
    document = {"identifiedBy": [{"value": test_data}]}

    recover_isi(document, meta)

    assert meta.value == test_data
    assert not meta.is_enriched()
    assert len(meta.events) == 0

0 commit comments

Comments
 (0)