Skip to content

Commit 3a40b35

Browse files
committed
OAIClient.get_records catches RequestException
Why these changes are being introduced: A harvest recently failed when the requests library raised a ConnectTimeout exception. We were formerly handling only HTTPError, which ConnectTimeout does not inherit from. The unhandled exception aborted the harvest instead of gracefully skipping and reporting the record and then continuing. How this addresses that need: * OAIClient.get_records() now catches the base RequestException class, which covers both HTTP and connection errors. Side effects of this change: * Connection errors when retrieving a record will be logged and skipped, and will not fail the full harvest. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/IN-1038
1 parent 4915456 commit 3a40b35

13 files changed

+5342
-553
lines changed

Pipfile.lock

+614-551
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

harvester/oai.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import Any, Literal
99

1010
import smart_open
11-
from requests import HTTPError
11+
from requests import RequestException
1212
from sickle import Sickle
1313
from sickle.models import Record
1414
from sickle.oaiexceptions import IdDoesNotExist, OAIError
@@ -99,12 +99,13 @@ def get_records(
9999
identifier,
100100
)
101101
continue
102+
102103
try:
103104
record = self.client.GetRecord(
104105
identifier=identifier, metadataPrefix=self.metadata_format
105106
)
106107
logger.debug("Record retrieved: %s", identifier)
107-
except (HTTPError, OAIError) as e:
108+
except (RequestException, OAIError) as e:
108109
logger.warning(
109110
"GetRecord error for identifier %s, reporting to Sentry", identifier
110111
)

tests/test_oai.py

+31
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1+
# ruff: noqa: D205, D209
2+
3+
from unittest import mock
4+
15
import pytest
26
import vcr
7+
from requests.exceptions import ConnectTimeout, HTTPError
38
from sickle import Sickle
49
from sickle.oaiexceptions import NoRecordsMatch
510

@@ -272,3 +277,29 @@ def test_aborted_harvest_with_max_errors_reached_and_report(
272277
)
273278
)
274279
assert mock_sentry_capture_message.called
280+
281+
282+
@pytest.mark.parametrize(
283+
"request_lib_exception",
284+
[ConnectTimeout, HTTPError],
285+
ids=["ConnectTimeout", "HTTPError"],
286+
)
287+
def test_get_records_handles_requests_lib_errors(
288+
request_lib_exception, mock_sentry_capture_message
289+
):
290+
"""Tests both ConnectTimeout and HTTPError exceptions raised by requests library
291+
will get handled gracefully by OAIClient.get_records(). Both of these inherit from
292+
the more base RequestException. Error handling was previously missing connection
293+
errors by focusing only on HTTPError."""
294+
with mock.patch("sickle.app.Sickle.GetRecord") as mocked_sickle_get_record:
295+
mocked_sickle_get_record.side_effect = request_lib_exception()
296+
oai_client = OAIClient(
297+
"https://dspace.mit.edu/oai/request",
298+
metadata_format="oai_dc",
299+
retry_status_codes=(),
300+
)
301+
identifiers = ["oai:not-real:will-fail"]
302+
records = list(oai_client.get_records(identifier for identifier in identifiers))
303+
expected_records_count = 0
304+
assert len(records) == expected_records_count
305+
assert mock_sentry_capture_message.called
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept:
6+
- '*/*'
7+
Accept-Encoding:
8+
- gzip, deflate
9+
Connection:
10+
- keep-alive
11+
User-Agent:
12+
- python-requests/2.32.3
13+
method: GET
14+
uri: https://dspace.mit.edu/oai/request?metadataPrefix=oai_dc&from=2021-12-26&until=2021-12-26&set=hdl_1721.1_49432&verb=ListIdentifiers
15+
response:
16+
body:
17+
string: <?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl"
18+
href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
19+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
20+
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-08-21T17:16:17Z</responseDate><request
21+
verb="ListIdentifiers" metadataPrefix="oai_dc" from="2021-12-26T00:00:00Z"
22+
until="2021-12-26T00:00:00Z" set="hdl_1721.1_49432">https://dspace.mit.edu//oai/request</request><error
23+
code="noRecordsMatch">No matches for the query</error></OAI-PMH>
24+
headers:
25+
Connection:
26+
- close
27+
Content-Encoding:
28+
- gzip
29+
Content-Type:
30+
- text/xml;charset=UTF-8
31+
Date:
32+
- Wed, 21 Aug 2024 17:16:17 GMT
33+
Set-cookie:
34+
- HttpOnly;Secure
35+
Strict-Transport-Security:
36+
- max-age=63072000
37+
Transfer-Encoding:
38+
- chunked
39+
Vary:
40+
- Accept-Encoding
41+
X-Content-Type-Options:
42+
- nosniff
43+
X-Frame-Options:
44+
- SAMEORIGIN
45+
X-XSS-Protection:
46+
- 1; mode=block
47+
status:
48+
code: 200
49+
message: OK
50+
version: 1

tests/tests/fixtures/vcr_cassettes/get-identifiers.yaml

+97
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept:
6+
- '*/*'
7+
Accept-Encoding:
8+
- gzip, deflate
9+
Connection:
10+
- keep-alive
11+
User-Agent:
12+
- python-requests/2.32.3
13+
method: GET
14+
uri: https://dspace.mit.edu/oai/request?metadataPrefix=oai_dc&from=2017-12-14&until=2017-12-14&verb=ListIdentifiers
15+
response:
16+
body:
17+
string: <?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl"
18+
href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
19+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
20+
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-08-21T17:16:17Z</responseDate><request
21+
verb="ListIdentifiers" metadataPrefix="oai_dc" from="2017-12-14T00:00:00Z"
22+
until="2017-12-14T00:00:00Z">https://dspace.mit.edu//oai/request</request><ListIdentifiers><header
23+
status="deleted"><identifier>oai:dspace.mit.edu:1721.1/112746</identifier><datestamp>2017-12-14T15:03:59Z</datestamp><setSpec>com_1721.1_7803</setSpec><setSpec>hdl_1721.1_7803</setSpec><setSpec>col_1721.1_42001</setSpec><setSpec>hdl_1721.1_42001</setSpec></header></ListIdentifiers></OAI-PMH>
24+
headers:
25+
Connection:
26+
- close
27+
Content-Encoding:
28+
- gzip
29+
Content-Type:
30+
- text/xml;charset=UTF-8
31+
Date:
32+
- Wed, 21 Aug 2024 17:16:17 GMT
33+
Set-cookie:
34+
- HttpOnly;Secure
35+
Strict-Transport-Security:
36+
- max-age=63072000
37+
Transfer-Encoding:
38+
- chunked
39+
Vary:
40+
- Accept-Encoding
41+
X-Content-Type-Options:
42+
- nosniff
43+
X-Frame-Options:
44+
- SAMEORIGIN
45+
X-XSS-Protection:
46+
- 1; mode=block
47+
status:
48+
code: 200
49+
message: OK
50+
version: 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept:
6+
- '*/*'
7+
Accept-Encoding:
8+
- gzip, deflate
9+
Connection:
10+
- keep-alive
11+
User-Agent:
12+
- python-requests/2.32.3
13+
method: GET
14+
uri: https://dspace.mit.edu/oai/request?metadataPrefix=oai_dc&from=2017-12-14&until=2017-12-14&verb=ListIdentifiers
15+
response:
16+
body:
17+
string: <?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl"
18+
href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
19+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
20+
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-08-21T17:16:17Z</responseDate><request
21+
verb="ListIdentifiers" metadataPrefix="oai_dc" from="2017-12-14T00:00:00Z"
22+
until="2017-12-14T00:00:00Z">https://dspace.mit.edu//oai/request</request><ListIdentifiers><header
23+
status="deleted"><identifier>oai:dspace.mit.edu:1721.1/112746</identifier><datestamp>2017-12-14T15:03:59Z</datestamp><setSpec>com_1721.1_7803</setSpec><setSpec>hdl_1721.1_7803</setSpec><setSpec>col_1721.1_42001</setSpec><setSpec>hdl_1721.1_42001</setSpec></header></ListIdentifiers></OAI-PMH>
24+
headers:
25+
Connection:
26+
- close
27+
Content-Encoding:
28+
- gzip
29+
Content-Type:
30+
- text/xml;charset=UTF-8
31+
Date:
32+
- Wed, 21 Aug 2024 17:16:17 GMT
33+
Set-cookie:
34+
- HttpOnly;Secure
35+
Strict-Transport-Security:
36+
- max-age=63072000
37+
Transfer-Encoding:
38+
- chunked
39+
Vary:
40+
- Accept-Encoding
41+
X-Content-Type-Options:
42+
- nosniff
43+
X-Frame-Options:
44+
- SAMEORIGIN
45+
X-XSS-Protection:
46+
- 1; mode=block
47+
status:
48+
code: 200
49+
message: OK
50+
- request:
51+
body: null
52+
headers:
53+
Accept:
54+
- '*/*'
55+
Accept-Encoding:
56+
- gzip, deflate
57+
Connection:
58+
- keep-alive
59+
User-Agent:
60+
- python-requests/2.32.3
61+
method: GET
62+
uri: https://dspace.mit.edu/oai/request?identifier=oai%3Adspace.mit.edu%3A1721.1%2F112746&metadataPrefix=oai_dc&verb=GetRecord
63+
response:
64+
body:
65+
string: <?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl"
66+
href="static/style.xsl"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/"
67+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/
68+
http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-08-21T17:16:17Z</responseDate><request
69+
verb="GetRecord" identifier="oai:dspace.mit.edu:1721.1/112746" metadataPrefix="oai_dc">https://dspace.mit.edu//oai/request</request><GetRecord><record><header
70+
status="deleted"><identifier>oai:dspace.mit.edu:1721.1/112746</identifier><datestamp>2017-12-14T15:03:59Z</datestamp><setSpec>com_1721.1_7803</setSpec><setSpec>hdl_1721.1_7803</setSpec><setSpec>col_1721.1_42001</setSpec><setSpec>hdl_1721.1_42001</setSpec></header></record></GetRecord></OAI-PMH>
71+
headers:
72+
Connection:
73+
- close
74+
Content-Encoding:
75+
- gzip
76+
Content-Type:
77+
- text/xml;charset=UTF-8
78+
Date:
79+
- Wed, 21 Aug 2024 17:16:17 GMT
80+
Set-cookie:
81+
- HttpOnly;Secure
82+
Strict-Transport-Security:
83+
- max-age=63072000
84+
Transfer-Encoding:
85+
- chunked
86+
Vary:
87+
- Accept-Encoding
88+
X-Content-Type-Options:
89+
- nosniff
90+
X-Frame-Options:
91+
- SAMEORIGIN
92+
X-XSS-Protection:
93+
- 1; mode=block
94+
status:
95+
code: 200
96+
message: OK
97+
version: 1

0 commit comments

Comments
 (0)