Skip to content

Commit bf434f4

Browse files
liustveADOT Patch workflow
andauthored
Fix flaky jitter tests and add explicit timeout deadline check in OTLPAwsLogsExporter (#501)
*Description of changes:* Unit tests would occasionally fail because of a transitive timing issue using Python's `time()` library, see: https://github.com/aws-observability/aws-otel-python-instrumentation/actions/runs/18681954286/job/53290674826 - Add timeout check before each export attempt to fail fast when deadline is exceeded and fixes flaky failing retry test by mocking time to prevent unexpected timeout errors in the test. - Updates `tox.ini` to point the upstream repos to the one currently in `pyproject.toml` By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --------- Co-authored-by: ADOT Patch workflow <[email protected]>
1 parent 90f7830 commit bf434f4

File tree

4 files changed

+86
-43
lines changed

4 files changed

+86
-43
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@ If your change does not need a CHANGELOG entry, add the "skip changelog" label t
1313
## Unreleased
1414
- Add botocore instrumentation extension for Bedrock AgentCore services with span attributes
1515
([#490](https://github.com/aws-observability/aws-otel-python-instrumentation/pull/490))
16+
- Fix timeout handling for exceeded deadline in retry logic in OTLPAwsLogsExporter
17+
([#501](https://github.com/aws-observability/aws-otel-python-instrumentation/pull/501))

aws-opentelemetry-distro/src/amazon/opentelemetry/distro/exporter/otlp/aws/logs/otlp_aws_logs_exporter.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,14 @@ def export(self, batch: Sequence[LogData]) -> LogExportResult:
104104
# 3) Deadline timeout will be exceeded
105105
# 4) Non-retryable errors (4xx except 429) immediately exit the loop
106106
while True:
107-
resp = self._send(data, deadline_sec - time())
107+
# Check timeout before each attempt. The retry logic below also checks
108+
# timeout, but this provides an additional safeguard against flaky timing edge cases.
109+
remaining_time = deadline_sec - time()
110+
if remaining_time <= 0:
111+
_logger.error("Timeout deadline exceeded before attempt.")
112+
return LogExportResult.FAILURE
113+
114+
resp = self._send(data, remaining_time)
108115

109116
if resp.ok:
110117
return LogExportResult.SUCCESS
@@ -172,8 +179,10 @@ def _get_retry_delay_sec(self, headers: CaseInsensitiveDict, retry_num: int) ->
172179
"""
173180
# Check for Retry-After header first, then use exponential backoff with jitter
174181
retry_after_delay = self._parse_retryable_header(headers.get(self._RETRY_AFTER_HEADER))
182+
175183
if retry_after_delay > -1:
176184
return retry_after_delay
185+
177186
# multiplying by a random number between .8 and 1.2 introduces a +/-20% jitter to each backoff.
178187
return 2**retry_num * random.uniform(0.8, 1.2)
179188

aws-opentelemetry-distro/tests/amazon/opentelemetry/distro/exporter/otlp/aws/logs/test_otlp_aws_logs_exporter.py

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,29 +18,31 @@
1818

1919
class TestOTLPAwsLogsExporter(TestCase):
2020
_ENDPOINT = "https://logs.us-west-2.amazonaws.com/v1/logs"
21-
good_response = requests.Response()
22-
good_response.status_code = 200
2321

24-
non_retryable_response = requests.Response()
25-
non_retryable_response.status_code = 404
22+
def setUp(self):
23+
self.logs = self.generate_test_log_data()
24+
self.exporter = OTLPAwsLogExporter(session=get_aws_session(), aws_region="us-east-1", endpoint=self._ENDPOINT)
2625

27-
retryable_response_no_header = requests.Response()
28-
retryable_response_no_header.status_code = 429
26+
self.good_response = requests.Response()
27+
self.good_response.status_code = 200
2928

30-
retryable_response_header = requests.Response()
31-
retryable_response_header.headers = CaseInsensitiveDict({"Retry-After": "10"})
32-
retryable_response_header.status_code = 503
29+
self.non_retryable_response = requests.Response()
30+
self.non_retryable_response.status_code = 404
3331

34-
retryable_response_bad_header = requests.Response()
35-
retryable_response_bad_header.headers = CaseInsensitiveDict({"Retry-After": "-12"})
36-
retryable_response_bad_header.status_code = 503
32+
self.retryable_response_no_header = requests.Response()
33+
self.retryable_response_no_header.status_code = 429
3734

38-
def setUp(self):
39-
self.logs = self.generate_test_log_data()
40-
self.exporter = OTLPAwsLogExporter(session=get_aws_session(), aws_region="us-east-1", endpoint=self._ENDPOINT)
35+
self.retryable_response_header = requests.Response()
36+
self.retryable_response_header.headers = CaseInsensitiveDict({"Retry-After": "10"})
37+
self.retryable_response_header.status_code = 503
4138

42-
@patch("requests.Session.post", return_value=good_response)
39+
self.retryable_response_bad_header = requests.Response()
40+
self.retryable_response_bad_header.headers = CaseInsensitiveDict({"Retry-After": "-12"})
41+
self.retryable_response_bad_header.status_code = 503
42+
43+
@patch("requests.Session.post")
4344
def test_export_success(self, mock_request):
45+
mock_request.return_value = self.good_response
4446
"""Tests that the exporter always compresses the serialized logs with gzip before exporting."""
4547
result = self.exporter.export(self.logs)
4648

@@ -57,17 +59,19 @@ def test_export_success(self, mock_request):
5759
self.assertTrue(len(data) >= 10)
5860
self.assertEqual(data[0:2], b"\x1f\x8b")
5961

60-
@patch("requests.Session.post", return_value=good_response)
62+
@patch("requests.Session.post")
6163
def test_should_not_export_if_shutdown(self, mock_request):
64+
mock_request.return_value = self.good_response
6265
"""Tests that no export request is made if the exporter is shutdown."""
6366
self.exporter.shutdown()
6467
result = self.exporter.export(self.logs)
6568

6669
mock_request.assert_not_called()
6770
self.assertEqual(result, LogExportResult.FAILURE)
6871

69-
@patch("requests.Session.post", return_value=non_retryable_response)
72+
@patch("requests.Session.post")
7073
def test_should_not_export_again_if_not_retryable(self, mock_request):
74+
mock_request.return_value = self.non_retryable_response
7175
"""Tests that only one export request is made if the response status code is non-retryable."""
7276
result = self.exporter.export(self.logs)
7377
mock_request.assert_called_once()
@@ -78,12 +82,16 @@ def test_should_not_export_again_if_not_retryable(self, mock_request):
7882
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.Event.wait",
7983
side_effect=lambda x: False,
8084
)
81-
@patch("requests.Session.post", return_value=retryable_response_no_header)
85+
@patch("requests.Session.post")
8286
def test_should_export_again_with_backoff_if_retryable_and_no_retry_after_header(self, mock_request, mock_wait):
87+
mock_request.return_value = self.retryable_response_no_header
8388
"""Tests that multiple export requests are made with exponential delay if the response status code is retryable.
8489
But there is no Retry-After header."""
8590
self.exporter._timeout = 10000 # Large timeout to avoid early exit
86-
result = self.exporter.export(self.logs)
91+
92+
with patch("amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.time") as mock_time:
93+
mock_time.return_value = 0 # Keep time constant to avoid timeout issues
94+
result = self.exporter.export(self.logs)
8795

8896
self.assertEqual(mock_wait.call_count, _MAX_RETRYS - 1)
8997

@@ -103,15 +111,21 @@ def test_should_export_again_with_backoff_if_retryable_and_no_retry_after_header
103111
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.Event.wait",
104112
side_effect=lambda x: False,
105113
)
106-
@patch(
107-
"requests.Session.post",
108-
side_effect=[retryable_response_header, retryable_response_header, retryable_response_header, good_response],
109-
)
114+
@patch("requests.Session.post")
110115
def test_should_export_again_with_server_delay_if_retryable_and_retry_after_header(self, mock_request, mock_wait):
116+
mock_request.side_effect = [
117+
self.retryable_response_header,
118+
self.retryable_response_header,
119+
self.retryable_response_header,
120+
self.good_response,
121+
]
111122
"""Tests that multiple export requests are made with the server's suggested
112123
delay if the response status code is retryable and there is a Retry-After header."""
113124
self.exporter._timeout = 10000 # Large timeout to avoid early exit
114-
result = self.exporter.export(self.logs)
125+
126+
with patch("amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.time") as mock_time:
127+
mock_time.return_value = 0 # Keep time constant to avoid timeout issues
128+
result = self.exporter.export(self.logs)
115129

116130
delays = mock_wait.call_args_list
117131

@@ -126,22 +140,23 @@ def test_should_export_again_with_server_delay_if_retryable_and_retry_after_head
126140
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.Event.wait",
127141
side_effect=lambda x: False,
128142
)
129-
@patch(
130-
"requests.Session.post",
131-
side_effect=[
132-
retryable_response_bad_header,
133-
retryable_response_bad_header,
134-
retryable_response_bad_header,
135-
good_response,
136-
],
137-
)
143+
@patch("requests.Session.post")
138144
def test_should_export_again_with_backoff_delay_if_retryable_and_bad_retry_after_header(
139145
self, mock_request, mock_wait
140146
):
147+
mock_request.side_effect = [
148+
self.retryable_response_bad_header,
149+
self.retryable_response_bad_header,
150+
self.retryable_response_bad_header,
151+
self.good_response,
152+
]
141153
"""Tests that multiple export requests are made with exponential delay if the response status code is retryable.
142154
but the Retry-After header is invalid or malformed."""
143155
self.exporter._timeout = 10000 # Large timeout to avoid early exit
144-
result = self.exporter.export(self.logs)
156+
157+
with patch("amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.time") as mock_time:
158+
mock_time.return_value = 0 # Keep time constant to avoid timeout issues
159+
result = self.exporter.export(self.logs)
145160

146161
delays = mock_wait.call_args_list
147162

@@ -156,8 +171,9 @@ def test_should_export_again_with_backoff_delay_if_retryable_and_bad_retry_after
156171
self.assertEqual(mock_request.call_count, 4)
157172
self.assertEqual(result, LogExportResult.SUCCESS)
158173

159-
@patch("requests.Session.post", side_effect=[requests.exceptions.ConnectionError(), good_response])
174+
@patch("requests.Session.post")
160175
def test_export_connection_error_retry(self, mock_request):
176+
mock_request.side_effect = [requests.exceptions.ConnectionError(), self.good_response]
161177
"""Tests that the exporter retries on ConnectionError."""
162178
result = self.exporter.export(self.logs)
163179

@@ -168,8 +184,9 @@ def test_export_connection_error_retry(self, mock_request):
168184
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.Event.wait",
169185
side_effect=lambda x: False,
170186
)
171-
@patch("requests.Session.post", return_value=retryable_response_no_header)
187+
@patch("requests.Session.post")
172188
def test_should_stop_retrying_when_deadline_exceeded(self, mock_request, mock_wait):
189+
mock_request.return_value = self.retryable_response_no_header
173190
"""Tests that the exporter stops retrying when the deadline is exceeded."""
174191
self.exporter._timeout = 5 # Short timeout to trigger deadline check
175192

@@ -191,19 +208,23 @@ def test_should_stop_retrying_when_deadline_exceeded(self, mock_request, mock_wa
191208
"amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.Event.wait",
192209
side_effect=lambda x: True,
193210
)
194-
@patch("requests.Session.post", return_value=retryable_response_no_header)
211+
@patch("requests.Session.post")
195212
def test_export_interrupted_by_shutdown(self, mock_request, mock_wait):
213+
mock_request.return_value = self.retryable_response_no_header
196214
"""Tests that export can be interrupted by shutdown during retry wait."""
197215
self.exporter._timeout = 10000
198216

199-
result = self.exporter.export(self.logs)
217+
with patch("amazon.opentelemetry.distro.exporter.otlp.aws.logs.otlp_aws_logs_exporter.time") as mock_time:
218+
mock_time.return_value = 0 # Keep time constant to avoid timeout issues
219+
result = self.exporter.export(self.logs)
200220

201221
# Should make one request, then get interrupted during retry wait
202222
self.assertEqual(mock_request.call_count, 1)
203223
self.assertEqual(result, LogExportResult.FAILURE)
204224

205-
@patch("requests.Session.post", return_value=good_response)
225+
@patch("requests.Session.post")
206226
def test_export_with_log_group_and_stream_headers(self, mock_request):
227+
mock_request.return_value = self.good_response
207228
"""Tests that log_group and log_stream are properly set as headers when provided."""
208229
log_group = "test-log-group"
209230
log_stream = "test-log-stream"
@@ -228,6 +249,17 @@ def test_export_with_log_group_and_stream_headers(self, mock_request):
228249
self.assertEqual(session_headers["x-aws-log-group"], log_group)
229250
self.assertEqual(session_headers["x-aws-log-stream"], log_stream)
230251

252+
@patch("requests.Session.post")
253+
def test_should_not_make_request_when_deadline_exceeded_immediately(self, mock_request):
254+
"""Tests that no HTTP request is made when deadline is already exceeded before first attempt."""
255+
self.exporter._timeout = 0 # Deadline already exceeded
256+
257+
result = self.exporter.export(self.logs)
258+
259+
# Should not make any HTTP requests
260+
mock_request.assert_not_called()
261+
self.assertEqual(result, LogExportResult.FAILURE)
262+
231263
@staticmethod
232264
def generate_test_log_data(count=5):
233265
logs = []

tox.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ deps =
2222

2323
setenv =
2424
; TODO: The two repos branches need manual updated over time, need to figure out a more sustainable solution.
25-
CORE_REPO="git+https://github.com/open-telemetry/opentelemetry-python.git@release/v1.25.x-0.46bx"
26-
CONTRIB_REPO="git+https://github.com/open-telemetry/opentelemetry-python-contrib.git@release/v1.25.x-0.46bx"
25+
CORE_REPO="git+https://github.com/open-telemetry/opentelemetry-python.git@release/v1.33.x-0.54bx"
26+
CONTRIB_REPO="git+https://github.com/open-telemetry/opentelemetry-python-contrib.git@release/v1.33.x-0.54bx"
2727

2828
changedir =
2929
test-aws-opentelemetry-distro: aws-opentelemetry-distro/tests

0 commit comments

Comments
 (0)