Skip to content

Commit 1101a25

Browse files
authored
fix: hybrid mode has subtle shutdown race condition (#1775)
The error below happened during my further E2E testing ([test 6](https://docs.google.com/document/d/1hgS-YPAqvavbkYN0DyrFkjd91lD-n0n6kAedAfAiCFM/edit?tab=t.0#bookmark=id.ri8beh65kx1s)) on more complex LangChain LCEL, and wasn't caught in the [previous 5 test scenarios](https://docs.google.com/document/d/1hgS-YPAqvavbkYN0DyrFkjd91lD-n0n6kAedAfAiCFM/edit?tab=t.0). > Exception in thread Thread-1 (tracing_control_thread_func): > Traceback (most recent call last): > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/threading.py", line 1041, in _bootstrap_inner > self.run() > ~~~~~~~~^^ > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/threading.py", line 992, in run > self._target(*self._args, **self._kwargs) > ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/Users/xinjin/dev/lc-and-ls/langsmith-sdk/python/langsmith/_internal/_background_thread.py", line 536, in tracing_control_thread_func > _hybrid_tracing_thread_handle_batch( > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ > client, tracing_queue, next_batch, use_multipart > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > ) > ^ > File "/Users/xinjin/dev/lc-and-ls/langsmith-sdk/python/langsmith/_internal/_background_thread.py", line 308, in _hybrid_tracing_thread_handle_batch > future_langsmith = executor.submit( > _tracing_thread_handle_batch, > ...<5 lines>... > langsmith_ops, > ) > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/concurrent/futures/thread.py", line 173, in submit > raise RuntimeError('cannot schedule new futures after ' > 'interpreter shutdown') > RuntimeError: cannot schedule new futures after interpreter shutdown This is a shutdown race condition.
The issue is: - Python interpreter is shutting down - Background tracing thread is still running and trying to process queue items - ThreadPoolExecutor can't schedule new futures during interpreter shutdown - Error: RuntimeError: cannot schedule new futures after interpreter shutdown - This happens in the _hybrid_tracing_thread_handle_batch() function when we try to create the ThreadPoolExecutor and submit futures during shutdown. - Fix: add graceful shutdown handling by falling back to sequential processing, using the same functions but calling them one after the other. **The race condition in our case:** - Main program finishes (LangChain script completes) - Background tracing thread is still running (it's a daemon thread, but still processing) - Python starts its shutdown process - Tracing thread tries to create a ThreadPoolExecutor and submit work to it - RuntimeError is raised because the threading infrastructure is shutting down **Python Interpreter Shutdown Sequence** - When Python shuts down, it goes through a specific sequence: - Main thread finishes - The main program exits - Non-daemon threads continue - Background threads (like our tracing thread) keep running - Module cleanup begins - Python starts cleaning up modules and their resources - Threading infrastructure shuts down - The threading module starts shutting down - Concurrent.futures cleanup - The concurrent.futures module prevents new futures from being scheduled **Why ThreadPoolExecutor Fails** - The ThreadPoolExecutor internally uses the threading module to create worker threads. During shutdown: - Threading module state changes - Python runs the exit hooks registered with the threading module - The module-level _shutdown flag in concurrent.futures.thread is set - This signals that no new work should be scheduled - Scheduling is blocked - Any attempt to submit new futures raises RuntimeError **Test Plan** After adding the fallback logic, the sequential processing kicks in during interpreter shutdown, and I was able to see the trace in both OTEL and LangSmith
1 parent f58f486 commit 1101a25

File tree

3 files changed

+44
-28
lines changed

3 files changed

+44
-28
lines changed

python/langsmith/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
# Avoid calling into importlib on every call to __version__
2323

24-
__version__ = "0.4.0"
24+
__version__ = "0.4.1"
2525
version = __version__ # for backwards compatibility
2626

2727

python/langsmith/_internal/_background_thread.py

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def _tracing_thread_handle_batch(
200200
exc_info=True,
201201
)
202202
finally:
203-
if mark_task_done:
203+
if mark_task_done and tracing_queue is not None:
204204
for _ in batch:
205205
try:
206206
tracing_queue.task_done()
@@ -263,7 +263,7 @@ def _otel_tracing_thread_handle_batch(
263263
exc_info=True,
264264
)
265265
finally:
266-
if mark_task_done:
266+
if mark_task_done and tracing_queue is not None:
267267
for _ in batch:
268268
try:
269269
tracing_queue.task_done()
@@ -302,33 +302,49 @@ def _hybrid_tracing_thread_handle_batch(
302302
langsmith_ops = copy.deepcopy(ops)
303303
otel_ops = copy.deepcopy(ops)
304304

305-
# Use ThreadPoolExecutor for parallel execution
306-
with cf.ThreadPoolExecutor(max_workers=2) as executor:
307-
# Submit both tasks
308-
future_langsmith = executor.submit(
309-
_tracing_thread_handle_batch,
310-
client,
311-
tracing_queue,
312-
batch,
313-
use_multipart,
314-
False, # Don't mark tasks done - we'll do it once at the end
315-
langsmith_ops,
316-
)
317-
future_otel = executor.submit(
318-
_otel_tracing_thread_handle_batch,
319-
client,
320-
tracing_queue,
321-
batch,
322-
False, # Don't mark tasks done - we'll do it once at the end
323-
otel_ops,
324-
)
305+
try:
306+
# Use ThreadPoolExecutor for parallel execution
307+
with cf.ThreadPoolExecutor(max_workers=2) as executor:
308+
# Submit both tasks
309+
future_langsmith = executor.submit(
310+
_tracing_thread_handle_batch,
311+
client,
312+
tracing_queue,
313+
batch,
314+
use_multipart,
315+
False, # Don't mark tasks done - we'll do it once at the end
316+
langsmith_ops,
317+
)
318+
future_otel = executor.submit(
319+
_otel_tracing_thread_handle_batch,
320+
client,
321+
tracing_queue,
322+
batch,
323+
False, # Don't mark tasks done - we'll do it once at the end
324+
otel_ops,
325+
)
325326

326-
# Wait for both to complete
327-
future_langsmith.result()
328-
future_otel.result()
327+
# Wait for both to complete
328+
future_langsmith.result()
329+
future_otel.result()
330+
except RuntimeError as e:
331+
if "cannot schedule new futures after interpreter shutdown" in str(e):
332+
# During interpreter shutdown, ThreadPoolExecutor is blocked,
333+
# fall back to sequential processing
334+
logger.debug(
335+
"Interpreter shutting down, falling back to sequential processing"
336+
)
337+
_tracing_thread_handle_batch(
338+
client, tracing_queue, batch, use_multipart, False, langsmith_ops
339+
)
340+
_otel_tracing_thread_handle_batch(
341+
client, tracing_queue, batch, False, otel_ops
342+
)
343+
else:
344+
raise
329345

330346
# Mark all tasks as done once, only if requested
331-
if mark_task_done:
347+
if mark_task_done and tracing_queue is not None:
332348
for _ in batch:
333349
try:
334350
tracing_queue.task_done()

python/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "langsmith"
3-
version = "0.4.0"
3+
version = "0.4.1"
44
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
55
authors = ["LangChain <[email protected]>"]
66
license = "MIT"

0 commit comments

Comments
 (0)