fix: hybrid mode has subtle shutdown race condition (#1775)

EugeneJinXin · web-flow · commit 1101a25abf9d · 2025-06-10T12:01:11.000-07:00
Error below happens during my further E2E testing ([test 6](https://docs.google.com/document/d/1hgS-YPAqvavbkYN0DyrFkjd91lD-n0n6kAedAfAiCFM/edit?tab=t.0#bookmark=id.ri8beh65kx1s)) on more complex langchain LCEL, wasn't caught in [previous 5 test scenarios](https://docs.google.com/document/d/1hgS-YPAqvavbkYN0DyrFkjd91lD-n0n6kAedAfAiCFM/edit?tab=t.0). > Exception in thread Thread-1 (tracing_control_thread_func): > Traceback (most recent call last): > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/threading.py", line 1041, in _bootstrap_inner > self.run() > ~~~~~~~~^^ > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/threading.py", line 992, in run > self._target(*self._args, **self._kwargs) > ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/Users/xinjin/dev/lc-and-ls/langsmith-sdk/python/langsmith/_internal/_background_thread.py", line 536, in tracing_control_thread_func > _hybrid_tracing_thread_handle_batch( > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ > client, tracing_queue, next_batch, use_multipart > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > ) > ^ > File "/Users/xinjin/dev/lc-and-ls/langsmith-sdk/python/langsmith/_internal/_background_thread.py", line 308, in _hybrid_tracing_thread_handle_batch > future_langsmith = executor.submit( > _tracing_thread_handle_batch, > ...<5 lines>... > langsmith_ops, > ) > File "/opt/homebrew/Cellar/python@3.13/3.13.3_1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/concurrent/futures/thread.py", line 173, in submit > raise RuntimeError('cannot schedule new futures after ' > 'interpreter shutdown') > RuntimeError: cannot schedule new futures after interpreter shutdown This is a shutdown race condition. 
The issue is: - Python interpreter is shutting down - Background tracing thread is still running and trying to process queue items - ThreadPoolExecutor can't be created during interpreter shutdown - Error: RuntimeError: cannot schedule new futures after interpreter shutdown - This happens in the _hybrid_tracing_thread_handle_batch() function when we try to create the ThreadPoolExecutor and submit futures during shutdown. - Fix this by adding graceful shutdown handling: fall back to sequential processing, calling the same functions one after the other. **The Race Condition in Our Case** - Main program finishes (LangChain script completes) - Background tracing thread is still running (it's a daemon thread, but still processing) - Python starts the shutdown process - Tracing thread tries to create ThreadPoolExecutor - RuntimeError is raised because threading infrastructure is shutting down **Python Interpreter Shutdown Sequence** - When Python shuts down, it goes through a specific sequence: - Main thread finishes - The main program exits - Non-daemon threads continue - Background threads (like our tracing thread) keep running - Module cleanup begins - Python starts cleaning up modules and their resources - Threading infrastructure shuts down - The threading module starts shutting down - Concurrent.futures cleanup - The concurrent.futures module prevents new thread creation **Why ThreadPoolExecutor Fails** - The ThreadPoolExecutor internally uses the threading module to create worker threads. During shutdown: - Threading module state changes - Python sets internal flags that prevent new thread creation - threading._shutdown flag is set - This signals that no new threads should be created - Thread creation is blocked - Any attempt to create new threads raises RuntimeError **Test Plan** After adding the fallback logic, sequential processing kicks in during interpreter shutdown, and I was able to see traces in both OTEL and LangSmith.
diff --git a/python/langsmith/__init__.py b/python/langsmith/__init__.py
@@ -21,7 +21,7 @@
 
 # Avoid calling into importlib on every call to __version__
 
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 version = __version__  # for backwards compatibility
 
 
diff --git a/python/langsmith/_internal/_background_thread.py b/python/langsmith/_internal/_background_thread.py
@@ -200,7 +200,7 @@ def _tracing_thread_handle_batch(
             exc_info=True,
         )
     finally:
-        if mark_task_done:
+        if mark_task_done and tracing_queue is not None:
             for _ in batch:
                 try:
                     tracing_queue.task_done()
@@ -263,7 +263,7 @@ def _otel_tracing_thread_handle_batch(
             exc_info=True,
         )
     finally:
-        if mark_task_done:
+        if mark_task_done and tracing_queue is not None:
             for _ in batch:
                 try:
                     tracing_queue.task_done()
@@ -302,33 +302,49 @@ def _hybrid_tracing_thread_handle_batch(
     langsmith_ops = copy.deepcopy(ops)
     otel_ops = copy.deepcopy(ops)
 
-    # Use ThreadPoolExecutor for parallel execution
-    with cf.ThreadPoolExecutor(max_workers=2) as executor:
-        # Submit both tasks
-        future_langsmith = executor.submit(
-            _tracing_thread_handle_batch,
-            client,
-            tracing_queue,
-            batch,
-            use_multipart,
-            False,  # Don't mark tasks done - we'll do it once at the end
-            langsmith_ops,
-        )
-        future_otel = executor.submit(
-            _otel_tracing_thread_handle_batch,
-            client,
-            tracing_queue,
-            batch,
-            False,  # Don't mark tasks done - we'll do it once at the end
-            otel_ops,
-        )
+    try:
+        # Use ThreadPoolExecutor for parallel execution
+        with cf.ThreadPoolExecutor(max_workers=2) as executor:
+            # Submit both tasks
+            future_langsmith = executor.submit(
+                _tracing_thread_handle_batch,
+                client,
+                tracing_queue,
+                batch,
+                use_multipart,
+                False,  # Don't mark tasks done - we'll do it once at the end
+                langsmith_ops,
+            )
+            future_otel = executor.submit(
+                _otel_tracing_thread_handle_batch,
+                client,
+                tracing_queue,
+                batch,
+                False,  # Don't mark tasks done - we'll do it once at the end
+                otel_ops,
+            )
 
-        # Wait for both to complete
-        future_langsmith.result()
-        future_otel.result()
+            # Wait for both to complete
+            future_langsmith.result()
+            future_otel.result()
+    except RuntimeError as e:
+        if "cannot schedule new futures after interpreter shutdown" in str(e):
+            # During interpreter shutdown, ThreadPoolExecutor is blocked,
+            # fall back to sequential processing
+            logger.debug(
+                "Interpreter shutting down, falling back to sequential processing"
+            )
+            _tracing_thread_handle_batch(
+                client, tracing_queue, batch, use_multipart, False, langsmith_ops
+            )
+            _otel_tracing_thread_handle_batch(
+                client, tracing_queue, batch, False, otel_ops
+            )
+        else:
+            raise
 
     # Mark all tasks as done once, only if requested
-    if mark_task_done:
+    if mark_task_done and tracing_queue is not None:
         for _ in batch:
             try:
                 tracing_queue.task_done()
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.4.0"
+version = "0.4.1"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <support@langchain.dev>"]
 license = "MIT"