Skip to content

Commit f58f486

Browse files
authored
feat: OTEL_ENABLED mode sends to both otel and ls, while allowing otel only through OTEL_ONLY var (#1762)
# Hybrid Tracing: Overview

### Before: Choose One

```
LANGSMITH_OTEL_ENABLED = "true"

traceable_function()
      │
      ▼
┌─────────────┐
│ Background  │
│   Thread    │
└─────┬───────┘
      │
      ▼
Either:
┌──────────┐  OR  ┌─────────────┐
│   OTEL   │      │  LangSmith  │
│ Endpoint │      │  Endpoint   │
└──────────┘      └─────────────┘
```

### After: Default behavior of LANGSMITH_OTEL_ENABLED: Get Both

```
traceable_function()
      │
      ▼          LANGSMITH_OTEL_ENABLED = "true"
┌─────────────┐
│ Background  │
│   Thread    │
└─────┬───────┘
      │
      ▼
Both:
┌──────────┐ AND  ┌─────────────┐
│   OTEL   │      │  LangSmith  │
│ Endpoint │      │  Endpoint   │
└──────────┘      └─────────────┘
```

Users who insist on OTEL-only export can opt in through the additional env var `LANGSMITH_OTEL_ONLY`:

```
traceable_function()
      │
      ▼          LANGSMITH_OTEL_ENABLED = "true" and LANGSMITH_OTEL_ONLY = "true"
┌─────────────┐
│ Background  │
│   Thread    │
└─────┬───────┘
      │
      ▼
ONLY OTEL:
┌──────────┐
│   OTEL   │
│ Endpoint │
└──────────┘
```

## Single Function Call → Dual Observability

### **Increase Tracing Coverage**

- Get the best of both worlds
- No more choosing between tools
- Future-proof observability

### **Risk Mitigation**

- If one service is down, you still have traces
- Backup observability data

## Technical detail for reviewer

- A new `_hybrid_tracing_thread_handle_batch()` makes sure `task_done()` is called once per item, regardless of how many exporters succeeded.
- The sub-thread spawning logic also adopts this.
- Future improvements can address the special case of feedback handling, and whether the performance of these two parallel exports needs tuning, e.g. by using two worker threads.

## Test plan

- 11 test cases covering three main scenarios: hybrid mode (OTEL_ENABLED=true) that exports to both OTEL and LangSmith simultaneously, OTEL-only mode (OTEL_ONLY=true) that exports exclusively to OTEL, and LangSmith-only mode (default) that exports only to LangSmith.
- I also ran a local E2E test: I set up Jaeger and tested all three modes in each of the two ways described in https://docs.smith.langchain.com/observability/how_to_guides/trace_langchain_with_otel#configuring-alternate-otlp-endpoints (3 modes × 2 ways), and confirmed the results through manual inspection. All the scenarios I tested are recorded in https://docs.google.com/document/d/1hgS-YPAqvavbkYN0DyrFkjd91lD-n0n6kAedAfAiCFM/edit?tab=t.0
1 parent e6cce7d commit f58f486

File tree

7 files changed

+696
-29
lines changed

7 files changed

+696
-29
lines changed

python/langsmith/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
from langsmith.utils import ContextThreadPoolExecutor
2121

2222
# Avoid calling into importlib on every call to __version__
23-
__version__ = "0.3.45"
23+
24+
__version__ = "0.4.0"
2425
version = __version__ # for backwards compatibility
2526

2627

python/langsmith/_internal/_background_thread.py

Lines changed: 237 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import concurrent.futures as cf
4+
import copy
45
import functools
56
import io
67
import logging
@@ -48,8 +49,8 @@ class TracingQueueItem:
4849
4950
Attributes:
5051
priority (str): The priority of the item.
51-
action (str): The action associated with the item.
5252
item (Any): The item itself.
53+
otel_context (Optional[Context]): The OTEL context of the item.
5354
"""
5455

5556
priority: str
@@ -158,9 +159,27 @@ def _tracing_thread_handle_batch(
158159
tracing_queue: Queue,
159160
batch: list[TracingQueueItem],
160161
use_multipart: bool,
162+
mark_task_done: bool = True,
163+
ops: Optional[
164+
list[Union[SerializedRunOperation, SerializedFeedbackOperation]]
165+
] = None,
161166
) -> None:
167+
"""Handle a batch of tracing queue items by sending them to LangSmith.
168+
169+
Args:
170+
client: The LangSmith client to use for sending data.
171+
tracing_queue: The queue containing tracing items (used for task_done calls).
172+
batch: List of tracing queue items to process.
173+
use_multipart: Whether to use multipart endpoint for sending data.
174+
mark_task_done: Whether to mark queue tasks as done after processing.
175+
Set to False when called from parallel execution to avoid double counting.
176+
ops: Pre-combined serialized operations to use instead of combining from batch.
177+
If None, operations will be combined from the batch items.
178+
"""
162179
try:
163-
ops = combine_serialized_queue_operations([item.item for item in batch])
180+
if ops is None:
181+
ops = combine_serialized_queue_operations([item.item for item in batch])
182+
164183
if use_multipart:
165184
client._multipart_ingest_ops(ops)
166185
else:
@@ -180,22 +199,45 @@ def _tracing_thread_handle_batch(
180199
"Error details:",
181200
exc_info=True,
182201
)
183-
# exceptions are logged elsewhere, but we need to make sure the
184-
# background thread continues to run
185-
pass
186202
finally:
187-
for _ in batch:
188-
tracing_queue.task_done()
203+
if mark_task_done:
204+
for _ in batch:
205+
try:
206+
tracing_queue.task_done()
207+
except ValueError as e:
208+
if "task_done() called too many times" in str(e):
209+
# This can happen during shutdown when multiple threads
210+
# process the same queue items. It's harmless.
211+
logger.debug(
212+
f"Ignoring harmless task_done error during shutdown: {e}"
213+
)
214+
else:
215+
raise
189216

190217

191218
def _otel_tracing_thread_handle_batch(
192219
client: Client,
193220
tracing_queue: Queue,
194221
batch: list[TracingQueueItem],
222+
mark_task_done: bool = True,
223+
ops: Optional[
224+
list[Union[SerializedRunOperation, SerializedFeedbackOperation]]
225+
] = None,
195226
) -> None:
196-
"""Handle a batch of tracing queue items by exporting them to OTEL."""
227+
"""Handle a batch of tracing queue items by exporting them to OTEL.
228+
229+
Args:
230+
client: The LangSmith client containing the OTEL exporter.
231+
tracing_queue: The queue containing tracing items (used for task_done calls).
232+
batch: List of tracing queue items to process.
233+
mark_task_done: Whether to mark queue tasks as done after processing.
234+
Set to False when called from parallel execution to avoid double counting.
235+
ops: Pre-combined serialized operations to use instead of combining from batch.
236+
If None, operations will be combined from the batch items.
237+
"""
197238
try:
198-
ops = combine_serialized_queue_operations([item.item for item in batch])
239+
if ops is None:
240+
ops = combine_serialized_queue_operations([item.item for item in batch])
199241

200242
run_ops = [op for op in ops if isinstance(op, SerializedRunOperation)]
201243
otel_context_map = {
@@ -215,17 +257,129 @@ def _otel_tracing_thread_handle_batch(
215257

216258
except Exception:
217259
logger.error(
218-
"LangSmith tracing error: Failed to submit OTEL trace data.\n"
260+
"OTEL tracing error: Failed to submit trace data.\n"
219261
"This does not affect your application's runtime.\n"
220262
"Error details:",
221263
exc_info=True,
222264
)
223-
# Exceptions are logged elsewhere, but we need to make sure the
224-
# background thread continues to run
225265
finally:
226-
# Mark all items in the batch as done
266+
if mark_task_done:
267+
for _ in batch:
268+
try:
269+
tracing_queue.task_done()
270+
except ValueError as e:
271+
if "task_done() called too many times" in str(e):
272+
# This can happen during shutdown when multiple threads
273+
# process the same queue items. It's harmless.
274+
logger.debug(
275+
f"Ignoring harmless task_done error during shutdown: {e}"
276+
)
277+
else:
278+
raise
279+
280+
281+
def _hybrid_tracing_thread_handle_batch(
282+
client: Client,
283+
tracing_queue: Queue,
284+
batch: list[TracingQueueItem],
285+
use_multipart: bool,
286+
mark_task_done: bool = True,
287+
) -> None:
288+
"""Handle a batch of tracing queue items by sending to both both LangSmith and OTEL.
289+
290+
Args:
291+
client: The LangSmith client to use for sending data.
292+
tracing_queue: The queue containing tracing items (used for task_done calls).
293+
batch: List of tracing queue items to process.
294+
use_multipart: Whether to use multipart endpoint for LangSmith.
295+
mark_task_done: Whether to mark queue tasks as done after processing.
296+
Set to False primarily for testing when items weren't actually queued.
297+
"""
298+
# Combine operations once to avoid race conditions
299+
ops = combine_serialized_queue_operations([item.item for item in batch])
300+
301+
# Create copies for each thread to avoid shared mutation
302+
langsmith_ops = copy.deepcopy(ops)
303+
otel_ops = copy.deepcopy(ops)
304+
305+
# Use ThreadPoolExecutor for parallel execution
306+
with cf.ThreadPoolExecutor(max_workers=2) as executor:
307+
# Submit both tasks
308+
future_langsmith = executor.submit(
309+
_tracing_thread_handle_batch,
310+
client,
311+
tracing_queue,
312+
batch,
313+
use_multipart,
314+
False, # Don't mark tasks done - we'll do it once at the end
315+
langsmith_ops,
316+
)
317+
future_otel = executor.submit(
318+
_otel_tracing_thread_handle_batch,
319+
client,
320+
tracing_queue,
321+
batch,
322+
False, # Don't mark tasks done - we'll do it once at the end
323+
otel_ops,
324+
)
325+
326+
# Wait for both to complete
327+
future_langsmith.result()
328+
future_otel.result()
329+
330+
# Mark all tasks as done once, only if requested
331+
if mark_task_done:
227332
for _ in batch:
228-
tracing_queue.task_done()
333+
try:
334+
tracing_queue.task_done()
335+
except ValueError as e:
336+
if "task_done() called too many times" in str(e):
337+
# This can happen during shutdown when multiple threads
338+
# process the same queue items. It's harmless.
339+
logger.debug(
340+
f"Ignoring harmless task_done error during shutdown: {e}"
341+
)
342+
else:
343+
raise
344+
345+
346+
def _is_using_internal_otlp_provider(client: Client) -> bool:
347+
"""Check if client is using LangSmith's internal OTLP provider.
348+
349+
Returns True if using LangSmith's internal provider, False if user
350+
provided their own.
351+
"""
352+
if not hasattr(client, "otel_exporter") or client.otel_exporter is None:
353+
return False
354+
355+
try:
356+
# Use OpenTelemetry's standard API to get the global TracerProvider
357+
# Check if OTEL is available
358+
if not ls_utils.is_truish(ls_utils.get_env_var("OTEL_ENABLED")):
359+
return False
360+
361+
# Get the global TracerProvider and check its resource attributes
362+
from opentelemetry import trace # type: ignore[import]
363+
364+
tracer_provider = trace.get_tracer_provider()
365+
if hasattr(tracer_provider, "resource") and hasattr(
366+
tracer_provider.resource, "attributes"
367+
):
368+
is_internal = tracer_provider.resource.attributes.get(
369+
"langsmith.internal_provider", False
370+
)
371+
logger.debug(
372+
f"TracerProvider resource check: "
373+
f"langsmith.internal_provider={is_internal}"
374+
)
375+
return is_internal
376+
377+
return False
378+
except Exception as e:
379+
logger.debug(
380+
f"Could not determine TracerProvider type: {e}, assuming user-provided"
381+
)
382+
return False
229383

230384

231385
def get_size_limit_from_env() -> Optional[int]:
@@ -267,6 +421,29 @@ def _ensure_ingest_config(
267421
return default_config
268422

269423

424+
def get_tracing_mode() -> tuple[bool, bool]:
425+
"""Get the current tracing mode configuration.
426+
427+
Returns:
428+
tuple[bool, bool]:
429+
- hybrid_otel_and_langsmith: True if both OTEL and LangSmith tracing
430+
are enabled, which is default behavior if OTEL_ENABLED is set to
431+
true and OTEL_ONLY is not set to true
432+
- is_otel_only: True if only OTEL tracing is enabled
433+
"""
434+
otel_enabled = ls_utils.is_truish(ls_utils.get_env_var("OTEL_ENABLED"))
435+
otel_only = ls_utils.is_truish(ls_utils.get_env_var("OTEL_ONLY"))
436+
437+
# If OTEL is not enabled, neither mode should be active
438+
if not otel_enabled:
439+
return False, False
440+
441+
hybrid_otel_and_langsmith = not otel_only
442+
is_otel_only = otel_only
443+
444+
return hybrid_otel_and_langsmith, is_otel_only
445+
446+
270447
def tracing_control_thread_func(client_ref: weakref.ref[Client]) -> None:
271448
client = client_ref()
272449
if client is None:
@@ -351,21 +528,41 @@ def keep_thread_active() -> bool:
351528
)
352529
sub_threads.append(new_thread)
353530
new_thread.start()
531+
532+
hybrid_otel_and_langsmith, is_otel_only = get_tracing_mode()
354533
if next_batch := _tracing_thread_drain_queue(tracing_queue, limit=size_limit):
355-
if client.otel_exporter is not None:
534+
if hybrid_otel_and_langsmith:
535+
# Hybrid mode: both OTEL and LangSmith
536+
_hybrid_tracing_thread_handle_batch(
537+
client, tracing_queue, next_batch, use_multipart
538+
)
539+
elif is_otel_only:
540+
# OTEL-only mode
356541
_otel_tracing_thread_handle_batch(client, tracing_queue, next_batch)
357542
else:
543+
# LangSmith-only mode
358544
_tracing_thread_handle_batch(
359545
client, tracing_queue, next_batch, use_multipart
360546
)
361547

362-
# drain the queue on exit
548+
# drain the queue on exit - apply same logic
549+
hybrid_otel_and_langsmith, is_otel_only = get_tracing_mode()
363550
while next_batch := _tracing_thread_drain_queue(
364551
tracing_queue, limit=size_limit, block=False
365552
):
366-
if client.otel_exporter is not None:
553+
if hybrid_otel_and_langsmith:
554+
# Hybrid mode cleanup
555+
logger.debug("Hybrid mode cleanup")
556+
_hybrid_tracing_thread_handle_batch(
557+
client, tracing_queue, next_batch, use_multipart
558+
)
559+
elif is_otel_only:
560+
# OTEL-only cleanup
561+
logger.debug("OTEL-only cleanup")
367562
_otel_tracing_thread_handle_batch(client, tracing_queue, next_batch)
368563
else:
564+
# LangSmith-only cleanup
565+
logger.debug("LangSmith-only cleanup")
369566
_tracing_thread_handle_batch(
370567
client, tracing_queue, next_batch, use_multipart
371568
)
@@ -378,7 +575,7 @@ def tracing_control_thread_func_compress_parallel(
378575
client = client_ref()
379576
if client is None:
380577
return
381-
578+
logger.debug("Tracing control thread func compress parallel called")
382579
if (
383580
client.compressed_traces is None
384581
or client._data_available_event is None
@@ -542,22 +739,41 @@ def _tracing_sub_thread_func(
542739
):
543740
if next_batch := _tracing_thread_drain_queue(tracing_queue, limit=size_limit):
544741
seen_successive_empty_queues = 0
545-
if client.otel_exporter is not None:
742+
743+
hybrid_otel_and_langsmith, is_otel_only = get_tracing_mode()
744+
if hybrid_otel_and_langsmith:
745+
# Hybrid mode: both OTEL and LangSmith
746+
_hybrid_tracing_thread_handle_batch(
747+
client, tracing_queue, next_batch, use_multipart
748+
)
749+
elif is_otel_only:
750+
# OTEL-only mode
546751
_otel_tracing_thread_handle_batch(client, tracing_queue, next_batch)
547752
else:
753+
# LangSmith-only mode
548754
_tracing_thread_handle_batch(
549755
client, tracing_queue, next_batch, use_multipart
550756
)
551757
else:
552758
seen_successive_empty_queues += 1
553759

554-
# drain the queue on exit
760+
# drain the queue on exit - apply same logic
761+
hybrid_otel_and_langsmith, is_otel_only = get_tracing_mode()
555762
while next_batch := _tracing_thread_drain_queue(
556763
tracing_queue, limit=size_limit, block=False
557764
):
558-
if client.otel_exporter is not None:
765+
if hybrid_otel_and_langsmith:
766+
# Hybrid mode cleanup
767+
_hybrid_tracing_thread_handle_batch(
768+
client, tracing_queue, next_batch, use_multipart
769+
)
770+
elif is_otel_only:
771+
# OTEL-only cleanup
772+
logger.debug("OTEL-only cleanup")
559773
_otel_tracing_thread_handle_batch(client, tracing_queue, next_batch)
560774
else:
775+
# LangSmith-only cleanup
776+
logger.debug("LangSmith-only cleanup")
561777
_tracing_thread_handle_batch(
562778
client, tracing_queue, next_batch, use_multipart
563779
)

python/langsmith/_internal/otel/_otel_client.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,13 @@ def get_otlp_tracer_provider() -> "TracerProvider":
6363
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = headers
6464

6565
service_name = os.environ.get("OTEL_SERVICE_NAME", "langsmith")
66-
resource = Resource(attributes={SERVICE_NAME: service_name})
66+
resource = Resource(
67+
attributes={
68+
SERVICE_NAME: service_name,
69+
# Marker to identify LangSmith's internal provider
70+
"langsmith.internal_provider": True,
71+
}
72+
)
6773

6874
tracer_provider = TracerProvider(resource=resource)
6975

python/langsmith/run_trees.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def set(
217217
by the @traceable decorator.
218218
219219
If your LangChain or LangGraph versions are sufficiently up-to-date,
220-
this will also override the default behavior LangChainTracer.
220+
this will also override the default behavior of LangChainTracer.
221221
222222
Args:
223223
inputs: The inputs to set.

0 commit comments

Comments
 (0)