From f8adce65df9e7d4f2243b6b87f604dbcb3521656 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 12:25:02 +0100 Subject: [PATCH 01/11] add timers to turn --- .../core/observability/metrics.py | 219 ++++++++++++---- .../core/turn_detection/turn_detection.py | 33 ++- .../plugins/krisp/turn_detection.py | 2 +- .../smart_turn/smart_turn_detection.py | 60 +++-- .../ultralytics/yolo_pose_processor.py | 36 ++- .../plugins/vogent/vogent_turn_detection.py | 234 ++++++++++-------- 6 files changed, 403 insertions(+), 181 deletions(-) diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index 86b7bd85..91bdedda 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -1,51 +1,14 @@ -"""OpenTelemetry observability instrumentation for vision-agents library. - -This module defines metrics and tracers for the vision-agents library. It does NOT -configure OpenTelemetry providers - that is the responsibility of applications using -this library. - -For applications using this library: - To enable telemetry, configure OpenTelemetry in your application before importing - vision-agents components: - - ```python - from opentelemetry import trace, metrics - from opentelemetry.sdk.trace import TracerProvider - from opentelemetry.sdk.metrics import MeterProvider - from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter - from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter - from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader - from opentelemetry.sdk.resources import Resource - - # Configure your service - resource = Resource.create({ - "service.name": "my-voice-app", - "service.version": "1.0.0", - }) - - # Setup trace provider - trace_provider = TracerProvider(resource=resource) - trace_provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317")) - ) - trace.set_tracer_provider(trace_provider) - - # Setup metrics provider - metric_reader = PeriodicExportingMetricReader( - OTLPMetricExporter(endpoint="http://localhost:4317") - ) - metrics_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) - metrics.set_meter_provider(metrics_provider) - - # Now import and use vision-agents - from vision_agents.core.tts import TTS - ``` - - If no providers are configured, metrics and traces will be no-ops. -""" +from __future__ import annotations + +import functools +import inspect +from typing import Dict, Any, Optional, Mapping, Callable, Awaitable, TypeVar, Union from opentelemetry import trace, metrics +from opentelemetry.metrics import Histogram +import time + +R = TypeVar("R") # Get tracer and meter using the library name # These will use whatever providers the application has configured @@ -75,3 +38,167 @@ inflight_ops = meter.create_up_down_counter( "voice.ops.inflight", description="Inflight voice ops" ) + +turn_detection_latency_ms = meter.create_histogram( + "turn.detection.latency.ms", + unit="ms", +) + + +class Timer: + """ + Can be used as: + done = Timer(hist, {"attr": 1}) + ... + done({"phase": "init"}) + + with Timer(hist, {"attr": 1}) as timer: + timer.attributes["dynamic_key"] = "dynamic_value" + ... + + @Timer(hist, {"route": "/join"}) + def handler(...): ... + + @Timer(hist) + async def async_handler(...): ... 
+ + If decorating a method, automatically adds {"class": } to attributes. + + When used as a context manager, you can add attributes dynamically via the + `attributes` property, which will be merged with base attributes when recording. + """ + + def __init__( + self, + hist: Histogram, + attributes: Optional[Mapping[str, Any]] = None, + *, + unit: str = "ms", + record_exceptions: bool = True, + ) -> None: + self._hist = hist + self._base_attrs: Dict[str, Any] = dict(attributes or {}) + self._unit = unit + self._record_exceptions = record_exceptions + + self._start_ns = time.perf_counter_ns() + self._stopped = False + self.last_elapsed_ms: Optional[float] = None + + # Public attributes dictionary that can be modified during context manager usage + self.attributes: Dict[str, Any] = {} + + def __call__(self, *args, **kwargs): + """If called with a function, act as a decorator; else record.""" + if args and callable(args[0]) and len(args) == 1 and not kwargs: + func = args[0] + return self._decorate(func) + extra_attrs = args[0] if args else None + return self.stop(extra_attrs) + + def __enter__(self) -> "Timer": + self._restart() + return self + + def __exit__(self, exc_type, exc, tb) -> None: + attrs: Dict[str, Any] = {} + if self._record_exceptions: + attrs["exception"] = "true" if exc_type else "false" + if exc_type: + attrs["exception_type"] = getattr(exc_type, "__name__", str(exc_type)) + self.stop(attrs) + + def stop(self, extra_attributes: Optional[Mapping[str, Any]] = None) -> float: + """Idempotent: records only once per start.""" + if not self._stopped: + self._stopped = True + elapsed = self.elapsed_ms() + self.last_elapsed_ms = elapsed + + attrs = {**self._base_attrs} + # Merge the dynamic attributes set during context manager usage + attrs.update(self.attributes) + if extra_attributes: + attrs.update(dict(extra_attributes)) + + value = elapsed if self._unit == "ms" else elapsed / 1000.0 + self._hist.record(value, attributes=attrs) + + return self.last_elapsed_ms or 0.0 + + def elapsed_ms(self) -> float: + return (time.perf_counter_ns() - self._start_ns) / 1_000_000.0 + + def _restart(self) -> None: + self._start_ns = time.perf_counter_ns() + self._stopped = False + self.last_elapsed_ms = None + self.attributes = {} # Reset dynamic attributes on restart + + def _decorate( + self, func: Union[Callable[..., R], Callable[..., Awaitable[R]]] + ) -> Union[Callable[..., R], Callable[..., Awaitable[R]]]: + """ + Decorate a function or method. + Automatically adds {"class": } if decorating a bound method. 
+ """ + + is_async = inspect.iscoroutinefunction(func) + + if is_async: + # Type-cast func as async for type checker + async_func: Callable[..., Awaitable[R]] = func # type: ignore[assignment] + + @functools.wraps(async_func) + async def async_wrapper(*args, **kwargs) -> R: + class_name = _get_class_name_from_args(async_func, args) + attrs = {**self._base_attrs} + if class_name: + attrs["class"] = class_name + with Timer( + self._hist, + attrs, + unit=self._unit, + record_exceptions=self._record_exceptions, + ): + return await async_func(*args, **kwargs) + + return async_wrapper + else: + # Type-cast func as sync for type checker + sync_func: Callable[..., R] = func # type: ignore[assignment] + + @functools.wraps(sync_func) + def sync_wrapper(*args, **kwargs) -> R: + class_name = _get_class_name_from_args(sync_func, args) + attrs = {**self._base_attrs} + if class_name: + attrs["class"] = class_name + with Timer( + self._hist, + attrs, + unit=self._unit, + record_exceptions=self._record_exceptions, + ): + return sync_func(*args, **kwargs) + + return sync_wrapper + + +def _get_class_name_from_args( + func: Callable[..., Any], args: tuple[Any, ...] +) -> Optional[str]: + """Return class name if first arg looks like a bound method (self or cls).""" + if not args: + return None + + first = args[0] + + if hasattr(first, "__class__") and func.__qualname__.startswith( + first.__class__.__name__ + "." + ): + return first.__class__.__name__ + + if inspect.isclass(first) and func.__qualname__.startswith(first.__name__ + "."): + return first.__name__ + return None diff --git a/agents-core/vision_agents/core/turn_detection/turn_detection.py b/agents-core/vision_agents/core/turn_detection/turn_detection.py index e61f507b..29105c99 100644 --- a/agents-core/vision_agents/core/turn_detection/turn_detection.py +++ b/agents-core/vision_agents/core/turn_detection/turn_detection.py @@ -8,6 +8,7 @@ from .events import TurnStartedEvent, TurnEndedEvent from ..agents.conversation import Conversation from ..edge.types import Participant +from ..observability.metrics import turn_detection_latency_ms, Timer class TurnEvent(Enum): @@ -17,14 +18,11 @@ class TurnEvent(Enum): TURN_ENDED = "turn_ended" - class TurnDetector(ABC): """Base implementation for turn detection with common functionality.""" def __init__( - self, - confidence_threshold: float = 0.5, - provider_name: Optional[str] = None + self, confidence_threshold: float = 0.5, provider_name: Optional[str] = None ) -> None: self._confidence_threshold = confidence_threshold self.is_active = False @@ -33,21 +31,17 @@ def __init__( self.events = EventManager() self.events.register_events_from_module(events, ignore_not_compatible=True) - def _emit_start_turn_event( - self, event: TurnStartedEvent - ) -> None: + def _emit_start_turn_event(self, event: TurnStartedEvent) -> None: event.session_id = self.session_id event.plugin_name = self.provider_name self.events.send(event) - def _emit_end_turn_event( - self, event: TurnEndedEvent - ) -> None: + def _emit_end_turn_event(self, event: TurnEndedEvent) -> None: event.session_id = self.session_id event.plugin_name = self.provider_name self.events.send(event) - @abstractmethod + @Timer(turn_detection_latency_ms) async def process_audio( self, audio_data: PcmData, @@ -62,6 +56,23 @@ async def process_audio( conversation: Transcription/ chat history, sometimes useful for turn detection """ + return await self.detect_turn(audio_data, participant, conversation) + + @abstractmethod + async def detect_turn( + self, + audio_data: 
PcmData, + participant: Participant, + conversation: Optional[Conversation], + ) -> None: + """Process the audio and trigger turn start or turn end events + + Args: + audio_data: PcmData object containing audio samples from Stream + participant: Participant that's speaking, includes user data + conversation: Transcription/ chat history, sometimes useful for turn detection + """ + ... async def start(self) -> None: diff --git a/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py b/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py index b62d15e5..6a1d2d1d 100644 --- a/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py +++ b/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py @@ -75,7 +75,7 @@ def is_detecting(self) -> bool: """Check if turn detection is currently active.""" return self._is_detecting - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py index fd158546..411ea76f 100644 --- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py +++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py @@ -13,13 +13,14 @@ from vision_agents.core.agents import Conversation from vision_agents.core.agents.agents import default_agent_options, AgentOptions from vision_agents.core.edge.types import Participant +from vision_agents.core.observability import meter +from vision_agents.core.observability.metrics import Timer from vision_agents.core.turn_detection import ( TurnDetector, TurnStartedEvent, TurnEndedEvent, ) - import logging logger = logging.getLogger(__name__) @@ -41,6 +42,17 @@ ) +turn_silero_vad_latency_ms = meter.create_histogram( + "turn.silero.vad.latency.ms", + unit="ms", +) + +turn_smart_turn_detection_latency_ms = meter.create_histogram( + "turn.smart_turn.detection.latency.ms", + unit="ms", +) + + @dataclass class Silence: trailing_silence_chunks: int = 0 @@ -109,7 +121,9 @@ def __init__( self._audio_queue: asyncio.Queue[Any] = asyncio.Queue() self._processing_task: Optional[asyncio.Task[Any]] = None self._shutdown_event = asyncio.Event() - self._processing_active = asyncio.Event() # Tracks if background task is processing + self._processing_active = ( + asyncio.Event() + ) # Tracks if background task is processing if options is None: self.options = default_agent_options() @@ -149,7 +163,7 @@ async def _prepare_silero_vad(self): SileroVAD, path, reset_interval_seconds=self.vad_reset_interval_seconds ) - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, @@ -177,7 +191,7 @@ async def _process_audio_loop(self): # Signal that we're actively processing self._processing_active.set() - + try: # Process the audio packet await self._process_audio_packet(audio_data, participant) @@ -234,7 +248,9 @@ async def _process_audio_packet( # detect speech in small 512 chunks, gather to larger audio segments with speech for chunk in audio_chunks[:-1]: # predict if this segment has speech - speech_probability = await self.vad.predict_speech(chunk.samples) + with Timer(turn_silero_vad_latency_ms) as timer: + timer.attributes["samples"] = len(chunk.samples) + speech_probability = await self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold if self._active_segment is not None: @@ -252,7 +268,11 @@ async def 
_process_audio_packet( # TODO: make this testable trailing_silence_ms = ( - self._silence.trailing_silence_chunks * 512 / 16000 * 1000 * 5 #DTX correction + self._silence.trailing_silence_chunks + * 512 + / 16000 + * 1000 + * 5 # DTX correction ) long_silence = trailing_silence_ms > self._trailing_silence_ms max_duration_reached = ( @@ -269,7 +289,15 @@ async def _process_audio_packet( merged.append(self._active_segment) merged = merged.tail(8, True, "start") # see if we've completed the turn - prediction = await self._predict_turn_completed(merged, participant) + with Timer(turn_smart_turn_detection_latency_ms) as timer: + timer.attributes["audio_duration_ms"] = merged.duration_ms + timer.attributes["samples"] = len(merged.samples) + timer.attributes["trailing_silence_ms"] = trailing_silence_ms + prediction = await self._predict_turn_completed( + merged, participant + ) + timer.attributes["prediction"] = prediction + timer.attributes["turn_ended"] = prediction > 0.5 turn_ended = prediction > 0.5 if turn_ended: self._emit_end_turn_event( @@ -304,19 +332,19 @@ async def _process_audio_packet( async def wait_for_processing_complete(self, timeout: float = 5.0) -> None: """Wait for all queued audio to be processed. Useful for testing.""" start_time = time.time() - + # Wait for queue to be empty AND no active processing while (time.time() - start_time) < timeout: queue_empty = self._audio_queue.qsize() == 0 not_processing = not self._processing_active.is_set() - + if queue_empty and not_processing: # Give a small final buffer to ensure events are emitted await asyncio.sleep(0.05) return - + await asyncio.sleep(0.01) - + # Timeout reached logger.warning(f"wait_for_processing_complete timed out after {timeout}s") @@ -380,16 +408,16 @@ def _blocking_predict_turn_completed( def _build_smart_turn_session(self): path = os.path.join(self.options.model_dir, SMART_TURN_ONNX_FILENAME) - + # Load model into memory to avoid multi-worker file access issues with open(path, "rb") as f: model_bytes = f.read() - + so = ort.SessionOptions() so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL so.inter_op_num_threads = 1 so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL - + # Load from memory instead of file path return ort.InferenceSession(model_bytes, sess_options=so) @@ -408,10 +436,10 @@ def __init__(self, model_path: str, reset_interval_seconds: float = 5.0): # Load model into memory to avoid multi-worker file access issues with open(model_path, "rb") as f: model_bytes = f.read() - + opts = ort.SessionOptions() opts.inter_op_num_threads = 1 - + # Load from memory instead of file path self.session = ort.InferenceSession(model_bytes, sess_options=opts) self.context_size = 64 # Silero uses 64-sample context at 16 kHz diff --git a/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py b/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py index 0adbbd06..bf1e9274 100644 --- a/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py +++ b/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py @@ -20,9 +20,15 @@ ) from vision_agents.core.utils.queue import LatestNQueue from vision_agents.core.utils.video_forwarder import VideoForwarder +from vision_agents.core.observability.metrics import Timer, meter logger = logging.getLogger(__name__) +# Metrics for YOLO pose detection +yolo_pose_inference_ms = meter.create_histogram( + "yolo.pose.inference.ms", unit="ms", description="YOLO pose inference latency" +) + 
DEFAULT_WIDTH = 640 DEFAULT_HEIGHT = 480 DEFAULT_WIDTH = 1920 @@ -310,16 +316,28 @@ def _process_pose_sync( ) # Run pose detection - yolo_start = time.perf_counter() - pose_results = self.pose_model( - frame_array, - verbose=False, - # imgsz=self.imgsz, - conf=self.conf_threshold, - device=self.device, + with Timer(yolo_pose_inference_ms) as timer: + timer.attributes["frame_width"] = frame_array.shape[1] + timer.attributes["frame_height"] = frame_array.shape[0] + timer.attributes["conf_threshold"] = self.conf_threshold + timer.attributes["device"] = str(self.device) + + pose_results = self.pose_model( + frame_array, + verbose=False, + # imgsz=self.imgsz, + conf=self.conf_threshold, + device=self.device, + ) + + # Add detected person count to metrics + timer.attributes["persons_detected"] = ( + len(pose_results) if pose_results else 0 + ) + + logger.debug( + f"🎯 YOLO inference completed in {timer.last_elapsed_ms:.1f}ms" ) - yolo_time = time.perf_counter() - yolo_start - logger.debug(f"🎯 YOLO inference completed in {yolo_time:.3f}s") if not pose_results: logger.debug("❌ No pose results detected") diff --git a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py index 5b902973..34b352ee 100644 --- a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py +++ b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py @@ -17,11 +17,27 @@ TurnStartedEvent, TurnEndedEvent, ) +from vision_agents.core.observability.metrics import Timer, meter import logging logger = logging.getLogger(__name__) +# Metrics for Vogent turn detection +vogent_vad_latency_ms = meter.create_histogram( + "vogent.vad.latency.ms", unit="ms", description="Vogent VAD prediction latency" +) +vogent_whisper_latency_ms = meter.create_histogram( + "vogent.whisper.latency.ms", + unit="ms", + description="Vogent Whisper transcription latency", +) +vogent_turn_prediction_latency_ms = meter.create_histogram( + "vogent.turn_prediction.latency.ms", + unit="ms", + description="Vogent turn completion prediction latency", +) + # Silero VAD model (reused from smart_turn) SILERO_ONNX_FILENAME = "silero_vad.onnx" SILERO_ONNX_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx" @@ -40,15 +56,15 @@ class Silence: class VogentTurnDetection(TurnDetector): """ Vogent Turn Detection combines audio intonation and text context for accurate turn detection. - + This implementation: 1. Uses Silero VAD to detect when speech starts/stops 2. Uses faster-whisper to transcribe audio in real-time 3. Uses Vogent Turn model (multimodal) to detect turn completion - + Vogent operates on both audio features AND text context, making it more accurate than audio-only approaches, especially for handling incomplete thoughts. - + Reference: https://github.com/vogent/vogent-turn Blogpost: https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents """ @@ -66,7 +82,7 @@ def __init__( ): """ Initialize Vogent Turn Detection. 
- + Args: whisper_model_size: Faster-whisper model size (tiny, base, small, medium, large) vad_reset_interval_seconds: Reset VAD internal state every N seconds to prevent drift @@ -78,7 +94,7 @@ def __init__( model_dir: Directory to store model files """ super().__init__() - + # Configuration parameters self.whisper_model_size = whisper_model_size self.vad_reset_interval_seconds = vad_reset_interval_seconds @@ -88,7 +104,7 @@ def __init__( self.max_segment_duration_seconds = max_segment_duration_seconds self.vogent_threshold = vogent_threshold self.model_dir = model_dir - + # Audio buffering for processing self._audio_buffer = PcmData( sample_rate=RATE, channels=1, format=AudioFormat.F32 @@ -99,12 +115,12 @@ def __init__( ) self._active_segment: Optional[PcmData] = None self._trailing_silence_ms = self.silence_duration_ms - + # Producer-consumer pattern: audio packets go into buffer, background task processes them self._audio_queue: asyncio.Queue[Any] = asyncio.Queue() self._processing_task: Optional[asyncio.Task[Any]] = None self._shutdown_event = asyncio.Event() - + # Model instances (initialized in start()) self.vad = None self.whisper = None @@ -114,17 +130,17 @@ async def start(self): """Initialize models and prepare for turn detection.""" # Ensure model directory exists os.makedirs(self.model_dir, exist_ok=True) - + # Prepare models in parallel await asyncio.gather( self._prepare_silero_vad(), self._prepare_whisper(), self._prepare_vogent(), ) - + # Start background processing task self._processing_task = asyncio.create_task(self._process_audio_loop()) - + # Call parent start method await super().start() @@ -133,8 +149,10 @@ async def _prepare_silero_vad(self) -> None: path = os.path.join(self.model_dir, SILERO_ONNX_FILENAME) await ensure_model(path, SILERO_ONNX_URL) # Initialize VAD in thread pool to avoid blocking event loop - self.vad = await asyncio.to_thread( - lambda: SileroVAD(path, reset_interval_seconds=self.vad_reset_interval_seconds) # type: ignore + self.vad = await asyncio.to_thread( # type: ignore[func-returns-value] + lambda: SileroVAD( # type: ignore[arg-type] + path, reset_interval_seconds=self.vad_reset_interval_seconds + ) ) async def _prepare_whisper(self) -> None: @@ -142,7 +160,9 @@ async def _prepare_whisper(self) -> None: logger.info(f"Loading faster-whisper model: {self.whisper_model_size}") # Load whisper in thread pool to avoid blocking event loop self.whisper = await asyncio.to_thread( # type: ignore[func-returns-value] - lambda: WhisperModel(self.whisper_model_size, device="cpu", compute_type="int8") + lambda: WhisperModel( + self.whisper_model_size, device="cpu", compute_type="int8" + ) ) logger.info("Faster-whisper model loaded") @@ -162,7 +182,7 @@ async def _prepare_vogent(self) -> None: ) logger.info("Vogent turn detection model loaded") - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, @@ -204,7 +224,7 @@ async def _process_audio_packet( ) -> None: """ Process audio packet through VAD -> Whisper -> Vogent pipeline. - + This method: 1. Buffers audio and processes in 512-sample chunks 2. Uses VAD to detect speech @@ -212,7 +232,7 @@ async def _process_audio_packet( 4. 
When reaching silence or max duration: - Transcribes segment with Whisper - Checks turn completion with Vogent (audio + text) - + Args: audio_data: PcmData object containing audio samples participant: Participant that's speaking @@ -239,8 +259,10 @@ async def _process_audio_packet( # Predict if this segment has speech if self.vad is None: continue - - speech_probability = self.vad.predict_speech(chunk.samples) + + with Timer(vogent_vad_latency_ms) as timer: + timer.attributes["samples"] = len(chunk.samples) + speech_probability = self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold if self._active_segment is not None: @@ -256,7 +278,11 @@ async def _process_audio_packet( self._silence.trailing_silence_chunks += 1 trailing_silence_ms = ( - self._silence.trailing_silence_chunks * CHUNK / RATE * 1000 * 5 # DTX correction + self._silence.trailing_silence_chunks + * CHUNK + / RATE + * 1000 + * 5 # DTX correction ) long_silence = trailing_silence_ms > self._trailing_silence_ms max_duration_reached = ( @@ -272,20 +298,20 @@ async def _process_audio_packet( merged.append(self._pre_speech_buffer) merged.append(self._active_segment) merged = merged.tail(8, True, "start") - + # Transcribe the segment with Whisper transcription = await self._transcribe_segment(merged) - + # Get previous line from conversation for context prev_line = self._get_previous_line(conversation) - + # Check if turn is complete using Vogent (multimodal: audio + text) is_complete = await self._predict_turn_completed( merged, prev_line=prev_line, curr_line=transcription, ) - + if is_complete: self._emit_end_turn_event( TurnEndedEvent( @@ -303,7 +329,7 @@ async def _process_audio_packet( ) self._pre_speech_buffer.append(merged) self._pre_speech_buffer = self._pre_speech_buffer.tail(8) - + elif is_speech and self._active_segment is None: self._emit_start_turn_event(TurnStartedEvent(participant=participant)) # Create a new segment @@ -342,103 +368,115 @@ async def stop(self): async def _transcribe_segment(self, pcm: PcmData) -> str: """ Transcribe audio segment using faster-whisper. 
- + Args: pcm: PcmData containing audio samples - + Returns: Transcribed text """ - # Ensure it's 16khz and f32 format - pcm = pcm.resample(16000).to_float32() - audio_array = pcm.samples - - if self.whisper is None: - return "" - - # Run transcription in thread pool to avoid blocking - segments, info = await asyncio.to_thread( - self.whisper.transcribe, - audio_array, - language="en", - beam_size=1, - vad_filter=False, # We already did VAD - ) - - # Collect all text segments - text_parts = [] - for segment in segments: - text_parts.append(segment.text.strip()) - - transcription = " ".join(text_parts).strip() + with Timer(vogent_whisper_latency_ms) as timer: + # Ensure it's 16khz and f32 format + pcm = pcm.resample(16000).to_float32() + audio_array = pcm.samples + timer.attributes["audio_duration_ms"] = pcm.duration_ms + timer.attributes["samples"] = len(audio_array) + + if self.whisper is None: + return "" + + # Run transcription in thread pool to avoid blocking + segments, info = await asyncio.to_thread( + self.whisper.transcribe, + audio_array, + language="en", + beam_size=1, + vad_filter=False, # We already did VAD + ) + + # Collect all text segments + text_parts = [] + for segment in segments: + text_parts.append(segment.text.strip()) + + transcription = " ".join(text_parts).strip() + timer.attributes["transcription_length"] = len(transcription) + return transcription async def _predict_turn_completed( - self, - pcm: PcmData, + self, + pcm: PcmData, prev_line: str, curr_line: str, ) -> bool: """ Predict whether the current turn is complete using Vogent. - + Args: pcm: PcmData containing audio samples prev_line: Previous speaker's text (for context) curr_line: Current speaker's text - + Returns: True if turn is complete, False otherwise """ - # Ensure it's 16khz and f32 format - pcm = pcm.resample(16000).to_float32() - - # Truncate to 8 seconds - audio_array = pcm.tail(8, False).samples - - if self.vogent is None: - return False - - # Run vogent prediction in thread pool - result = await asyncio.to_thread( - self.vogent.predict, - audio_array, - prev_line=prev_line, - curr_line=curr_line, - sample_rate=16000, - return_probs=True, - ) - - # Check if probability exceeds threshold - is_complete = result['prob_endpoint'] > self.vogent_threshold - logger.debug( - f"Vogent probability: {result['prob_endpoint']:.3f}, " - f"threshold: {self.vogent_threshold}, is_complete: {is_complete}" - ) - + with Timer(vogent_turn_prediction_latency_ms) as timer: + # Ensure it's 16khz and f32 format + pcm = pcm.resample(16000).to_float32() + + # Truncate to 8 seconds + audio_array = pcm.tail(8, False).samples + timer.attributes["audio_duration_ms"] = len(audio_array) / 16000 * 1000 + timer.attributes["prev_line_length"] = len(prev_line) + timer.attributes["curr_line_length"] = len(curr_line) + + if self.vogent is None: + return False + + # Run vogent prediction in thread pool + result = await asyncio.to_thread( + self.vogent.predict, + audio_array, + prev_line=prev_line, + curr_line=curr_line, + sample_rate=16000, + return_probs=True, + ) + + # Check if probability exceeds threshold + is_complete = result["prob_endpoint"] > self.vogent_threshold + timer.attributes["probability"] = result["prob_endpoint"] + timer.attributes["is_complete"] = is_complete + + logger.debug( + f"Vogent probability: {result['prob_endpoint']:.3f}, " + f"threshold: {self.vogent_threshold}, is_complete: {is_complete}" + ) + return is_complete def _get_previous_line(self, conversation: Optional[Conversation]) -> str: """ Extract the 
previous speaker's line from conversation history. - + Args: conversation: Conversation object with message history - + Returns: Previous line text, or empty string if not available """ if conversation is None or not conversation.messages: return "" - + # Get the last message that's not from the current speaker # Typically this would be the assistant or another user for message in reversed(conversation.messages): if message.content and message.content.strip(): # Remove terminal punctuation for better vogent performance - text = message.content.strip().rstrip('.!?') + text = message.content.strip().rstrip(".!?") return text - + return "" @@ -446,20 +484,20 @@ def _get_previous_line(self, conversation: Optional[Conversation]) -> str: class SileroVAD: """ Minimal Silero VAD ONNX wrapper for 16 kHz, mono, chunk=512. - + Reused from smart_turn implementation. """ def __init__(self, model_path: str, reset_interval_seconds: float = 5.0): """ Initialize Silero VAD. - + Args: model_path: Path to the ONNX model file reset_interval_seconds: Reset internal state every N seconds to prevent drift """ import onnxruntime as ort - + opts = ort.SessionOptions() opts.inter_op_num_threads = 1 self.session = ort.InferenceSession(model_path, sess_options=opts) @@ -512,43 +550,43 @@ def predict_speech(self, chunk_f32: np.ndarray) -> float: async def ensure_model(path: str, url: str) -> str: """ Download a model file asynchronously if it doesn't exist. - + Args: path: Local path where the model should be saved url: URL to download the model from - + Returns: The path to the model file """ if not os.path.exists(path): model_name = os.path.basename(path) logger.info(f"Downloading {model_name}...") - + try: - async with httpx.AsyncClient(timeout=300.0, follow_redirects=True) as client: + async with httpx.AsyncClient( + timeout=300.0, follow_redirects=True + ) as client: async with client.stream("GET", url) as response: response.raise_for_status() - + # Write file in chunks to avoid loading entire file in memory chunks = [] async for chunk in response.aiter_bytes(chunk_size=8192): chunks.append(chunk) - + # Write all chunks to file in thread to avoid blocking event loop def write_file(): with open(path, "wb") as f: for chunk in chunks: f.write(chunk) - + await asyncio.to_thread(write_file) - + logger.info(f"{model_name} downloaded.") except httpx.HTTPError as e: # Clean up partial download on error if os.path.exists(path): os.remove(path) raise RuntimeError(f"Failed to download {model_name}: {e}") - - return path - + return path From 744f50b8668e068cd6dd5376c870dfd1e9df1ad8 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 14:10:53 +0100 Subject: [PATCH 02/11] metrics for stt and built-in observability stack --- agents-core/tests/test_timer.py | 446 ++++++++++++++ .../core/observability/metrics.py | 25 +- agents-core/vision_agents/core/stt/stt.py | 110 +++- agents-core/vision_agents/core/tts/tts.py | 6 +- observability/.gitignore | 4 + observability/README.md | 173 ++++++ .../grafana/dashboards/vision-agents.json | 557 ++++++++++++++++++ observability/grafana/init-home-dashboard.sh | 33 ++ .../provisioning/dashboards/default.yml | 13 + .../provisioning/datasources/prometheus.yml | 12 + observability/prometheus/prometheus.yml | 21 + .../plugins/deepgram/deepgram_stt.py | 23 +- .../fish/vision_agents/plugins/fish/stt.py | 6 +- .../vision_agents/plugins/wizper/stt.py | 2 +- 14 files changed, 1382 insertions(+), 49 deletions(-) create mode 100644 agents-core/tests/test_timer.py create mode 100644 
observability/.gitignore create mode 100644 observability/README.md create mode 100644 observability/grafana/dashboards/vision-agents.json create mode 100755 observability/grafana/init-home-dashboard.sh create mode 100644 observability/grafana/provisioning/dashboards/default.yml create mode 100644 observability/grafana/provisioning/datasources/prometheus.yml create mode 100644 observability/prometheus/prometheus.yml diff --git a/agents-core/tests/test_timer.py b/agents-core/tests/test_timer.py new file mode 100644 index 00000000..81e83347 --- /dev/null +++ b/agents-core/tests/test_timer.py @@ -0,0 +1,446 @@ +"""Tests for the Timer class in observability metrics.""" + +import asyncio +import pytest +from unittest.mock import MagicMock +from vision_agents.core.observability.metrics import Timer + + +@pytest.fixture +def mock_histogram(): + """Create a mock histogram for testing.""" + return MagicMock() + + +class TestTimerContextManager: + """Tests for Timer used as a context manager.""" + + def test_context_manager_records_timing(self, mock_histogram): + """Test that Timer records elapsed time when used as context manager.""" + with Timer(mock_histogram) as timer: + pass # Do nothing, just measure overhead + + # Verify record was called + mock_histogram.record.assert_called_once() + call_args = mock_histogram.record.call_args + + # First argument should be elapsed time in ms + elapsed_ms = call_args[0][0] + assert isinstance(elapsed_ms, float) + assert elapsed_ms >= 0 + + # Should have recorded the elapsed time + assert timer.last_elapsed_ms is not None + assert timer.last_elapsed_ms >= 0 + + def test_context_manager_with_base_attributes(self, mock_histogram): + """Test that base attributes are included in recording.""" + base_attrs = {"provider": "test", "version": "1.0"} + + with Timer(mock_histogram, base_attrs): + pass + + # Verify attributes were passed + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert "provider" in recorded_attrs + assert recorded_attrs["provider"] == "test" + assert "version" in recorded_attrs + assert recorded_attrs["version"] == "1.0" + + def test_context_manager_with_dynamic_attributes(self, mock_histogram): + """Test that dynamic attributes can be added during execution.""" + with Timer(mock_histogram, {"base": "value"}) as timer: + timer.attributes["dynamic"] = "added" + timer.attributes["count"] = 42 + + # Verify both base and dynamic attributes were recorded + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["base"] == "value" + assert recorded_attrs["dynamic"] == "added" + assert recorded_attrs["count"] == 42 + + def test_context_manager_exception_tracking(self, mock_histogram): + """Test that exceptions are tracked in attributes.""" + try: + with Timer(mock_histogram, record_exceptions=True): + raise ValueError("test error") + except ValueError: + pass + + # Verify exception was recorded + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["exception"] == "true" + assert recorded_attrs["exception_type"] == "ValueError" + + def test_context_manager_no_exception(self, mock_histogram): + """Test that no exception is recorded when code succeeds.""" + with Timer(mock_histogram, record_exceptions=True): + pass + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["exception"] == "false" + assert "exception_type" not 
in recorded_attrs + + def test_direct_call_pattern(self, mock_histogram): + """Test Timer used with direct call pattern.""" + timer = Timer(mock_histogram, {"base": "attr"}) + + # Simulate some work + import time + + time.sleep(0.01) + + # Call with extra attributes + elapsed = timer({"phase": "init"}) + + # Verify recording + assert elapsed > 0 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["base"] == "attr" + assert recorded_attrs["phase"] == "init" + + def test_stop_is_idempotent(self, mock_histogram): + """Test that calling stop multiple times only records once.""" + timer = Timer(mock_histogram) + + timer.stop() + timer.stop() + timer.stop() + + # Should only be called once + assert mock_histogram.record.call_count == 1 + + +class TestTimerDecorator: + """Tests for Timer used as a decorator.""" + + def test_sync_function_decorator(self, mock_histogram): + """Test decorating a synchronous function.""" + + @Timer(mock_histogram, {"func": "test"}) + def my_function(x, y): + return x + y + + result = my_function(2, 3) + + assert result == 5 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + assert recorded_attrs["func"] == "test" + + async def test_async_function_decorator(self, mock_histogram): + """Test decorating an async function.""" + + @Timer(mock_histogram, {"func": "async_test"}) + async def my_async_function(x): + await asyncio.sleep(0.01) + return x * 2 + + result = await my_async_function(5) + + assert result == 10 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + assert recorded_attrs["func"] == "async_test" + + def test_method_decorator_adds_class_name(self, mock_histogram): + """Test that decorating a method automatically adds class name.""" + + class MyClass: + @Timer(mock_histogram, {"method": "process"}) + def process(self): + return "processed" + + instance = MyClass() + result = instance.process() + + assert result == "processed" + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should automatically add fully qualified class path + assert "class" in recorded_attrs + # Check it ends with the class name (module path will vary) + assert recorded_attrs["class"].endswith(".MyClass") + assert recorded_attrs["method"] == "process" + + async def test_async_method_decorator_adds_class_name(self, mock_histogram): + """Test that decorating an async method adds class name.""" + + class MyAsyncClass: + @Timer(mock_histogram) + async def async_process(self): + await asyncio.sleep(0.01) + return "async_processed" + + instance = MyAsyncClass() + result = await instance.async_process() + + assert result == "async_processed" + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert "class" in recorded_attrs + assert recorded_attrs["class"].endswith(".MyAsyncClass") + + +class TestTimerInheritedMethods: + """Tests for Timer with inherited methods - the bug fix.""" + + def test_inherited_method_reports_subclass_name(self, mock_histogram): + """Test that inherited methods report the actual subclass name.""" + + class BaseClass: + @Timer(mock_histogram) + def process(self): + return "processed" + + class 
SubClassA(BaseClass): + pass + + class SubClassB(BaseClass): + pass + + # Test SubClassA + instance_a = SubClassA() + instance_a.process() + + # Test SubClassB + instance_b = SubClassB() + instance_b.process() + + # Should have been called twice + assert mock_histogram.record.call_count == 2 + + # Check first call (SubClassA) + first_call = mock_histogram.record.call_args_list[0] + first_attrs = first_call[1]["attributes"] + assert first_attrs["class"].endswith(".SubClassA") + + # Check second call (SubClassB) + second_call = mock_histogram.record.call_args_list[1] + second_attrs = second_call[1]["attributes"] + assert second_attrs["class"].endswith(".SubClassB") + + async def test_inherited_async_method_reports_subclass_name(self, mock_histogram): + """Test that inherited async methods report the actual subclass name.""" + + class AsyncBaseClass: + @Timer(mock_histogram) + async def process(self): + await asyncio.sleep(0.01) + return "processed" + + class AsyncSubClass(AsyncBaseClass): + pass + + instance = AsyncSubClass() + await instance.process() + + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the subclass path, not the base class + assert recorded_attrs["class"].endswith(".AsyncSubClass") + + def test_deeply_nested_inheritance(self, mock_histogram): + """Test that deep inheritance chains still report the correct class.""" + + class GrandParent: + @Timer(mock_histogram) + def process(self): + return "processed" + + class Parent(GrandParent): + pass + + class Child(Parent): + pass + + instance = Child() + instance.process() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the most specific class path + assert recorded_attrs["class"].endswith(".Child") + + +class TestTimerUnits: + """Tests for Timer unit conversions.""" + + def test_millisecond_unit_default(self, mock_histogram): + """Test that default unit is milliseconds.""" + with Timer(mock_histogram): + pass + + call_args = mock_histogram.record.call_args + elapsed = call_args[0][0] + + # Value should be in milliseconds (small positive number) + assert elapsed >= 0 + assert elapsed < 1000 # Should be less than 1 second for this test + + def test_second_unit_conversion(self, mock_histogram): + """Test that seconds unit converts correctly.""" + with Timer(mock_histogram, unit="s"): + import time + + time.sleep(0.01) # Sleep 10ms + + call_args = mock_histogram.record.call_args + elapsed_seconds = call_args[0][0] + + # Should be approximately 0.01 seconds + assert 0.005 < elapsed_seconds < 0.05 + + +class TestTimerEdgeCases: + """Tests for edge cases and error conditions.""" + + def test_timer_without_stop_in_context_manager(self, mock_histogram): + """Test that __exit__ always calls stop.""" + with Timer(mock_histogram) as timer: + # Don't call stop manually + pass + + # Should have been called by __exit__ + mock_histogram.record.assert_called_once() + assert timer.last_elapsed_ms is not None + + def test_restart_clears_attributes(self, mock_histogram): + """Test that restart clears dynamic attributes.""" + timer = Timer(mock_histogram) + + # First use + timer.attributes["first"] = "value1" + timer.stop() + + # Restart and use again + timer._restart() + timer.attributes["second"] = "value2" + timer.stop({"extra": "attr"}) + + # Second call should only have "second" and "extra", not "first" + second_call = mock_histogram.record.call_args_list[1] + second_attrs = 
second_call[1]["attributes"] + + assert "second" in second_attrs + assert "extra" in second_attrs + assert "first" not in second_attrs + + def test_elapsed_ms_while_running(self, mock_histogram): + """Test that elapsed_ms can be called while timer is running.""" + with Timer(mock_histogram) as timer: + import time + + time.sleep(0.01) + elapsed = timer.elapsed_ms() + assert elapsed > 0 + + # Final elapsed should be >= interim elapsed + assert timer.last_elapsed_ms >= elapsed + + def test_callable_check_in_call(self, mock_histogram): + """Test that __call__ with callable argument triggers decoration.""" + + def my_func(): + return 42 + + timer = Timer(mock_histogram) + decorated = timer(my_func) + + # Should return a wrapped function + assert callable(decorated) + assert decorated() == 42 + mock_histogram.record.assert_called_once() + + +class TestTimerRealWorldScenarios: + """Tests simulating real-world usage patterns.""" + + async def test_stt_pattern(self, mock_histogram): + """Test the pattern used in STT base class.""" + + class STT: + async def process_audio(self, audio_data): + with Timer(mock_histogram) as timer: + timer.attributes["provider"] = self.__class__.__name__ + timer.attributes["samples"] = len(audio_data) + + # Simulate processing + await asyncio.sleep(0.01) + + class DeepgramSTT(STT): + pass + + stt = DeepgramSTT() + await stt.process_audio([1, 2, 3, 4, 5]) + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["provider"] == "DeepgramSTT" + assert recorded_attrs["samples"] == 5 + + def test_turn_detection_pattern(self, mock_histogram): + """Test the pattern used in TurnDetector base class.""" + + class TurnDetector: + @Timer(mock_histogram) + async def process_audio(self, audio_data): + await asyncio.sleep(0.01) + return "turn_detected" + + class SmartTurnDetection(TurnDetector): + pass + + detector = SmartTurnDetection() + result = asyncio.run(detector.process_audio([1, 2, 3])) + + assert result == "turn_detected" + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the actual implementation class path + assert recorded_attrs["class"].endswith(".SmartTurnDetection") + + def test_multiple_nested_timers(self, mock_histogram): + """Test that nested timers work independently.""" + with Timer(mock_histogram, {"outer": "timer"}): + with Timer(mock_histogram, {"inner": "timer"}): + pass + + # Both should have recorded + assert mock_histogram.record.call_count == 2 + + # Check both calls had different attributes + first_call_attrs = mock_histogram.record.call_args_list[0][1]["attributes"] + second_call_attrs = mock_histogram.record.call_args_list[1][1]["attributes"] + + assert first_call_attrs["inner"] == "timer" + assert second_call_attrs["outer"] == "timer" diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index 91bdedda..c9aad222 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -188,17 +188,30 @@ def sync_wrapper(*args, **kwargs) -> R: def _get_class_name_from_args( func: Callable[..., Any], args: tuple[Any, ...] ) -> Optional[str]: - """Return class name if first arg looks like a bound method (self or cls).""" + """Return fully qualified class path if first arg looks like a bound method (self or cls). 
+ + For instance methods (self), we return the runtime class path (module.Class), not just + the class name. This provides better identification in metrics, especially when multiple + plugins use the same class name (e.g., TTS). + + Returns: + Fully qualified class path like "vision_agents.plugins.cartesia.tts.TTS" + or None if not a method call. + """ if not args: return None first = args[0] - if hasattr(first, "__class__") and func.__qualname__.startswith( - first.__class__.__name__ + "." - ): - return first.__class__.__name__ + # Check if this looks like an instance method call (self parameter) + if hasattr(first, "__class__") and not inspect.isclass(first): + # Verify it's actually a method by checking the function's qualname contains a dot + if "." in func.__qualname__: + # Return the fully qualified class path + return f"{first.__class__.__module__}.{first.__class__.__qualname__}" + # Check if this looks like a class method call (cls parameter) if inspect.isclass(first) and func.__qualname__.startswith(first.__name__ + "."): - return first.__name__ + return f"{first.__module__}.{first.__qualname__}" + return None diff --git a/agents-core/vision_agents/core/stt/stt.py b/agents-core/vision_agents/core/stt/stt.py index 0d4fce44..91f0a679 100644 --- a/agents-core/vision_agents/core/stt/stt.py +++ b/agents-core/vision_agents/core/stt/stt.py @@ -6,6 +6,7 @@ from ..edge.types import Participant from vision_agents.core.events.manager import EventManager +from ..observability.metrics import Timer, stt_latency_ms, stt_errors from . import events from .events import TranscriptResponse @@ -23,6 +24,7 @@ class STT(abc.ABC): process_audio is currently called every 20ms. The integration with turn keeping could be improved """ + closed: bool = False started: bool = False @@ -36,7 +38,6 @@ def __init__( self.events = EventManager() self.events.register_events_from_module(events, ignore_not_compatible=True) - def _emit_transcript_event( self, text: str, @@ -51,13 +52,15 @@ def _emit_transcript_event( participant: Participant metadata. response: Transcription response metadata. """ - self.events.send(events.STTTranscriptEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - text=text, - participant=participant, - response=response, - )) + self.events.send( + events.STTTranscriptEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + text=text, + participant=participant, + response=response, + ) + ) def _emit_partial_transcript_event( self, @@ -73,13 +76,15 @@ def _emit_partial_transcript_event( participant: Participant metadata. response: Transcription response metadata. """ - self.events.send(events.STTPartialTranscriptEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - text=text, - participant=participant, - response=response, - )) + self.events.send( + events.STTPartialTranscriptEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + text=text, + participant=participant, + response=response, + ) + ) def _emit_error_event( self, @@ -91,20 +96,73 @@ def _emit_error_event( Emit an error event. Note this should only be emitted for temporary errors. 
Permanent errors due to config etc should be directly raised """ - self.events.send(events.STTErrorEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - error=error, - context=context, - participant=participant, - error_code=getattr(error, "error_code", None), - is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)), - )) + # Increment error counter + stt_errors.add( + 1, {"provider": self.provider_name, "error_type": type(error).__name__} + ) + + self.events.send( + events.STTErrorEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + error=error, + context=context, + participant=participant, + error_code=getattr(error, "error_code", None), + is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)), + ) + ) - @abc.abstractmethod async def process_audio( - self, pcm_data: PcmData, participant: Optional[Participant] = None, + self, + pcm_data: PcmData, + participant: Optional[Participant] = None, + ): + """ + Process audio with automatic metrics tracking. + + This method wraps the actual processing with metrics collection + and delegates to the _process_audio method that subclasses implement. + + Args: + pcm_data: Audio data to process + participant: Optional participant metadata + """ + with Timer(stt_latency_ms) as timer: + # Use fully qualified class path for better identification + timer.attributes["stt_class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = self.provider_name + timer.attributes["sample_rate"] = pcm_data.sample_rate + timer.attributes["channels"] = pcm_data.channels + timer.attributes["samples"] = ( + len(pcm_data.samples) if pcm_data.samples is not None else 0 + ) + timer.attributes["duration_ms"] = pcm_data.duration_ms + + try: + await self._process_audio(pcm_data, participant) + except Exception as e: + timer.attributes["error"] = type(e).__name__ + raise + + @abc.abstractmethod + async def _process_audio( + self, + pcm_data: PcmData, + participant: Optional[Participant] = None, ): + """ + Process audio data and emit transcription events. + + Subclasses must implement this method to perform the actual STT processing. + The base class handles metrics collection automatically. 
+ + Args: + pcm_data: Audio data to process + participant: Optional participant metadata + """ pass async def start(self): diff --git a/agents-core/vision_agents/core/tts/tts.py b/agents-core/vision_agents/core/tts/tts.py index cf4761b7..dd28f83f 100644 --- a/agents-core/vision_agents/core/tts/tts.py +++ b/agents-core/vision_agents/core/tts/tts.py @@ -338,9 +338,9 @@ async def send( raise finally: elapsed_ms = (time.time() - start_time) * 1000.0 - tts_latency_ms.record( - elapsed_ms, attributes={"tts_class": self.__class__.__name__} - ) + # Use fully qualified class path for better identification + class_path = f"{self.__class__.__module__}.{self.__class__.__qualname__}" + tts_latency_ms.record(elapsed_ms, attributes={"tts_class": class_path}) async def close(self): """Close the TTS service and release any resources.""" diff --git a/observability/.gitignore b/observability/.gitignore new file mode 100644 index 00000000..d577e73b --- /dev/null +++ b/observability/.gitignore @@ -0,0 +1,4 @@ +# Ignore Docker volume data +data/ +*.tmp +*.log diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 00000000..56414221 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,173 @@ +# Vision Agents Observability Stack + +This directory contains the complete observability setup for Vision Agents, including: +- **Prometheus** for metrics collection +- **Jaeger** for distributed tracing +- **Grafana** for visualization with pre-configured dashboards + +## Quick Start + +### 1. Start the Observability Stack + +From the root of the Vision Agents repository: + +```bash +docker-compose up -d +``` + +This will start: +- **Jaeger UI**: http://localhost:16686 +- **Prometheus UI**: http://localhost:9090 +- **Grafana**: http://localhost:3000 (admin/admin) + +### 2. Run Your Vision Agents Application + +The example in `examples/01_simple_agent_example/simple_agent_example.py` already includes the `setup_telemetry()` function that: +- Exports traces to Jaeger (OTLP on port 4317) +- Exposes Prometheus metrics on port 9464 + +Run the example: + +```bash +cd examples/01_simple_agent_example +uv run python simple_agent_example.py +``` + +### 3. View Metrics in Grafana + +1. Open Grafana: http://localhost:3000 +2. Login with `admin` / `admin` +3. Navigate to **Dashboards** → **Vision Agents - Performance Metrics** + +The dashboard automatically displays: +- **STT Latency** (p50, p95, p99) by implementation +- **STT Errors** rate by provider and error type +- **TTS Latency** (p50, p95, p99) by implementation +- **TTS Errors** rate by provider and error type +- **Turn Detection Latency** (p50, p95, p99) by implementation + +### 4. View Traces in Jaeger + +1. Open Jaeger: http://localhost:16686 +2. Select service: `agents` +3. 
Click **Find Traces** to see distributed traces + +## Architecture + +### Metrics Flow + +``` +Vision Agents App (port 9464) + ↓ (scrape every 5s) +Prometheus (port 9090) + ↓ (datasource) +Grafana (port 3000) +``` + +### Traces Flow + +``` +Vision Agents App + ↓ (OTLP gRPC on port 4317) +Jaeger Collector + ↓ +Jaeger UI (port 16686) +``` + +## Available Metrics + +### STT Metrics +- `stt_latency_ms` - Histogram of STT processing latency + - Labels: `stt_class`, `provider`, `sample_rate`, `channels`, `samples`, `duration_ms` +- `stt_errors` - Counter of STT errors + - Labels: `provider`, `error_type` + +### TTS Metrics +- `tts_latency_ms` - Histogram of TTS synthesis latency + - Labels: `tts_class` +- `tts_errors` - Counter of TTS errors + - Labels: `provider`, `error_type` + +### Turn Detection Metrics +- `turn_detection_latency_ms` - Histogram of turn detection latency + - Labels: `class` + +## Configuration + +### Prometheus + +Edit `prometheus/prometheus.yml` to: +- Change scrape interval +- Add additional scrape targets +- Configure alerting rules + +### Grafana + +#### Add Custom Dashboards + +1. Place JSON dashboard files in `grafana/dashboards/` +2. They will be automatically loaded on startup + +#### Modify Datasources + +Edit `grafana/provisioning/datasources/prometheus.yml` + +### Jaeger + +Jaeger is configured with default settings. To customize, modify the `jaeger` service in `docker-compose.yml`. + +## Troubleshooting + +### Prometheus Can't Scrape Metrics + +**Issue**: Prometheus shows target as "down" + +**Solution**: Ensure `host.docker.internal` resolves correctly: +- **Linux**: Add `--add-host=host.docker.internal:host-gateway` to the prometheus service in docker-compose.yml +- **Mac/Windows**: Should work by default + +### No Data in Grafana + +1. Check Prometheus is scraping: http://localhost:9090/targets +2. Verify metrics are exposed: http://localhost:9464/metrics +3. Ensure your Vision Agents app is running with telemetry enabled + +### Jaeger Shows No Traces + +1. Verify OTLP receiver is running: `docker logs vision-agents-jaeger` +2. Check your app's trace exporter configuration +3. Ensure `endpoint="localhost:4317"` in your app + +## Stopping the Stack + +```bash +docker-compose down +``` + +To remove all data (metrics, dashboards, etc.): + +```bash +docker-compose down -v +``` + +## Production Considerations + +This setup is designed for development. For production: + +1. **Security**: + - Change default Grafana password + - Add authentication to Prometheus + - Use TLS for all connections + +2. **Persistence**: + - Configure external volumes for data persistence + - Set up regular backups + +3. **Scalability**: + - Use Prometheus remote write for long-term storage + - Consider Jaeger production deployment with Elasticsearch/Cassandra + - Deploy Grafana with a proper database backend + +4. **Monitoring**: + - Set up alerts in Prometheus/Grafana + - Configure notification channels (Slack, PagerDuty, etc.) 
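+
+## Example: Application-Side Telemetry Setup
+
+The stack above assumes the application exports OTLP traces to `localhost:4317` and serves Prometheus metrics on port `9464`. The sketch below shows one way to wire that up with the OpenTelemetry SDK; it mirrors what `setup_telemetry()` in the simple agent example is described to do, but the helper body here is illustrative rather than copied from the repository.
+
+```python
+from opentelemetry import metrics, trace
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.exporter.prometheus import PrometheusMetricReader
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from prometheus_client import start_http_server
+
+
+def setup_telemetry(service_name: str = "agents") -> None:
+    resource = Resource.create({"service.name": service_name})
+
+    # Traces -> Jaeger via OTLP gRPC (collector listens on port 4317).
+    tracer_provider = TracerProvider(resource=resource)
+    tracer_provider.add_span_processor(
+        BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True))
+    )
+    trace.set_tracer_provider(tracer_provider)
+
+    # Metrics -> Prometheus scrape endpoint on port 9464 (matches prometheus.yml).
+    start_http_server(port=9464)
+    metrics.set_meter_provider(
+        MeterProvider(resource=resource, metric_readers=[PrometheusMetricReader()])
+    )
+```
+
+Configure these providers before importing vision-agents components; if no providers are set, the library's meters and tracers fall back to no-ops.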
diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json new file mode 100644 index 00000000..3cc9e1d2 --- /dev/null +++ b/observability/grafana/dashboards/vision-agents.json @@ -0,0 +1,557 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p50 - {{stt_class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p95 - {{stt_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p99 - {{stt_class}}", + "refId": "C" + } + ], + "title": "STT Latency (by implementation)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(stt_errors_total[5m])", + "legendFormat": "{{provider}} - {{error_type}}", + "refId": "A" + } + ], + "title": "STT Errors Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p50 - {{tts_class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p95 - {{tts_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p99 - {{tts_class}}", + "refId": "C" + } + ], + "title": "TTS Latency (by implementation)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 
12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(tts_errors_total[5m])", + "legendFormat": "{{provider}} - {{error_type}}", + "refId": "A" + } + ], + "title": "TTS Errors Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p50 - {{class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p95 - {{class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p99 - {{class}}", + "refId": "C" + } + ], + "title": "Turn Detection Latency (by implementation)", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["vision-agents", "observability"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Vision Agents - Performance Metrics", + "uid": "vision-agents-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/grafana/init-home-dashboard.sh b/observability/grafana/init-home-dashboard.sh new file mode 100755 index 00000000..cad54dda --- /dev/null +++ b/observability/grafana/init-home-dashboard.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Wait for Grafana to be ready +echo "Waiting for Grafana to be ready..." +until curl -s http://grafana:3000/api/health > /dev/null 2>&1; do + echo "Grafana not ready yet, waiting..." + sleep 2 +done + +echo "Grafana is ready!" 
+sleep 5 # Give it a bit more time for provisioning to complete + +# Get the dashboard UID +DASHBOARD_UID="vision-agents-metrics" + +# Set the home dashboard for the organization +echo "Setting org home dashboard to Vision Agents - Performance Metrics..." +curl -X PUT \ + -H "Content-Type: application/json" \ + -d "{\"homeDashboardUID\":\"${DASHBOARD_UID}\"}" \ + http://grafana:3000/api/org/preferences + +# Also set it as the default home dashboard for admin user (for when they log in) +echo "" +echo "Setting admin user home dashboard..." +curl -X PUT \ + -u "admin:admin" \ + -H "Content-Type: application/json" \ + -d "{\"homeDashboardUID\":\"${DASHBOARD_UID}\"}" \ + http://grafana:3000/api/user/preferences + +echo "" +echo "Home dashboard configured successfully!" diff --git a/observability/grafana/provisioning/dashboards/default.yml b/observability/grafana/provisioning/dashboards/default.yml new file mode 100644 index 00000000..ed949c18 --- /dev/null +++ b/observability/grafana/provisioning/dashboards/default.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Vision Agents' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true diff --git a/observability/grafana/provisioning/datasources/prometheus.yml b/observability/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..cfd90598 --- /dev/null +++ b/observability/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + uid: prometheus + isDefault: true + editable: true + jsonData: + timeInterval: 5s diff --git a/observability/prometheus/prometheus.yml b/observability/prometheus/prometheus.yml new file mode 100644 index 00000000..83a2693a --- /dev/null +++ b/observability/prometheus/prometheus.yml @@ -0,0 +1,21 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'vision-agents-monitor' + +scrape_configs: + # Scrape metrics from Vision Agents application + - job_name: 'vision-agents' + static_configs: + - targets: ['host.docker.internal:9464'] + labels: + service: 'vision-agents' + environment: 'development' + scrape_interval: 5s + scrape_timeout: 5s + + # Scrape Prometheus self-metrics + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py b/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py index 0c598f34..b5da2b80 100644 --- a/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py +++ b/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py @@ -75,7 +75,7 @@ def __init__( self._connection_context: Optional[Any] = None self._listen_task: Optional[asyncio.Task[Any]] = None - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional[Participant] = None, @@ -127,20 +127,19 @@ async def start(self): "encoding": "linear16", "sample_rate": "16000", } - + # Add optional parameters if specified if self.eot_threshold is not None: connect_params["eot_threshold"] = str(self.eot_threshold) if self.eager_eot_threshold is not None: connect_params["eager_eot_threshold"] = str(self.eager_eot_threshold) - + # Connect to Deepgram v2 listen WebSocket with timeout self._connection_context = self.client.listen.v2.connect(**connect_params) - + # Add 
timeout for connection establishment self.connection = await asyncio.wait_for( - self._connection_context.__aenter__(), - timeout=10.0 + self._connection_context.__aenter__(), timeout=10.0 ) # Register event handlers @@ -149,7 +148,7 @@ async def start(self): self.connection.on(EventType.MESSAGE, self._on_message) self.connection.on(EventType.ERROR, self._on_error) self.connection.on(EventType.CLOSE, self._on_close) - + # Start listening for events self._listen_task = asyncio.create_task(self.connection.start_listening()) @@ -159,7 +158,7 @@ async def start(self): def _on_message(self, message): """ Event handler for messages from Deepgram. - + Args: message: The message object from Deepgram """ @@ -189,7 +188,9 @@ def _on_message(self, message): words = getattr(message, "words", []) if words: confidences = [w.confidence for w in words if hasattr(w, "confidence")] - avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0 + avg_confidence = ( + sum(confidences) / len(confidences) if confidences else 0.0 + ) else: avg_confidence = 0.0 @@ -207,7 +208,7 @@ def _on_message(self, message): "end_of_turn_confidence": end_of_turn_confidence, "turn_index": getattr(message, "turn_index", None), "event": event, - } + }, ) # Use the participant from the most recent process_audio call @@ -234,7 +235,7 @@ def _on_open(self, message): def _on_error(self, error): """ Event handler for errors from Deepgram. - + Args: error: The error from Deepgram """ diff --git a/plugins/fish/vision_agents/plugins/fish/stt.py b/plugins/fish/vision_agents/plugins/fish/stt.py index 7f3ae589..712b6ed0 100644 --- a/plugins/fish/vision_agents/plugins/fish/stt.py +++ b/plugins/fish/vision_agents/plugins/fish/stt.py @@ -49,7 +49,7 @@ def __init__( self.language = language - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional[Participant] = None, @@ -125,7 +125,9 @@ async def process_audio( ) if participant is not None: - self._emit_transcript_event(transcript_text, participant, response_metadata) + self._emit_transcript_event( + transcript_text, participant, response_metadata + ) except Exception as e: logger.error( diff --git a/plugins/wizper/vision_agents/plugins/wizper/stt.py b/plugins/wizper/vision_agents/plugins/wizper/stt.py index a0bd7c2f..50d5d65b 100644 --- a/plugins/wizper/vision_agents/plugins/wizper/stt.py +++ b/plugins/wizper/vision_agents/plugins/wizper/stt.py @@ -57,7 +57,7 @@ def __init__( self.target_language = target_language self._fal_client = client if client is not None else fal_client.AsyncClient() - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional["Participant"] = None, From a8b0271392fb9b374fb62cbf2b4e2d065a151eea Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 16:29:53 +0100 Subject: [PATCH 03/11] llm metrics --- agents-core/vision_agents/core/llm/llm.py | 225 ++++++++----- .../core/observability/metrics.py | 14 + .../core/turn_detection/turn_detection.py | 18 +- observability/README.md | 10 +- .../grafana/dashboards/vision-agents.json | 317 ++++++++++++++++-- .../plugins/anthropic/anthropic_llm.py | 247 +++++++++----- .../aws/vision_agents/plugins/aws/aws_llm.py | 299 ++++++++++------- .../plugins/gemini/gemini_llm.py | 227 ++++++++----- .../plugins/openai/openai_llm.py | 3 +- .../plugins/openrouter/openrouter_llm.py | 7 +- .../smart_turn/smart_turn_detection.py | 24 +- .../plugins/vogent/vogent_turn_detection.py | 23 +- 
plugins/xai/vision_agents/plugins/xai/llm.py | 56 ++-- 13 files changed, 1000 insertions(+), 470 deletions(-) diff --git a/agents-core/vision_agents/core/llm/llm.py b/agents-core/vision_agents/core/llm/llm.py index a699218b..1b42cd50 100644 --- a/agents-core/vision_agents/core/llm/llm.py +++ b/agents-core/vision_agents/core/llm/llm.py @@ -3,7 +3,17 @@ import abc import asyncio import json -from typing import Optional, TYPE_CHECKING, Tuple, List, Dict, Any, TypeVar, Callable, Generic +from typing import ( + Optional, + TYPE_CHECKING, + Tuple, + List, + Dict, + Any, + TypeVar, + Callable, + Generic, +) from vision_agents.core.llm import events from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent @@ -56,6 +66,35 @@ async def simple_response( processors: Optional[List[Processor]] = None, participant: Optional[Participant] = None, ) -> LLMResponseEvent[Any]: + """ + Wrapper method that tracks metrics and delegates to _simple_response. + """ + from vision_agents.core.observability.metrics import Timer, llm_latency_ms + + with Timer(llm_latency_ms) as timer: + timer.attributes["llm_class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = getattr(self, "provider_name", "unknown") + + try: + result = await self._simple_response(text, processors, participant) + return result + except Exception as e: + timer.attributes["error"] = type(e).__name__ + raise + + @abc.abstractmethod + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ) -> LLMResponseEvent[Any]: + """ + Implementation-specific response generation. + Subclasses must implement this method. + """ raise NotImplementedError def _build_enhanced_instructions(self) -> Optional[str]: @@ -65,7 +104,7 @@ def _build_enhanced_instructions(self) -> Optional[str]: Returns: Enhanced instructions string with markdown file contents included, or None if no parsed instructions """ - if not hasattr(self, 'parsed_instructions') or not self.parsed_instructions: + if not hasattr(self, "parsed_instructions") or not self.parsed_instructions: return None parsed = self.parsed_instructions @@ -80,7 +119,9 @@ def _build_enhanced_instructions(self) -> Optional[str]: enhanced_instructions.append(content) else: enhanced_instructions.append(f"\n### {filename}") - enhanced_instructions.append("*(File not found or could not be read)*") + enhanced_instructions.append( + "*(File not found or could not be read)*" + ) return "\n".join(enhanced_instructions) @@ -88,64 +129,72 @@ def _get_tools_for_provider(self) -> List[Dict[str, Any]]: """ Get tools in provider-specific format. This method should be overridden by each LLM implementation. - + Returns: List of tools in the provider's expected format. """ tools = self.get_available_functions() return self._convert_tools_to_provider_format(tools) - - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to provider-specific format. This method should be overridden by each LLM implementation. 
- + Args: tools: List of ToolSchema objects - + Returns: List of tools in provider-specific format """ # Default implementation - should be overridden return [] - - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from provider-specific response. This method should be overridden by each LLM implementation. - + Args: response: Provider-specific response object - + Returns: List of normalized tool call items """ # Default implementation - should be overridden return [] - - def _extract_tool_calls_from_stream_chunk(self, chunk: Any) -> List[NormalizedToolCallItem]: + + def _extract_tool_calls_from_stream_chunk( + self, chunk: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from a streaming chunk. This method should be overridden by each LLM implementation. - + Args: chunk: Provider-specific streaming chunk - + Returns: List of normalized tool call items """ # Default implementation - should be overridden return [] - - def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]) -> List[Dict[str, Any]]: + + def _create_tool_result_message( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ) -> List[Dict[str, Any]]: """ Create tool result messages for the provider. This method should be overridden by each LLM implementation. - + Args: tool_calls: List of tool calls that were executed results: List of results from function execution - + Returns: List of tool result messages in provider format """ @@ -160,67 +209,67 @@ def _attach_agent(self, agent: Agent): self._conversation = agent.conversation self._set_instructions(agent.instructions) - def _set_instructions(self, instructions: str): self.instructions = instructions # Parse instructions to extract @ mentioned markdown files self.parsed_instructions = parse_instructions(instructions) - def register_function(self, - name: Optional[str] = None, - description: Optional[str] = None) -> Callable: + def register_function( + self, name: Optional[str] = None, description: Optional[str] = None + ) -> Callable: """ Decorator to register a function with the LLM's function registry. - + Args: name: Optional custom name for the function. If not provided, uses the function name. description: Optional description for the function. If not provided, uses the docstring. - + Returns: Decorator function. """ return self.function_registry.register(name, description) - + def get_available_functions(self) -> List[ToolSchema]: """Get a list of available function schemas.""" return self.function_registry.get_tool_schemas() - + def call_function(self, name: str, arguments: Dict[str, Any]) -> Any: """ Call a registered function with the given arguments. - + Args: name: Name of the function to call. arguments: Dictionary of arguments to pass to the function. - + Returns: Result of the function call. """ return self.function_registry.call_function(name, arguments) - def _tc_key(self, tc: Dict[str, Any]) -> Tuple[Optional[str], str, str]: """Generate a unique key for tool call deduplication. 
- + Args: tc: Tool call dictionary - + Returns: Tuple of (id, name, arguments_json) for deduplication """ return ( - tc.get("id"), - tc["name"], - json.dumps(tc.get("arguments_json", tc.get("arguments", {})), sort_keys=True) + tc.get("id"), + tc["name"], + json.dumps( + tc.get("arguments_json", tc.get("arguments", {})), sort_keys=True + ), ) async def _maybe_await(self, x): """Await if x is a coroutine, otherwise return x directly. - + Args: x: Value that might be a coroutine - + Returns: Awaited result if coroutine, otherwise x """ @@ -230,23 +279,23 @@ async def _maybe_await(self, x): async def _run_one_tool(self, tc: Dict[str, Any], timeout_s: float): """Run a single tool call with timeout. - + Args: tc: Tool call dictionary timeout_s: Timeout in seconds - + Returns: Tuple of (tool_call, result, error) """ import inspect import time - + args = tc.get("arguments_json", tc.get("arguments", {})) or {} start_time = time.time() - + async def _invoke(): # Get the actual function to check if it's async - if hasattr(self.function_registry, 'get_callable'): + if hasattr(self.function_registry, "get_callable"): fn = self.function_registry.get_callable(tc["name"]) if inspect.iscoroutinefunction(fn): return await fn(**args) @@ -257,62 +306,74 @@ async def _invoke(): # Fallback to existing call_function method res = self.call_function(tc["name"], args) return await self._maybe_await(res) - + try: # Send tool start event - self.events.send(ToolStartEvent( - plugin_name="llm", - tool_name=tc["name"], - arguments=args, - tool_call_id=tc.get("id") - )) - + self.events.send( + ToolStartEvent( + plugin_name="llm", + tool_name=tc["name"], + arguments=args, + tool_call_id=tc.get("id"), + ) + ) + res = await asyncio.wait_for(_invoke(), timeout=timeout_s) execution_time = (time.time() - start_time) * 1000 - + # Send tool end event (success) - self.events.send(ToolEndEvent( - plugin_name="llm", - tool_name=tc["name"], - success=True, - result=res, - tool_call_id=tc.get("id"), - execution_time_ms=execution_time - )) - + self.events.send( + ToolEndEvent( + plugin_name="llm", + tool_name=tc["name"], + success=True, + result=res, + tool_call_id=tc.get("id"), + execution_time_ms=execution_time, + ) + ) + return tc, res, None except Exception as e: execution_time = (time.time() - start_time) * 1000 - + # Send tool end event (error) - self.events.send(ToolEndEvent( - plugin_name="llm", - tool_name=tc["name"], - success=False, - error=str(e), - tool_call_id=tc.get("id"), - execution_time_ms=execution_time - )) - + self.events.send( + ToolEndEvent( + plugin_name="llm", + tool_name=tc["name"], + success=False, + error=str(e), + tool_call_id=tc.get("id"), + execution_time_ms=execution_time, + ) + ) + return tc, {"error": str(e)}, e - async def _execute_tools(self, calls: List[Dict[str, Any]], *, max_concurrency: int = 8, timeout_s: float = 30): + async def _execute_tools( + self, + calls: List[Dict[str, Any]], + *, + max_concurrency: int = 8, + timeout_s: float = 30, + ): """Execute multiple tool calls concurrently with timeout. 
- + Args: calls: List of tool call dictionaries max_concurrency: Maximum number of concurrent tool executions timeout_s: Timeout per tool execution in seconds - + Returns: List of tuples (tool_call, result, error) """ sem = asyncio.Semaphore(max_concurrency) - + async def _guarded(tc): async with sem: return await self._run_one_tool(tc, timeout_s) - + return await asyncio.gather(*[_guarded(tc) for tc in calls]) async def _dedup_and_execute( @@ -324,13 +385,13 @@ async def _dedup_and_execute( seen: Optional[set] = None, ): """De-duplicate (by id/name/args) then execute concurrently. - + Args: calls: List of tool call dictionaries max_concurrency: Maximum number of concurrent tool executions timeout_s: Timeout per tool execution in seconds seen: Set of seen tool call keys for deduplication - + Returns: Tuple of (triples, updated_seen_set) """ @@ -346,16 +407,18 @@ async def _dedup_and_execute( if not to_run: return [], seen # nothing new - triples = await self._execute_tools(to_run, max_concurrency=max_concurrency, timeout_s=timeout_s) + triples = await self._execute_tools( + to_run, max_concurrency=max_concurrency, timeout_s=timeout_s + ) return triples, seen def _sanitize_tool_output(self, value: Any, max_chars: int = 60_000) -> str: """Sanitize tool output to prevent oversized responses. - + Args: value: Tool output value max_chars: Maximum characters allowed - + Returns: Sanitized string output """ diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index c9aad222..b39ccd2a 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -43,6 +43,20 @@ "turn.detection.latency.ms", unit="ms", ) +turn_vad_latency_ms = meter.create_histogram( + "turn.vad.latency.ms", unit="ms", description="Turn detection VAD latency" +) +turn_end_detection_latency_ms = meter.create_histogram( + "turn.end_detection.latency.ms", + unit="ms", + description="Turn end detection latency (Vogent/Smart Turn model)", +) +turn_errors = meter.create_counter("turn.errors", description="Turn detection errors") + +llm_latency_ms = meter.create_histogram( + "llm.latency.ms", unit="ms", description="Total LLM latency" +) +llm_errors = meter.create_counter("llm.errors", description="LLM errors") class Timer: diff --git a/agents-core/vision_agents/core/turn_detection/turn_detection.py b/agents-core/vision_agents/core/turn_detection/turn_detection.py index 29105c99..faf14979 100644 --- a/agents-core/vision_agents/core/turn_detection/turn_detection.py +++ b/agents-core/vision_agents/core/turn_detection/turn_detection.py @@ -8,7 +8,7 @@ from .events import TurnStartedEvent, TurnEndedEvent from ..agents.conversation import Conversation from ..edge.types import Participant -from ..observability.metrics import turn_detection_latency_ms, Timer +from ..observability.metrics import turn_detection_latency_ms, turn_errors, Timer class TurnEvent(Enum): @@ -41,7 +41,6 @@ def _emit_end_turn_event(self, event: TurnEndedEvent) -> None: event.plugin_name = self.provider_name self.events.send(event) - @Timer(turn_detection_latency_ms) async def process_audio( self, audio_data: PcmData, @@ -55,8 +54,19 @@ async def process_audio( participant: Participant that's speaking, includes user data conversation: Transcription/ chat history, sometimes useful for turn detection """ - - return await self.detect_turn(audio_data, participant, conversation) + with Timer(turn_detection_latency_ms) as timer: + 
timer.attributes["class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = self.provider_name + try: + await self.detect_turn(audio_data, participant, conversation) + except Exception as e: + timer.attributes["error"] = type(e).__name__ + turn_errors.add( + 1, {"provider": self.provider_name, "error_type": type(e).__name__} + ) + raise @abstractmethod async def detect_turn( diff --git a/observability/README.md b/observability/README.md index 56414221..bcd44481 100644 --- a/observability/README.md +++ b/observability/README.md @@ -40,11 +40,11 @@ uv run python simple_agent_example.py 3. Navigate to **Dashboards** → **Vision Agents - Performance Metrics** The dashboard automatically displays: +- **LLM Latency** (p50, p95, p99) by implementation - **STT Latency** (p50, p95, p99) by implementation -- **STT Errors** rate by provider and error type - **TTS Latency** (p50, p95, p99) by implementation -- **TTS Errors** rate by provider and error type - **Turn Detection Latency** (p50, p95, p99) by implementation +- **All Errors Rate** - Combined view of LLM, STT, and TTS errors by provider and error type ### 4. View Traces in Jaeger @@ -92,6 +92,12 @@ Jaeger UI (port 16686) - `turn_detection_latency_ms` - Histogram of turn detection latency - Labels: `class` +### LLM Metrics +- `llm_latency_ms` - Histogram of LLM response latency + - Labels: `llm_class`, `provider` +- `llm_errors` - Counter of LLM errors + - Labels: `provider`, `error` + ## Configuration ### Prometheus diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json index 3cc9e1d2..05da23ef 100644 --- a/observability/grafana/dashboards/vision-agents.json +++ b/observability/grafana/dashboards/vision-agents.json @@ -106,8 +106,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p50 - {{stt_class}}", + "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p50 - {{llm_class}}", "refId": "A" }, { @@ -115,8 +115,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p95 - {{stt_class}}", + "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p95 - {{llm_class}}", "refId": "B" }, { @@ -124,12 +124,12 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p99 - {{stt_class}}", + "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p99 - {{llm_class}}", "refId": "C" } ], - "title": "STT Latency (by implementation)", + "title": "LLM Latency", "type": "timeseries" }, { @@ -186,7 +186,7 @@ } ] }, - "unit": "short" + "unit": "ms" }, "overrides": [] }, @@ -199,7 +199,7 @@ "id": 2, "options": { "legend": { - "calcs": ["sum"], + "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -216,12 +216,30 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(stt_errors_total[5m])", - "legendFormat": "{{provider}} - {{error_type}}", + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, 
stt_class))", + "legendFormat": "p50 - {{stt_class}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p95 - {{stt_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p99 - {{stt_class}}", + "refId": "C" } ], - "title": "STT Errors Rate", + "title": "STT Latency", "type": "timeseries" }, { @@ -331,7 +349,7 @@ "refId": "C" } ], - "title": "TTS Latency (by implementation)", + "title": "TTS Latency", "type": "timeseries" }, { @@ -388,7 +406,7 @@ } ] }, - "unit": "short" + "unit": "ms" }, "overrides": [] }, @@ -401,7 +419,7 @@ "id": 4, "options": { "legend": { - "calcs": ["sum"], + "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -418,12 +436,30 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(tts_errors_total[5m])", - "legendFormat": "{{provider}} - {{error_type}}", + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p50 - {{provider}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p95 - {{provider}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p99 - {{provider}}", + "refId": "C" } ], - "title": "TTS Errors Rate", + "title": "Turn Detection Latency", "type": "timeseries" }, { @@ -486,7 +522,7 @@ }, "gridPos": { "h": 8, - "w": 24, + "w": 8, "x": 0, "y": 16 }, @@ -510,8 +546,118 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p50 - {{class}}", + "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p50 - {{implementation}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p95 - {{implementation}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p99 - {{implementation}}", + "refId": "C" + } + ], + "title": "Turn VAD Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p50 - {{implementation}}", "refId": "A" }, { @@ -519,8 +665,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p95 - {{class}}", + "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p95 - {{implementation}}", "refId": "B" }, { @@ -528,12 +674,131 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p99 - {{class}}", + "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p99 - {{implementation}}", "refId": "C" } ], - "title": "Turn Detection Latency (by implementation)", + "title": "Turn End Detection Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(llm_errors_total[5m])", + "legendFormat": "LLM - {{provider}} - {{error}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(stt_errors_total[5m])", + "legendFormat": "STT - {{provider}} - {{error_type}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "expr": "rate(tts_errors_total[5m])", + "legendFormat": "TTS - {{provider}} - {{error_type}}", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(turn_errors_total[5m])", + "legendFormat": "TURN - {{provider}} - {{error_type}}", + "refId": "D" + } + ], + "title": "All Errors Rate", "type": "timeseries" } ], diff --git a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py index 30691576..1364d5d6 100644 --- a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py +++ b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py @@ -14,7 +14,10 @@ from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from vision_agents.core.processors import Processor from . import events @@ -59,14 +62,17 @@ def __init__( super().__init__() self.events.register_events_from_module(events) self.model = model - self._pending_tool_uses_by_index: Dict[int, Dict[str, Any]] = {} # index -> {id, name, parts: []} + self._pending_tool_uses_by_index: Dict[ + int, Dict[str, Any] + ] = {} # index -> {id, name, parts: []} + self.provider_name = "anthropic" if client is not None: self.client = client else: self.client = anthropic.AsyncAnthropic(api_key=api_key) - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -107,7 +113,7 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # ensure the AI remembers the past conversation new_messages = kwargs["messages"] - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages # Add messages to conversation @@ -122,7 +128,7 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Extract text from Claude's response format - safely handle all text blocks text = self._concat_text_blocks(original.content) llm_response = LLMResponseEvent(original, text) - + # Multi-hop tool calling loop for non-streaming function_calls = self._extract_tool_calls_from_response(original) if function_calls: @@ -131,39 +137,53 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: rounds = 0 seen: set[tuple[str, str, str]] = set() current_calls = function_calls - + while current_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(current_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool_result user message assistant_content = [] tool_result_blocks = [] for tc, res, err in triples: - assistant_content.append({ - "type": "tool_use", - "id": tc["id"], - "name": tc["name"], - "input": tc["arguments_json"], - }) - + assistant_content.append( + { + "type": "tool_use", + "id": tc["id"], + "name": tc["name"], + "input": tc["arguments_json"], + } + ) + payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "type": 
"tool_result", - "tool_use_id": tc["id"], - "content": payload, - }) + tool_result_blocks.append( + { + "type": "tool_result", + "tool_use_id": tc["id"], + "content": payload, + } + ) assistant_msg = {"role": "assistant", "content": assistant_content} - user_tool_results_msg = {"role": "user", "content": tool_result_blocks} + user_tool_results_msg = { + "role": "user", + "content": tool_result_blocks, + } messages = messages + [assistant_msg, user_tool_results_msg] # Ask again WITH tools so Claude can do another hop tools_cfg = { - "tools": self._convert_tools_to_provider_format(self.get_available_functions()), + "tools": self._convert_tools_to_provider_format( + self.get_available_functions() + ), "tool_choice": {"type": "auto"}, "stream": False, "model": self.model, @@ -172,22 +192,29 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: } follow_up_response = await self.client.messages.create(**tools_cfg) - + # Extract new tool calls from follow-up response - current_calls = self._extract_tool_calls_from_response(follow_up_response) - llm_response = LLMResponseEvent(follow_up_response, self._concat_text_blocks(follow_up_response.content)) + current_calls = self._extract_tool_calls_from_response( + follow_up_response + ) + llm_response = LLMResponseEvent( + follow_up_response, + self._concat_text_blocks(follow_up_response.content), + ) rounds += 1 - + # Finalization pass: no tools so Claude must answer in text if current_calls or rounds > 0: # Only if we had tool calls final_response = await self.client.messages.create( model=self.model, - messages=messages, # includes assistant tool_use + user tool_result blocks + messages=messages, # includes assistant tool_use + user tool_result blocks stream=False, - max_tokens=1000 + max_tokens=1000, + ) + llm_response = LLMResponseEvent( + final_response, self._concat_text_blocks(final_response.content) ) - llm_response = LLMResponseEvent(final_response, self._concat_text_blocks(final_response.content)) - + elif isinstance(original, AsyncStream): stream: AsyncStream[RawMessageStreamEvent] = original text_parts: List[str] = [] @@ -195,7 +222,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # 1) First round: read stream, gather initial tool_use calls async for event in stream: - llm_response_optional = self._standardize_and_emit_event(event, text_parts) + llm_response_optional = self._standardize_and_emit_event( + event, text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional # Collect tool_use calls as they complete (your helper already reconstructs args) @@ -213,7 +242,12 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: last_followup_stream = None while accumulated_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(accumulated_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] + triples, seen = await self._dedup_and_execute( + accumulated_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) # Build tool_result user message # Also reconstruct the assistant tool_use message that triggered these calls @@ -221,22 +255,26 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: executed_calls: List[NormalizedToolCallItem] = [] for tc, res, err in triples: executed_calls.append(tc) - assistant_content.append({ - "type": "tool_use", - "id": tc["id"], - "name": tc["name"], - "input": 
tc["arguments_json"], - }) + assistant_content.append( + { + "type": "tool_use", + "id": tc["id"], + "name": tc["name"], + "input": tc["arguments_json"], + } + ) # tool_result blocks (sanitize to keep payloads safe) tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "type": "tool_result", - "tool_use_id": tc["id"], - "content": payload, - }) + tool_result_blocks.append( + { + "type": "tool_result", + "tool_use_id": tc["id"], + "content": payload, + } + ) assistant_msg = {"role": "assistant", "content": assistant_content} user_tool_results_msg = {"role": "user", "content": tool_result_blocks} @@ -244,7 +282,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ask again WITH tools so Claude can do another hop tools_cfg = { - "tools": self._convert_tools_to_provider_format(self.get_available_functions()), + "tools": self._convert_tools_to_provider_format( + self.get_available_functions() + ), "tool_choice": {"type": "auto"}, "stream": True, "model": self.model, @@ -259,7 +299,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: accumulated_calls = [] # reset; we'll refill with new calls async for ev in follow_up_stream: last_followup_stream = ev - llm_response_optional = self._standardize_and_emit_event(ev, follow_up_text_parts) + llm_response_optional = self._standardize_and_emit_event( + ev, follow_up_text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional new_calls, _ = self._extract_tool_calls_from_stream_chunk(ev, None) @@ -276,14 +318,16 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: if accumulated_calls or rounds > 0: # Only if we had tool calls final_stream = await self.client.messages.create( model=self.model, - messages=messages, # includes assistant tool_use + user tool_result blocks + messages=messages, # includes assistant tool_use + user tool_result blocks stream=True, - max_tokens=1000 + max_tokens=1000, ) final_text_parts: List[str] = [] async for ev in final_stream: last_followup_stream = ev - llm_response_optional = self._standardize_and_emit_event(ev, final_text_parts) + llm_response_optional = self._standardize_and_emit_event( + ev, final_text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional if final_text_parts: @@ -291,8 +335,17 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # 4) Done -> return all collected text total_text = "".join(text_parts) - llm_response = LLMResponseEvent(last_followup_stream or original, total_text) # type: ignore - self.events.send(LLMResponseCompletedEvent(original=last_followup_stream or original, text=total_text, plugin_name="anthropic")) + llm_response = LLMResponseEvent( + last_followup_stream or original, # type: ignore[arg-type] + total_text, + ) + self.events.send( + LLMResponseCompletedEvent( + original=last_followup_stream or original, + text=total_text, + plugin_name="anthropic", + ) + ) return llm_response @@ -303,10 +356,9 @@ def _standardize_and_emit_event( Forwards the events and also send out a standardized version (the agent class hooks into that) """ # forward the native event - self.events.send(events.ClaudeStreamEvent( - plugin_name="anthropic", - event_data=event - )) + self.events.send( + events.ClaudeStreamEvent(plugin_name="anthropic", event_data=event) + ) # send a standardized version for delta and response if event.type == "content_block_delta": @@ -314,14 +366,16 
@@ def _standardize_and_emit_event( if hasattr(delta_event.delta, "text") and delta_event.delta.text: text_parts.append(delta_event.delta.text) - self.events.send(LLMResponseChunkEvent( - plugin_name="antrhopic", - content_index=delta_event.index, - item_id="", - output_index=0, - sequence_number=0, - delta=delta_event.delta.text, - )) + self.events.send( + LLMResponseChunkEvent( + plugin_name="antrhopic", + content_index=delta_event.index, + item_id="", + output_index=0, + sequence_number=0, + delta=delta_event.delta.text, + ) + ) elif event.type == "message_stop": stop_event: RawMessageStopEvent = event total_text = "".join(text_parts) @@ -354,13 +408,15 @@ def _normalize_message(claude_messages: Any) -> List["Message"]: return messages - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to Anthropic format. - + Args: tools: List of ToolSchema objects - + Returns: List of tools in Anthropic format """ @@ -369,37 +425,42 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic anthropic_tool = { "name": tool["name"], "description": tool.get("description", ""), - "input_schema": tool["parameters_schema"] + "input_schema": tool["parameters_schema"], } anthropic_tools.append(anthropic_tool) return anthropic_tools - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Anthropic response. - + Args: response: Anthropic response object - + Returns: List of normalized tool call items """ tool_calls = [] - - if hasattr(response, 'content') and response.content: + + if hasattr(response, "content") and response.content: for content_block in response.content: - if hasattr(content_block, 'type') and content_block.type == "tool_use": + if hasattr(content_block, "type") and content_block.type == "tool_use": tool_call: NormalizedToolCallItem = { "type": "tool_call", "id": content_block.id, # Critical: capture the id for tool_result "name": content_block.name, - "arguments_json": content_block.input or {} # normalize to arguments_json + "arguments_json": content_block.input + or {}, # normalize to arguments_json } tool_calls.append(tool_call) - + return tool_calls - def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: Optional[NormalizedToolCallItem] = None) -> tuple[List[NormalizedToolCallItem], Optional[NormalizedToolCallItem]]: # type: ignore[override] + def _extract_tool_calls_from_stream_chunk( # type: ignore[override] + self, chunk: Any, current_tool_call: Optional[NormalizedToolCallItem] = None + ) -> tuple[List[NormalizedToolCallItem], Optional[NormalizedToolCallItem]]: """ Extract tool calls from Anthropic streaming chunk using index-keyed accumulation. 
Args: @@ -409,22 +470,22 @@ def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: O Tuple of (completed tool calls, current tool call being accumulated) """ tool_calls = [] - t = getattr(chunk, 'type', None) + t = getattr(chunk, "type", None) if t == "content_block_start": - cb = getattr(chunk, 'content_block', None) - if getattr(cb, 'type', None) == "tool_use": + cb = getattr(chunk, "content_block", None) + if getattr(cb, "type", None) == "tool_use": if cb is not None: self._pending_tool_uses_by_index[chunk.index] = { "id": cb.id, "name": cb.name, - "parts": [] + "parts": [], } elif t == "content_block_delta": - d = getattr(chunk, 'delta', None) - if getattr(d, 'type', None) == "input_json_delta": - pj = getattr(d, 'partial_json', None) + d = getattr(chunk, "delta", None) + if getattr(d, "type", None) == "input_json_delta": + pj = getattr(d, "partial_json", None) if pj is not None and chunk.index in self._pending_tool_uses_by_index: self._pending_tool_uses_by_index[chunk.index]["parts"].append(pj) @@ -440,12 +501,14 @@ def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: O "type": "tool_call", "id": pending["id"], "name": pending["name"], - "arguments_json": args + "arguments_json": args, } tool_calls.append(tool_call_item) return tool_calls, None - def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]) -> List[Dict[str, Any]]: + def _create_tool_result_message( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ) -> List[Dict[str, Any]]: """ Create tool result messages for Anthropic. tool_calls: List of tool calls that were executed @@ -461,17 +524,19 @@ def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], payload = str(result) else: payload = json.dumps(result) - blocks.append({ - "type": "tool_result", - "tool_use_id": tool_call["id"], # Critical: must match tool_use.id - "content": payload - }) + blocks.append( + { + "type": "tool_result", + "tool_use_id": tool_call["id"], # Critical: must match tool_use.id + "content": payload, + } + ) return [{"role": "user", "content": blocks}] def _concat_text_blocks(self, content): """Safely extract text from all text blocks in content.""" out = [] for b in content or []: - if getattr(b, 'type', None) == "text" and getattr(b, 'text', None): + if getattr(b, "type", None) == "text" and getattr(b, "text", None): out.append(b.text) return "".join(out) diff --git a/plugins/aws/vision_agents/plugins/aws/aws_llm.py b/plugins/aws/vision_agents/plugins/aws/aws_llm.py index b347720d..0d504eaf 100644 --- a/plugins/aws/vision_agents/plugins/aws/aws_llm.py +++ b/plugins/aws/vision_agents/plugins/aws/aws_llm.py @@ -8,7 +8,10 @@ from vision_agents.core.llm.llm_types import ToolSchema, NormalizedToolCallItem -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from vision_agents.core.processors import Processor from . import events from vision_agents.core.edge.types import Participant @@ -25,9 +28,9 @@ class BedrockLLM(LLM): https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html Chat history has to be manually passed, there is no conversation storage. 
- + Examples: - + from vision_agents.plugins import aws llm = aws.LLM( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -45,7 +48,7 @@ def __init__( ): """ Initialize the BedrockLLM class. - + Args: model: The Bedrock model ID (e.g., "anthropic.claude-3-5-sonnet-20241022-v2:0") region_name: AWS region name (default: "us-east-1") @@ -57,7 +60,8 @@ def __init__( self.events.register_events_from_module(events) self.model = model self._pending_tool_uses_by_index: Dict[int, Dict[str, Any]] = {} - + self.provider_name = "aws" + # Initialize boto3 bedrock-runtime client session_kwargs = {"region_name": region_name} if aws_access_key_id: @@ -69,12 +73,12 @@ def __init__( if os.environ.get("AWS_BEDROCK_API_KEY"): session_kwargs["aws_session_token"] = os.environ["AWS_BEDROCK_API_KEY"] - + self.client = boto3.client("bedrock-runtime", **session_kwargs) self.region_name = region_name - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -82,14 +86,14 @@ async def simple_response( ): """ Simple response is a standardized way to create a response. - + Args: text: The text to respond to processors: list of processors (which contain state) about the video/voice AI participant: optionally the participant object - + Examples: - + await llm.simple_response("say hi to the user") """ return await self.converse_stream( @@ -118,7 +122,7 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ensure the AI remembers the past conversation new_messages = kwargs.get("messages", []) - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages # Add messages to conversation @@ -128,11 +132,11 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: try: response = self.client.converse(**kwargs) - + # Extract text from response text = self._extract_text_from_response(response) llm_response = LLMResponseEvent(response, text) - + # Handle tool calls if present function_calls = self._extract_tool_calls_from_response(response) if function_calls: @@ -141,22 +145,35 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: rounds = 0 seen: set[tuple[str, str, str]] = set() current_calls = function_calls - + while current_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(current_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool result message tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "toolUseId": tc["id"], - "content": [{"json": payload if isinstance(payload, dict) else {"result": payload}}], - }) + tool_result_blocks.append( + { + "toolUseId": tc["id"], + "content": [ + { + "json": payload + if isinstance(payload, dict) + else {"result": payload} + } + ], + } + ) # Add assistant message with tool use and user message with tool results assistant_msg = { @@ -170,11 +187,11 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: } } for tc, _, _ in triples - ] + ], } user_tool_results_msg = { "role": "user", - "content": [{"toolResult": tr} for tr in 
tool_result_blocks] + "content": [{"toolResult": tr} for tr in tool_result_blocks], } messages = messages + [assistant_msg, user_tool_results_msg] @@ -184,26 +201,37 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: messages=messages, toolConfig=kwargs.get("toolConfig", {}), ) - + # Extract new tool calls - current_calls = self._extract_tool_calls_from_response(follow_up_response) - llm_response = LLMResponseEvent(follow_up_response, self._extract_text_from_response(follow_up_response)) + current_calls = self._extract_tool_calls_from_response( + follow_up_response + ) + llm_response = LLMResponseEvent( + follow_up_response, + self._extract_text_from_response(follow_up_response), + ) rounds += 1 - + # Final pass without tools if current_calls or rounds > 0: final_response = self.client.converse( modelId=self.model, messages=messages, ) - llm_response = LLMResponseEvent(final_response, self._extract_text_from_response(final_response)) - - self.events.send(LLMResponseCompletedEvent(original=response, text=text, plugin_name="aws")) + llm_response = LLMResponseEvent( + final_response, self._extract_text_from_response(final_response) + ) + + self.events.send( + LLMResponseCompletedEvent( + original=response, text=text, plugin_name="aws" + ) + ) except ClientError as e: error_msg = f"AWS Bedrock API error: {str(e)}" - llm_response = LLMResponseEvent(None, error_msg, exception = e) - + llm_response = LLMResponseEvent(None, error_msg, exception=e) + return llm_response async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: @@ -222,7 +250,7 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ensure the AI remembers the past conversation new_messages = kwargs.get("messages", []) - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages normalized_messages = self._normalize_message(new_messages) @@ -236,37 +264,50 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: try: response = self.client.converse_stream(**kwargs) - stream = response.get('stream') - + stream = response.get("stream") + text_parts: List[str] = [] accumulated_calls: List[NormalizedToolCallItem] = [] last_event = None - + # Process stream for event in stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + # Handle multi-hop tool calling messages = kwargs["messages"][:] MAX_ROUNDS = 3 rounds = 0 seen: set[tuple[str, str, str]] = set() - + while accumulated_calls and rounds < MAX_ROUNDS: - triples, seen = await self._dedup_and_execute(accumulated_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + accumulated_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool result messages tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "toolUseId": tc["id"], - "content": [{"json": payload if isinstance(payload, dict) else {"result": payload}}], - }) + tool_result_blocks.append( + { + "toolUseId": tc["id"], + "content": [ + { + "json": payload + if isinstance(payload, dict) + else {"result": payload} + } + ], + } + ) assistant_msg = { "role": "assistant", @@ -279,11 +320,11 @@ async def converse_stream(self, *args, 
**kwargs) -> LLMResponseEvent[Any]: } } for tc, _, _ in triples - ] + ], } user_tool_results_msg = { "role": "user", - "content": [{"toolResult": tr} for tr in tool_result_blocks] + "content": [{"toolResult": tr} for tr in tool_result_blocks], } messages = messages + [assistant_msg, user_tool_results_msg] @@ -293,85 +334,90 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: messages=messages, toolConfig=kwargs.get("toolConfig", {}), ) - + accumulated_calls = [] - follow_up_stream = follow_up_response.get('stream') + follow_up_stream = follow_up_response.get("stream") for event in follow_up_stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + rounds += 1 - + # Final pass without tools if accumulated_calls or rounds > 0: final_response = self.client.converse_stream( modelId=self.model, messages=messages, ) - final_stream = final_response.get('stream') + final_stream = final_response.get("stream") for event in final_stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + total_text = "".join(text_parts) llm_response = LLMResponseEvent(last_event, total_text) - self.events.send(LLMResponseCompletedEvent(original=last_event, text=total_text, plugin_name="aws")) - + self.events.send( + LLMResponseCompletedEvent( + original=last_event, text=total_text, plugin_name="aws" + ) + ) + except ClientError as e: error_msg = f"AWS Bedrock streaming error: {str(e)}" llm_response = LLMResponseEvent(None, error_msg) - + return llm_response def _process_stream_event( - self, - event: Dict[str, Any], + self, + event: Dict[str, Any], text_parts: List[str], - accumulated_calls: List[NormalizedToolCallItem] + accumulated_calls: List[NormalizedToolCallItem], ): """Process a streaming event from AWS.""" # Forward the native event - self.events.send(events.AWSStreamEvent( - plugin_name="aws", - event_data=event - )) - + self.events.send(events.AWSStreamEvent(plugin_name="aws", event_data=event)) + # Handle content block delta (text) - if 'contentBlockDelta' in event: - delta = event['contentBlockDelta']['delta'] - if 'text' in delta: - text_parts.append(delta['text']) - self.events.send(LLMResponseChunkEvent( - plugin_name="aws", - content_index=event['contentBlockDelta'].get('contentBlockIndex', 0), - item_id="", - output_index=0, - sequence_number=0, - delta=delta['text'], - )) - + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"]["delta"] + if "text" in delta: + text_parts.append(delta["text"]) + self.events.send( + LLMResponseChunkEvent( + plugin_name="aws", + content_index=event["contentBlockDelta"].get( + "contentBlockIndex", 0 + ), + item_id="", + output_index=0, + sequence_number=0, + delta=delta["text"], + ) + ) + # Handle tool use - if 'contentBlockStart' in event: - start = event['contentBlockStart'].get('start', {}) - if 'toolUse' in start: - tool_use = start['toolUse'] - idx = event['contentBlockStart'].get('contentBlockIndex', 0) + if "contentBlockStart" in event: + start = event["contentBlockStart"].get("start", {}) + if "toolUse" in start: + tool_use = start["toolUse"] + idx = event["contentBlockStart"].get("contentBlockIndex", 0) self._pending_tool_uses_by_index[idx] = { - "id": tool_use.get('toolUseId', ''), - "name": tool_use.get('name', ''), - "parts": [] + "id": tool_use.get("toolUseId", ""), + "name": tool_use.get("name", ""), + "parts": [], } - - if 'contentBlockDelta' in event: - delta = event['contentBlockDelta']['delta'] - if 'toolUse' in delta: - idx = 
event['contentBlockDelta'].get('contentBlockIndex', 0) + + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"]["delta"] + if "toolUse" in delta: + idx = event["contentBlockDelta"].get("contentBlockIndex", 0) if idx in self._pending_tool_uses_by_index: - input_data = delta['toolUse'].get('input', '') - self._pending_tool_uses_by_index[idx]['parts'].append(input_data) - - if 'contentBlockStop' in event: - idx = event['contentBlockStop'].get('contentBlockIndex', 0) + input_data = delta["toolUse"].get("input", "") + self._pending_tool_uses_by_index[idx]["parts"].append(input_data) + + if "contentBlockStop" in event: + idx = event["contentBlockStop"].get("contentBlockIndex", 0) pending = self._pending_tool_uses_by_index.pop(idx, None) if pending: buf = "".join(pending["parts"]).strip() or "{}" @@ -383,51 +429,55 @@ def _process_stream_event( "type": "tool_call", "id": pending["id"], "name": pending["name"], - "arguments_json": args + "arguments_json": args, } accumulated_calls.append(tool_call_item) def _extract_text_from_response(self, response: Dict[str, Any]) -> str: """Extract text content from AWS response.""" - output = response.get('output', {}) - message = output.get('message', {}) - content = message.get('content', []) - + output = response.get("output", {}) + message = output.get("message", {}) + content = message.get("content", []) + text_parts = [] for item in content: - if 'text' in item: - text_parts.append(item['text']) - + if "text" in item: + text_parts.append(item["text"]) + return "".join(text_parts) - def _extract_tool_calls_from_response(self, response: Dict[str, Any]) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Dict[str, Any] + ) -> List[NormalizedToolCallItem]: """Extract tool calls from AWS response.""" tool_calls = [] - - output = response.get('output', {}) - message = output.get('message', {}) - content = message.get('content', []) - + + output = response.get("output", {}) + message = output.get("message", {}) + content = message.get("content", []) + for item in content: - if 'toolUse' in item: - tool_use = item['toolUse'] + if "toolUse" in item: + tool_use = item["toolUse"] tool_call: NormalizedToolCallItem = { "type": "tool_call", - "id": tool_use.get('toolUseId', ''), - "name": tool_use.get('name', ''), - "arguments_json": tool_use.get('input', {}) + "id": tool_use.get("toolUseId", ""), + "name": tool_use.get("name", ""), + "arguments_json": tool_use.get("input", {}), } tool_calls.append(tool_call) - + return tool_calls - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to AWS Bedrock format. 
- + Args: tools: List of ToolSchema objects - + Returns: List of tools in AWS Bedrock format """ @@ -437,9 +487,7 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic "toolSpec": { "name": tool["name"], "description": tool.get("description", ""), - "inputSchema": { - "json": tool["parameters_schema"] - } + "inputSchema": {"json": tool["parameters_schema"]}, } } aws_tools.append(aws_tool) @@ -451,9 +499,7 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: from vision_agents.core.agents.conversation import Message if isinstance(aws_messages, str): - aws_messages = [ - {"content": [{"text": aws_messages}], "role": "user"} - ] + aws_messages = [{"content": [{"text": aws_messages}], "role": "user"}] if not isinstance(aws_messages, (List, tuple)): aws_messages = [aws_messages] @@ -465,8 +511,8 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: # Extract text from content blocks text_parts = [] for item in content_items: - if isinstance(item, dict) and 'text' in item: - text_parts.append(item['text']) + if isinstance(item, dict) and "text" in item: + text_parts.append(item["text"]) elif isinstance(item, str): text_parts.append(item) content = " ".join(text_parts) @@ -478,4 +524,3 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: messages.append(message) return messages - diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py index fd0d9596..3661692d 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py @@ -8,7 +8,10 @@ from vision_agents.core.llm.llm import LLM, LLMResponseEvent from vision_agents.core.llm.llm_types import ToolSchema, NormalizedToolCallItem -from vision_agents.core.llm.events import LLMResponseCompletedEvent, LLMResponseChunkEvent +from vision_agents.core.llm.events import ( + LLMResponseCompletedEvent, + LLMResponseChunkEvent, +) from . import events @@ -20,24 +23,30 @@ class GeminiLLM(LLM): """ - The GeminiLLM class provides full/native access to the gemini SDK methods. - It only standardized the minimal feature set that's needed for the agent integration. + The GeminiLLM class provides full/native access to the gemini SDK methods. + It only standardized the minimal feature set that's needed for the agent integration. - The agent requires that we standardize: - - sharing instructions - - keeping conversation history - - response normalization + The agent requires that we standardize: + - sharing instructions + - keeping conversation history + - response normalization - Notes on the Gemini integration: - - the native method is called send_message (maps 1-1 to chat.send_message_stream) - - history is maintained in the gemini sdk (with the usage of client.chats.create(model=self.model)) + Notes on the Gemini integration: + - the native method is called send_message (maps 1-1 to chat.send_message_stream) + - history is maintained in the gemini sdk (with the usage of client.chats.create(model=self.model)) - Examples: + Examples: - from vision_agents.plugins import gemini - llm = gemini.LLM() - """ - def __init__(self, model: str, api_key: Optional[str] = None, client: Optional[genai.Client] = None): + from vision_agents.plugins import gemini + llm = gemini.LLM() + """ + + def __init__( + self, + model: str, + api_key: Optional[str] = None, + client: Optional[genai.Client] = None, + ): """ Initialize the GeminiLLM class. 
@@ -50,13 +59,19 @@ def __init__(self, model: str, api_key: Optional[str] = None, client: Optional[g self.events.register_events_from_module(events) self.model = model self.chat: Optional[Any] = None + self.provider_name = "gemini" if client is not None: self.client = client else: self.client = genai.Client(api_key=api_key) - async def simple_response(self, text: str, processors: Optional[List[Processor]] = None, participant: Optional[Any] = None) -> LLMResponseEvent[Any]: + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Any] = None, + ) -> LLMResponseEvent[Any]: """ simple_response is a standardized way (across openai, claude, gemini etc.) to create a response. @@ -68,9 +83,7 @@ async def simple_response(self, text: str, processors: Optional[List[Processor]] llm.simple_response("say hi to the user, be mean") """ - return await self.send_message( - message=text - ) + return await self.send_message(message=text) async def send_message(self, *args, **kwargs): """ @@ -78,7 +91,7 @@ async def send_message(self, *args, **kwargs): under the hood it calls chat.send_message_stream(*args, **kwargs) this method wraps and ensures we broadcast an event which the agent class hooks into """ - #if "model" not in kwargs: + # if "model" not in kwargs: # kwargs["model"] = self.model # initialize chat if needed @@ -91,6 +104,7 @@ async def send_message(self, *args, **kwargs): tools_spec = self.get_available_functions() if tools_spec: from google.genai import types + conv_tools = self._convert_tools_to_provider_format(tools_spec) cfg = kwargs.get("config") if not isinstance(cfg, types.GenerateContentConfig): @@ -100,7 +114,7 @@ async def send_message(self, *args, **kwargs): # Generate content using the client iterator = self.chat.send_message_stream(*args, **kwargs) - text_parts : List[str] = [] + text_parts: List[str] = [] final_chunk = None pending_calls: List[NormalizedToolCallItem] = [] @@ -126,12 +140,17 @@ async def send_message(self, *args, **kwargs): rounds = 0 current_calls = pending_calls cfg_with_tools = kwargs.get("config") - + seen: set[str] = set() while current_calls and rounds < MAX_ROUNDS: # Execute tools concurrently with deduplication - triples, seen = await self._dedup_and_execute(current_calls, max_concurrency=8, timeout_s=30, seen=seen) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + max_concurrency=8, + timeout_s=30, + seen=seen, + ) + executed = [] parts = [] for tc, res, err in triples: @@ -143,19 +162,28 @@ async def send_message(self, *args, **kwargs): sanitized_res = {} for k, v in res.items(): sanitized_res[k] = self._sanitize_tool_output(v) - parts.append(types.Part.from_function_response(name=tc["name"], response=sanitized_res)) - + parts.append( + types.Part.from_function_response( + name=tc["name"], response=sanitized_res + ) + ) + # Send function responses with tools config - follow_up_iter = self.chat.send_message_stream(parts, config=cfg_with_tools) # type: ignore[arg-type] - + follow_up_iter = self.chat.send_message_stream( + parts, # type: ignore[arg-type] + config=cfg_with_tools, + ) + follow_up_text_parts: List[str] = [] follow_up_last = None next_calls = [] - + for idx, chk in enumerate(follow_up_iter): follow_up_last = chk # TODO: unclear if this is correct (item_id and idx) - self._standardize_and_emit_event(chk, follow_up_text_parts, item_id, idx) + self._standardize_and_emit_event( + chk, follow_up_text_parts, item_id, 
idx + ) # Check for new function calls try: @@ -163,7 +191,7 @@ async def send_message(self, *args, **kwargs): next_calls.extend(chunk_calls) except Exception: pass - + current_calls = next_calls rounds += 1 @@ -173,12 +201,14 @@ async def send_message(self, *args, **kwargs): total_text = "".join(text_parts) llm_response = LLMResponseEvent(final_chunk, total_text) - self.events.send(LLMResponseCompletedEvent( - plugin_name="gemini", - original=llm_response.original, - text=llm_response.text, - item_id=item_id, - )) + self.events.send( + LLMResponseCompletedEvent( + plugin_name="gemini", + original=llm_response.original, + text=llm_response.text, + item_id=item_id, + ) + ) # Return the LLM response return llm_response @@ -186,12 +216,10 @@ async def send_message(self, *args, **kwargs): @staticmethod def _normalize_message(gemini_input) -> List["Message"]: from vision_agents.core.agents.conversation import Message - + # standardize on input if isinstance(gemini_input, str): - gemini_input = [ - gemini_input - ] + gemini_input = [gemini_input] if not isinstance(gemini_input, List): gemini_input = [gemini_input] @@ -203,29 +231,38 @@ def _normalize_message(gemini_input) -> List["Message"]: return messages - def _standardize_and_emit_event(self, chunk: GenerateContentResponse, text_parts: List[str], item_id: str, idx: int) -> Optional[LLMResponseEvent[Any]]: + def _standardize_and_emit_event( + self, + chunk: GenerateContentResponse, + text_parts: List[str], + item_id: str, + idx: int, + ) -> Optional[LLMResponseEvent[Any]]: """ Forwards the events and also send out a standardized version (the agent class hooks into that) """ # forward the native event - self.events.send(events.GeminiResponseEvent( - plugin_name="gemini", - response_chunk=chunk - )) + self.events.send( + events.GeminiResponseEvent(plugin_name="gemini", response_chunk=chunk) + ) # Check if response has text content - if hasattr(chunk, 'text') and chunk.text: - self.events.send(LLMResponseChunkEvent( - plugin_name="gemini", - content_index=idx, - item_id=item_id, - delta=chunk.text, - )) + if hasattr(chunk, "text") and chunk.text: + self.events.send( + LLMResponseChunkEvent( + plugin_name="gemini", + content_index=idx, + item_id=item_id, + delta=chunk.text, + ) + ) text_parts.append(chunk.text) return None - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to Gemini format. Args: @@ -235,75 +272,93 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic """ function_declarations = [] for tool in tools: - function_declarations.append({ - "name": tool["name"], - "description": tool.get("description", ""), - "parameters": tool["parameters_schema"] - }) - + function_declarations.append( + { + "name": tool["name"], + "description": tool.get("description", ""), + "parameters": tool["parameters_schema"], + } + ) + # Return as dict with function_declarations (SDK accepts dicts) return [{"function_declarations": function_declarations}] - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Gemini response. 
- + Args: response: Gemini response object - + Returns: List of normalized tool call items """ calls: List[NormalizedToolCallItem] = [] - + try: # Prefer the top-level convenience list if available function_calls = getattr(response, "function_calls", []) or [] for fc in function_calls: - calls.append({ - "type": "tool_call", - "name": getattr(fc, "name", "unknown"), - "arguments_json": getattr(fc, "args", {}) - }) + calls.append( + { + "type": "tool_call", + "name": getattr(fc, "name", "unknown"), + "arguments_json": getattr(fc, "args", {}), + } + ) if not calls and getattr(response, "candidates", None): for c in response.candidates: if getattr(c, "content", None): for part in c.content.parts: if getattr(part, "function_call", None): - calls.append({ - "type": "tool_call", - "name": getattr(part.function_call, "name", "unknown"), - "arguments_json": getattr(part.function_call, "args", {}), - }) + calls.append( + { + "type": "tool_call", + "name": getattr( + part.function_call, "name", "unknown" + ), + "arguments_json": getattr( + part.function_call, "args", {} + ), + } + ) except Exception: pass # Ignore extraction errors - + return calls - def _extract_tool_calls_from_stream_chunk(self, chunk: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_stream_chunk( + self, chunk: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Gemini streaming chunk. - + Args: chunk: Gemini streaming event - + Returns: List of normalized tool call items """ try: - return self._extract_tool_calls_from_response(chunk) # chunks use same shape + return self._extract_tool_calls_from_response( + chunk + ) # chunks use same shape except Exception: return [] # Ignore extraction errors - def _create_tool_result_parts(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]): + def _create_tool_result_parts( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ): """ Create function_response parts for Gemini. 
- + Args: tool_calls: List of tool calls that were executed results: List of results from function execution - + Returns: List of function_response parts """ @@ -315,9 +370,13 @@ def _create_tool_result_parts(self, tool_calls: List[NormalizedToolCallItem], re response_data = res else: response_data = {"result": res} - + # res may be dict/list/str; pass directly; SDK serializes - parts.append(types.Part.from_function_response(name=tc["name"], response=response_data)) + parts.append( + types.Part.from_function_response( + name=tc["name"], response=response_data + ) + ) except Exception: # Fallback: create a simple text part parts.append(types.Part(text=f"Function {tc['name']} returned: {res}")) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_llm.py b/plugins/openai/vision_agents/plugins/openai/openai_llm.py index 06a19940..c2fbe66d 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_llm.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_llm.py @@ -69,6 +69,7 @@ def __init__( self.model = model self.openai_conversation: Optional[Any] = None self.conversation = None + self.provider_name = "openai" if client is not None: self.client = client @@ -77,7 +78,7 @@ def __init__( else: self.client = AsyncOpenAI(base_url=base_url) - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, diff --git a/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py b/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py index 52664191..730f968e 100644 --- a/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py +++ b/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py @@ -1,4 +1,5 @@ """OpenRouter LLM implementation using OpenAI-compatible API.""" + import os from typing import Any @@ -24,7 +25,7 @@ def __init__( **kwargs: Any, ) -> None: """Initialize OpenRouter LLM. - + Args: api_key: OpenRouter API key. If not provided, uses OPENROUTER_API_KEY env var. base_url: OpenRouter API base URL. 
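The two hunks around this point cover the OpenRouter constructor: an OpenAI-compatible client configured with `api_key` (falling back to the `OPENROUTER_API_KEY` env var), a `base_url`, a `model`, and, in the next hunk, a `provider_name` identifier. A minimal usage sketch, assuming the plugin is exported as `openrouter.LLM` in the style of the other plugins in this series and using a purely illustrative model id:

```python
import os

# Assumed import path and export name, mirroring the aws/gemini plugin pattern in this series.
from vision_agents.plugins import openrouter

# api_key falls back to the OPENROUTER_API_KEY env var when omitted, per the Args
# section above; the model id below is illustrative, not taken from this patch.
llm = openrouter.LLM(
    model="anthropic/claude-3.5-sonnet",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)
```
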
@@ -39,6 +40,7 @@ def __init__( model=model, **kwargs, ) + self.provider_name = "openrouter" async def create_conversation(self): # Do nothing, dont call super @@ -51,11 +53,10 @@ def add_conversation_history(self, kwargs): new_messages = kwargs["input"] if not isinstance(new_messages, list): new_messages = [dict(content=new_messages, role="user", type="message")] - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["input"] = old_messages + new_messages # Add messages to conversation normalized_messages = self._normalize_message(new_messages) for msg in normalized_messages: self._conversation.messages.append(msg) - diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py index 411ea76f..06884440 100644 --- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py +++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py @@ -13,8 +13,11 @@ from vision_agents.core.agents import Conversation from vision_agents.core.agents.agents import default_agent_options, AgentOptions from vision_agents.core.edge.types import Participant -from vision_agents.core.observability import meter -from vision_agents.core.observability.metrics import Timer +from vision_agents.core.observability.metrics import ( + Timer, + turn_vad_latency_ms, + turn_end_detection_latency_ms, +) from vision_agents.core.turn_detection import ( TurnDetector, @@ -42,17 +45,6 @@ ) -turn_silero_vad_latency_ms = meter.create_histogram( - "turn.silero.vad.latency.ms", - unit="ms", -) - -turn_smart_turn_detection_latency_ms = meter.create_histogram( - "turn.smart_turn.detection.latency.ms", - unit="ms", -) - - @dataclass class Silence: trailing_silence_chunks: int = 0 @@ -248,8 +240,9 @@ async def _process_audio_packet( # detect speech in small 512 chunks, gather to larger audio segments with speech for chunk in audio_chunks[:-1]: # predict if this segment has speech - with Timer(turn_silero_vad_latency_ms) as timer: + with Timer(turn_vad_latency_ms) as timer: timer.attributes["samples"] = len(chunk.samples) + timer.attributes["implementation"] = "smart_turn" speech_probability = await self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold @@ -289,7 +282,8 @@ async def _process_audio_packet( merged.append(self._active_segment) merged = merged.tail(8, True, "start") # see if we've completed the turn - with Timer(turn_smart_turn_detection_latency_ms) as timer: + with Timer(turn_end_detection_latency_ms) as timer: + timer.attributes["implementation"] = "smart_turn" timer.attributes["audio_duration_ms"] = merged.duration_ms timer.attributes["samples"] = len(merged.samples) timer.attributes["trailing_silence_ms"] = trailing_silence_ms diff --git a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py index 34b352ee..e955d055 100644 --- a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py +++ b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py @@ -17,26 +17,23 @@ TurnStartedEvent, TurnEndedEvent, ) -from vision_agents.core.observability.metrics import Timer, meter +from vision_agents.core.observability.metrics import ( + Timer, + meter, + turn_vad_latency_ms, + 
turn_end_detection_latency_ms, +) import logging logger = logging.getLogger(__name__) -# Metrics for Vogent turn detection -vogent_vad_latency_ms = meter.create_histogram( - "vogent.vad.latency.ms", unit="ms", description="Vogent VAD prediction latency" -) +# Vogent-specific metric for Whisper transcription vogent_whisper_latency_ms = meter.create_histogram( "vogent.whisper.latency.ms", unit="ms", description="Vogent Whisper transcription latency", ) -vogent_turn_prediction_latency_ms = meter.create_histogram( - "vogent.turn_prediction.latency.ms", - unit="ms", - description="Vogent turn completion prediction latency", -) # Silero VAD model (reused from smart_turn) SILERO_ONNX_FILENAME = "silero_vad.onnx" @@ -260,8 +257,9 @@ async def _process_audio_packet( if self.vad is None: continue - with Timer(vogent_vad_latency_ms) as timer: + with Timer(turn_vad_latency_ms) as timer: timer.attributes["samples"] = len(chunk.samples) + timer.attributes["implementation"] = "vogent" speech_probability = self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold @@ -421,12 +419,13 @@ async def _predict_turn_completed( Returns: True if turn is complete, False otherwise """ - with Timer(vogent_turn_prediction_latency_ms) as timer: + with Timer(turn_end_detection_latency_ms) as timer: # Ensure it's 16khz and f32 format pcm = pcm.resample(16000).to_float32() # Truncate to 8 seconds audio_array = pcm.tail(8, False).samples + timer.attributes["implementation"] = "vogent" timer.attributes["audio_duration_ms"] = len(audio_array) / 16000 * 1000 timer.attributes["prev_line_length"] = len(prev_line) timer.attributes["curr_line_length"] = len(curr_line) diff --git a/plugins/xai/vision_agents/plugins/xai/llm.py b/plugins/xai/vision_agents/plugins/xai/llm.py index 5392ab64..4ed18bf0 100644 --- a/plugins/xai/vision_agents/plugins/xai/llm.py +++ b/plugins/xai/vision_agents/plugins/xai/llm.py @@ -5,7 +5,10 @@ from vision_agents.core.llm.llm import LLM, LLMResponseEvent from vision_agents.core.processors import Processor -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from . 
import events if TYPE_CHECKING: @@ -56,6 +59,7 @@ def __init__( self.model = model self.xai_chat: Optional["Chat"] = None self.conversation = None + self.provider_name = "xai" if client is not None: self.client = client @@ -64,7 +68,7 @@ def __init__( else: self.client = AsyncClient() - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -91,7 +95,9 @@ async def simple_response( instructions=instructions, ) - async def create_response(self, *args: Any, **kwargs: Any) -> LLMResponseEvent[Response]: + async def create_response( + self, *args: Any, **kwargs: Any + ) -> LLMResponseEvent[Response]: """ create_response gives you full support/access to the native xAI chat.sample() and chat.stream() methods this method wraps the xAI method and ensures we broadcast an event which the agent class hooks into @@ -139,10 +145,11 @@ async def create_response(self, *args: Any, **kwargs: Any) -> LLMResponseEvent[R self.xai_chat.append(response) if llm_response is not None: - self.events.send(LLMResponseCompletedEvent( - original=llm_response.original, - text=llm_response.text - )) + self.events.send( + LLMResponseCompletedEvent( + original=llm_response.original, text=llm_response.text + ) + ) return llm_response or LLMResponseEvent[Response]( Response(chat_pb2.GetChatCompletionResponse(), 0), "" @@ -170,31 +177,32 @@ def _standardize_and_emit_chunk( Forwards the chunk events and also send out a standardized version (the agent class hooks into that) """ # Emit the raw chunk event - self.events.send(events.XAIChunkEvent( - plugin_name="xai", - chunk=chunk - )) + self.events.send(events.XAIChunkEvent(plugin_name="xai", chunk=chunk)) # Emit standardized delta events for content if chunk.content: - self.events.send(LLMResponseChunkEvent( - content_index=0, # xAI doesn't have content_index - item_id=chunk.proto.id if hasattr(chunk.proto, "id") else "", - output_index=0, # xAI doesn't have output_index - sequence_number=0, # xAI doesn't have sequence_number - delta=chunk.content, - plugin_name="xai", - )) + self.events.send( + LLMResponseChunkEvent( + content_index=0, # xAI doesn't have content_index + item_id=chunk.proto.id if hasattr(chunk.proto, "id") else "", + output_index=0, # xAI doesn't have output_index + sequence_number=0, # xAI doesn't have sequence_number + delta=chunk.content, + plugin_name="xai", + ) + ) # Check if this is the final chunk (finish_reason indicates completion) if chunk.choices and chunk.choices[0].finish_reason: # This is the final chunk, return the complete response llm_response = LLMResponseEvent[Response](response, response.content) - self.events.send(LLMResponseCompletedEvent( - plugin_name="xai", - text=llm_response.text, - original=llm_response.original - )) + self.events.send( + LLMResponseCompletedEvent( + plugin_name="xai", + text=llm_response.text, + original=llm_response.original, + ) + ) return llm_response return None From 2755b9d78f65101807b7e040919a86304bd8c940 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 16:31:49 +0100 Subject: [PATCH 04/11] eable tracing for now --- .../simple_agent_example.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/01_simple_agent_example/simple_agent_example.py b/examples/01_simple_agent_example/simple_agent_example.py index 263741b0..4dfdc634 100644 --- a/examples/01_simple_agent_example/simple_agent_example.py +++ b/examples/01_simple_agent_example/simple_agent_example.py @@ -3,7 +3,7 @@ from 
dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import cartesia, deepgram, getstream, gemini, vogent +from vision_agents.plugins import cartesia, deepgram, getstream, gemini, smart_turn load_dotenv() @@ -22,7 +22,7 @@ async def start_agent() -> None: llm=llm, tts=cartesia.TTS(), stt=deepgram.STT(), - turn_detection=vogent.TurnDetection(), + turn_detection=smart_turn.TurnDetection(), # realtime version (vad, tts and stt not needed) # llm=openai.Realtime() ) @@ -59,11 +59,14 @@ async def start_agent() -> None: def setup_telemetry(): import atexit - from opentelemetry import trace + from opentelemetry import trace, metrics from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.exporter.prometheus import PrometheusMetricReader + from prometheus_client import start_http_server resource = Resource.create( { @@ -76,6 +79,13 @@ def setup_telemetry(): tp.add_span_processor(BatchSpanProcessor(exporter)) trace.set_tracer_provider(tp) + reader = PrometheusMetricReader() + metrics.set_meter_provider( + MeterProvider(resource=resource, metric_readers=[reader]) + ) + + start_http_server(port=9464) + def _flush_and_shutdown(): tp.force_flush() tp.shutdown() @@ -84,5 +94,5 @@ def _flush_and_shutdown(): if __name__ == "__main__": - # setup_telemetry() + setup_telemetry() asyncio.run(start_agent()) From 94f544a2d1ee717f6e1cddaa1c995ed02da71484 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 15:20:54 +0100 Subject: [PATCH 05/11] fix tests --- .../plugins/gemini/gemini_realtime.py | 35 ++++++++++++++ .../plugins/openai/openai_realtime.py | 35 ++++++++++++++ tests/test_function_calling.py | 46 +++++++++++++++---- 3 files changed, 107 insertions(+), 9 deletions(-) diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index 019a2c22..dcfbbfae 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -128,6 +128,41 @@ async def simple_response( self.logger.info("Simple response called with text: %s", text) await self.send_realtime_input(text=text) + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """ + Internal simple response implementation required by LLM base class. + + Note: Gemini Realtime is event-driven and doesn't return responses directly. + This implementation sends the text and returns a placeholder. + """ + await self.send_realtime_input(text=text) + return "" # Realtime API doesn't return text synchronously + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ): + """ + Internal simple response stream implementation required by LLM base class. + + Note: Gemini Realtime is event-driven and doesn't stream responses in this manner. + This implementation sends the text but yields nothing. 
+ """ + await self.send_realtime_input(text=text) + return + yield # Make this a generator + async def simple_audio_response( self, pcm: PcmData, participant: Optional[Participant] = None ): diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index a9074ae7..2672188e 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -118,6 +118,41 @@ async def simple_response( """ await self.rtc.send_text(text) + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """ + Internal simple response implementation required by LLM base class. + + Note: OpenAI Realtime is event-driven and doesn't return responses directly. + This implementation sends the text and returns a placeholder. + """ + await self.rtc.send_text(text) + return "" # Realtime API doesn't return text synchronously + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ): + """ + Internal simple response stream implementation required by LLM base class. + + Note: OpenAI Realtime is event-driven and doesn't stream responses in this manner. + This implementation sends the text but yields nothing. + """ + await self.rtc.send_text(text) + return + yield # Make this a generator + async def simple_audio_response( self, audio: PcmData, participant: Optional[Participant] = None ): diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py index 54dd3912..fabedf8a 100644 --- a/tests/test_function_calling.py +++ b/tests/test_function_calling.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import Mock, patch +from typing import Any, Dict, Optional, AsyncIterator from vision_agents.core.llm import FunctionRegistry, function_registry from vision_agents.core.llm.llm import LLM @@ -12,6 +13,33 @@ from vision_agents.plugins.gemini import LLM as GeminiLLM +# Test implementation of LLM for unit tests +class TestLLM(LLM): + """Concrete implementation of LLM for testing.""" + + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """Mock simple response.""" + return f"Mock response to: {text}" + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> AsyncIterator[str]: + """Mock simple response stream.""" + yield f"Mock response to: {text}" + + class TestFunctionRegistry: """Test the FunctionRegistry class.""" @@ -131,7 +159,7 @@ class TestLLMFunctionCalling: @pytest.mark.asyncio async def test_llm_function_registration(self): """Test that LLM can register functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -145,7 +173,7 @@ def test_func(x: int) -> int: @pytest.mark.asyncio async def test_llm_get_available_functions(self): """Test getting available functions from LLM.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Function 1") def func1(x: int) -> int: @@ -361,7 +389,7 @@ class TestFunctionCallingIntegration: @pytest.mark.asyncio async def 
test_tool_call_processing(self): """Test processing tool calls with multiple functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Get weather") def get_weather(location: str) -> str: @@ -385,7 +413,7 @@ def calculate_sum(a: int, b: int) -> int: @pytest.mark.asyncio async def test_error_handling_in_function_calls(self): """Test error handling in function calls.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function that raises error") def error_function(x: int) -> int: @@ -404,7 +432,7 @@ def error_function(x: int) -> int: @pytest.mark.asyncio async def test_function_schema_generation(self): """Test that function schemas are generated correctly.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Complex function") def complex_function( @@ -450,7 +478,7 @@ class TestConcurrentToolExecution: @pytest.mark.asyncio async def test_dedup_and_execute(self): """Test the _dedup_and_execute method.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -479,8 +507,8 @@ def test_func(x: int) -> int: async def test_tool_lifecycle_events(self): """Test that tool lifecycle events are emitted.""" from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent - - llm = LLM() + + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -513,7 +541,7 @@ async def track_end_event(event: ToolEndEvent): @pytest.mark.asyncio async def test_output_sanitization(self): """Test output sanitization for large responses.""" - llm = LLM() + llm = TestLLM() # Test normal output normal_output = "Hello world" From 3646cd05f02d65e24e1ca05a2552351f40fa1c1c Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 15:21:16 +0100 Subject: [PATCH 06/11] missing docker compose file --- docker-compose.yml | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docker-compose.yml diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..cb2ce184 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,74 @@ +services: + # Jaeger for distributed tracing + jaeger: + image: jaegertracing/all-in-one:latest + container_name: vision-agents-jaeger + ports: + - "16686:16686" # Jaeger UI + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + environment: + - COLLECTOR_OTLP_ENABLED=true + networks: + - observability + + # Prometheus for metrics collection + prometheus: + image: prom/prometheus:latest + container_name: vision-agents-prometheus + ports: + - "9090:9090" + volumes: + - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + networks: + - observability + + # Grafana for visualization + grafana: + image: grafana/grafana:latest + container_name: vision-agents-grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + volumes: + - ./observability/grafana/provisioning:/etc/grafana/provisioning + - 
./observability/grafana/dashboards:/var/lib/grafana/dashboards + - grafana-data:/var/lib/grafana + depends_on: + - prometheus + networks: + - observability + + # Init service to set home dashboard + grafana-init: + image: curlimages/curl:latest + container_name: vision-agents-grafana-init + volumes: + - ./observability/grafana/init-home-dashboard.sh:/init-home-dashboard.sh:ro + command: sh /init-home-dashboard.sh + depends_on: + - grafana + networks: + - observability + restart: "no" + +volumes: + prometheus-data: + grafana-data: + +networks: + observability: + driver: bridge From ed61f4df98496d4496e254258bad53650834c4e8 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 16:01:23 +0100 Subject: [PATCH 07/11] Update plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../anthropic/vision_agents/plugins/anthropic/anthropic_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py index 1364d5d6..91012809 100644 --- a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py +++ b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py @@ -368,7 +368,7 @@ def _standardize_and_emit_event( self.events.send( LLMResponseChunkEvent( - plugin_name="antrhopic", + plugin_name="anthropic", content_index=delta_event.index, item_id="", output_index=0, From 16d58f74d6f62dbe5b960bc2289a6505488cd644 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 16:08:51 +0100 Subject: [PATCH 08/11] ruffit --- .../plugins/gemini/gemini_realtime.py | 37 +-- .../plugins/openai/openai_realtime.py | 34 +- tests/test_function_calling.py | 306 ++++++++++-------- 3 files changed, 186 insertions(+), 191 deletions(-) diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index dcfbbfae..e9b28e50 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -131,37 +131,20 @@ async def simple_response( async def _simple_response( self, text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ) -> str: + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ): """ Internal simple response implementation required by LLM base class. Note: Gemini Realtime is event-driven and doesn't return responses directly. - This implementation sends the text and returns a placeholder. - """ - await self.send_realtime_input(text=text) - return "" # Realtime API doesn't return text synchronously - - async def _simple_response_stream( - self, - text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ): + This implementation sends the text via the public simple_response method. """ - Internal simple response stream implementation required by LLM base class. + from vision_agents.core.llm.llm import LLMResponseEvent - Note: Gemini Realtime is event-driven and doesn't stream responses in this manner. - This implementation sends the text but yields nothing. 
- """ - await self.send_realtime_input(text=text) - return - yield # Make this a generator + await self.simple_response(text, processors, participant) + # Return empty LLMResponseEvent since Realtime API is event-driven + return LLMResponseEvent(original=None, text="") async def simple_audio_response( self, pcm: PcmData, participant: Optional[Participant] = None @@ -376,7 +359,9 @@ async def _receive_loop(self): ) await self._handle_tool_call(server_message.tool_call) else: - self.logger.warning("Unrecognized event structure for gemini %s", server_message) + self.logger.warning( + "Unrecognized event structure for gemini %s", server_message + ) except CancelledError: logger.error("Stop async iteration exception") return diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index 2672188e..15bec56d 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -121,37 +121,20 @@ async def simple_response( async def _simple_response( self, text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ) -> str: + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ): """ Internal simple response implementation required by LLM base class. Note: OpenAI Realtime is event-driven and doesn't return responses directly. - This implementation sends the text and returns a placeholder. - """ - await self.rtc.send_text(text) - return "" # Realtime API doesn't return text synchronously - - async def _simple_response_stream( - self, - text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ): + This implementation sends the text via the public simple_response method. """ - Internal simple response stream implementation required by LLM base class. + from vision_agents.core.llm.llm import LLMResponseEvent - Note: OpenAI Realtime is event-driven and doesn't stream responses in this manner. - This implementation sends the text but yields nothing. - """ - await self.rtc.send_text(text) - return - yield # Make this a generator + await self.simple_response(text, processors, participant) + # Return empty LLMResponseEvent since Realtime API is event-driven + return LLMResponseEvent(original=None, text="") async def simple_audio_response( self, audio: PcmData, participant: Optional[Participant] = None @@ -180,7 +163,6 @@ async def request_session_info(self) -> None: async def close(self): await self.rtc.close() - async def _handle_openai_event(self, event: dict) -> None: """Process events received from the OpenAI Realtime API. 
diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py
index fabedf8a..ffd1db5d 100644
--- a/tests/test_function_calling.py
+++ b/tests/test_function_calling.py
@@ -4,7 +4,7 @@
 
 import pytest
 from unittest.mock import Mock, patch
-from typing import Any, Dict, Optional, AsyncIterator
+from typing import Any, Optional
 
 from vision_agents.core.llm import FunctionRegistry, function_registry
 from vision_agents.core.llm.llm import LLM
@@ -20,134 +20,123 @@ class TestLLM(LLM):
     async def _simple_response(
         self,
         text: str,
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
-        max_tokens: Optional[int] = None,
-        **kwargs: Any,
-    ) -> str:
+        processors: Optional[Any] = None,
+        participant: Optional[Any] = None,
+    ):
         """Mock simple response."""
-        return f"Mock response to: {text}"
+        from vision_agents.core.llm.llm import LLMResponseEvent
 
-    async def _simple_response_stream(
-        self,
-        text: str,
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
-        max_tokens: Optional[int] = None,
-        **kwargs: Any,
-    ) -> AsyncIterator[str]:
-        """Mock simple response stream."""
-        yield f"Mock response to: {text}"
+        return LLMResponseEvent(original=None, text=f"Mock response to: {text}")
 
 
 class TestFunctionRegistry:
     """Test the FunctionRegistry class."""
-    
+
     def test_register_function(self):
         """Test registering a function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int = 5) -> int:
             """Test function with default parameter."""
             return x + y
-    
+
         assert "test_func" in registry._functions
         assert registry._functions["test_func"].description == "Test function"
         assert len(registry._functions["test_func"].parameters) == 2
-    
+
     def test_call_function(self):
         """Test calling a registered function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Add two numbers")
         def add_numbers(a: int, b: int) -> int:
             """Add two numbers."""
             return a + b
-    
+
         result = registry.call_function("add_numbers", {"a": 5, "b": 3})
         assert result == 8
-    
+
     def test_call_function_with_defaults(self):
         """Test calling a function with default parameters."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function with defaults")
         def test_func(x: int, y: int = 10) -> int:
             """Test function with default parameter."""
             return x + y
-    
+
         # Test with both parameters
         result = registry.call_function("test_func", {"x": 5, "y": 3})
         assert result == 8
-    
+
         # Test with default parameter
         result = registry.call_function("test_func", {"x": 5})
         assert result == 15
-    
+
     def test_call_nonexistent_function(self):
         """Test calling a non-existent function raises error."""
         registry = FunctionRegistry()
-    
+
         with pytest.raises(KeyError):
             registry.call_function("nonexistent", {})
-    
+
     def test_call_function_missing_required_param(self):
         """Test calling a function with missing required parameter raises error."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int) -> int:
             """Test function."""
             return x + y
-    
+
         with pytest.raises(TypeError):
             registry.call_function("test_func", {"x": 5})
-    
+
     def test_get_tool_schemas(self):
         """Test getting tool schemas."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int = 5) -> int:
             """Test function."""
             return x + y
-    
+
         schemas = registry.get_tool_schemas()
         assert len(schemas) == 1
         assert schemas[0]["name"] == "test_func"
         assert schemas[0]["description"] == "Test function"
         assert "parameters_schema" in schemas[0]
-    
+
     def test_get_callable(self):
         """Test getting callable function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int) -> int:
             """Test function."""
             return x * 2
-    
+
         callable_func = registry.get_callable("test_func")
         assert callable_func(5) == 10
-    
+
         with pytest.raises(KeyError):
             registry.get_callable("nonexistent")
 
 
 class TestGlobalRegistry:
     """Test the global function registry."""
-    
+
     def test_global_registry(self):
         """Test that the global registry works."""
         # Clear any existing functions
         function_registry._functions.clear()
-    
+
         @function_registry.register(description="Global test function")
         def global_test_func(x: int) -> int:
             """Global test function."""
             return x * 3
-    
+
         assert "global_test_func" in function_registry._functions
         result = function_registry.call_function("global_test_func", {"x": 4})
         assert result == 12
@@ -155,34 +144,34 @@ def global_test_func(x: int) -> int:
 
 
 class TestLLMFunctionCalling:
     """Test LLM function calling functionality."""
-    
+
     @pytest.mark.asyncio
     async def test_llm_function_registration(self):
         """Test that LLM can register functions."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             """Test function."""
             return x * 2
-    
+
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "test_func"
-    
+
     @pytest.mark.asyncio
     async def test_llm_get_available_functions(self):
         """Test getting available functions from LLM."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Function 1")
         def func1(x: int) -> int:
             return x + 1
-    
+
         @llm.register_function(description="Function 2")
         def func2(x: int) -> int:
             return x * 2
-    
+
         functions = llm.get_available_functions()
         assert len(functions) == 2
         function_names = [f["name"] for f in functions]
@@ -192,60 +181,68 @@ def func2(x: int) -> int:
 
 
 class TestOpenAIFunctionCalling:
     """Test OpenAI function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.openai.openai_llm.AsyncOpenAI')
+    @patch("vision_agents.plugins.openai.openai_llm.AsyncOpenAI")
     async def test_openai_function_calling_response(self, mock_openai):
         """Test OpenAI function calling response."""
         # Mock the OpenAI client and response
         mock_client = Mock()
         mock_openai.return_value = mock_client
-    
+
         # Mock the responses.create call
         mock_response = Mock()
         mock_response.output = [
-            Mock(type="function_call", call_id="call_123", arguments='{"location": "New York"}')
+            Mock(
+                type="function_call",
+                call_id="call_123",
+                arguments='{"location": "New York"}',
+            )
         ]
         mock_client.responses.create.return_value = mock_response
-    
+
         llm = OpenAILLM(api_key="test-key", model="gpt-4")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
        def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
-    @patch('vision_agents.plugins.openai.openai_llm.AsyncOpenAI')
+
+    @patch("vision_agents.plugins.openai.openai_llm.AsyncOpenAI")
     async def test_openai_conversational_response(self, mock_openai):
         """Test OpenAI conversational response generation."""
         mock_client = Mock()
         mock_openai.return_value = mock_client
-    
+
         # Mock the responses.create call
         mock_response = Mock()
         mock_response.output = [
-            Mock(type="function_call", call_id="call_123", arguments='{"location": "New York"}')
+            Mock(
+                type="function_call",
+                call_id="call_123",
+                arguments='{"location": "New York"}',
+            )
         ]
         mock_client.responses.create.return_value = mock_response
-    
+
         llm = OpenAILLM(api_key="test-key", model="gpt-4")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -254,60 +251,70 @@ def get_weather(location: str) -> str:
 
 
 class TestClaudeFunctionCalling:
     """Test Claude function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic')
+    @patch("vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic")
     async def test_claude_function_calling_response(self, mock_anthropic):
         """Test Claude function calling response."""
         # Mock the Anthropic client and response
         mock_client = Mock()
         mock_anthropic.return_value = mock_client
-    
+
         # Mock the messages.create call
         mock_response = Mock()
         mock_response.content = [
-            Mock(type="tool_use", id="tool_123", name="get_weather", input={"location": "New York"})
+            Mock(
+                type="tool_use",
+                id="tool_123",
+                name="get_weather",
+                input={"location": "New York"},
+            )
         ]
         mock_client.messages.create.return_value = mock_response
-    
+
         llm = ClaudeLLM(api_key="test-key", model="claude-3-5-sonnet-20241022")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
-    @patch('vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic')
+
+    @patch("vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic")
     async def test_claude_conversational_response(self, mock_anthropic):
         """Test Claude conversational response generation."""
         mock_client = Mock()
         mock_anthropic.return_value = mock_client
-    
+
         # Mock the messages.create call
         mock_response = Mock()
         mock_response.content = [
-            Mock(type="tool_use", id="tool_123", name="get_weather", input={"location": "New York"})
+            Mock(
+                type="tool_use",
+                id="tool_123",
+                name="get_weather",
+                input={"location": "New York"},
+            )
         ]
         mock_client.messages.create.return_value = mock_response
-    
+
         llm = ClaudeLLM(api_key="test-key", model="claude-3-5-sonnet-20241022")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -316,67 +323,85 @@ def get_weather(location: str) -> str:
 
 
 class TestGeminiFunctionCalling:
     """Test Gemini function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.gemini.gemini_llm.genai')
+    @patch("vision_agents.plugins.gemini.gemini_llm.genai")
     async def test_gemini_function_calling_response(self, mock_genai):
         """Test Gemini function calling response."""
         # Mock the Gemini client and response
         mock_client = Mock()
         mock_genai.configure.return_value = None
         mock_genai.Chat.return_value = mock_client
-    
+
         # Mock the send_message_stream call
         mock_response = Mock()
         mock_response.candidates = [
-            Mock(content=Mock(parts=[
-                Mock(type="function_call", function_call=Mock(name="get_weather", args={"location": "New York"}))
-            ]))
+            Mock(
+                content=Mock(
+                    parts=[
+                        Mock(
+                            type="function_call",
+                            function_call=Mock(
+                                name="get_weather", args={"location": "New York"}
+                            ),
+                        )
+                    ]
+                )
+            )
         ]
         mock_client.send_message_stream.return_value = [mock_response]
-    
+
         llm = GeminiLLM(model="gemini-2.0-flash")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.gemini.gemini_llm.genai')
+    @patch("vision_agents.plugins.gemini.gemini_llm.genai")
     async def test_gemini_conversational_response(self, mock_genai):
         """Test Gemini conversational response generation."""
         mock_client = Mock()
         mock_genai.configure.return_value = None
         mock_genai.Chat.return_value = mock_client
-    
+
         # Mock the send_message_stream call
         mock_response = Mock()
         mock_response.candidates = [
-            Mock(content=Mock(parts=[
-                Mock(type="function_call", function_call=Mock(name="get_weather", args={"location": "New York"}))
-            ]))
+            Mock(
+                content=Mock(
+                    parts=[
+                        Mock(
+                            type="function_call",
+                            function_call=Mock(
+                                name="get_weather", args={"location": "New York"}
+                            ),
+                        )
+                    ]
+                )
+            )
         ]
         mock_client.send_message_stream.return_value = [mock_response]
-    
+
         llm = GeminiLLM(model="gemini-2.0-flash")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -385,85 +410,82 @@ def get_weather(location: str) -> str:
 
 
 class TestFunctionCallingIntegration:
     """Test function calling integration scenarios."""
-    
+
     @pytest.mark.asyncio
     async def test_tool_call_processing(self):
         """Test processing tool calls with multiple functions."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Get weather")
         def get_weather(location: str) -> str:
             return f"Weather in {location}: Sunny"
-    
+
         @llm.register_function(description="Calculate sum")
         def calculate_sum(a: int, b: int) -> int:
             return a + b
-    
+
         # Test multiple function registrations
         functions = llm.get_available_functions()
         assert len(functions) == 2
-    
+
         # Test calling both functions
         weather_result = llm.call_function("get_weather", {"location": "NYC"})
         sum_result = llm.call_function("calculate_sum", {"a": 5, "b": 3})
-    
+
         assert weather_result == "Weather in NYC: Sunny"
         assert sum_result == 8
-    
+
     @pytest.mark.asyncio
     async def test_error_handling_in_function_calls(self):
         """Test error handling in function calls."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function that raises error")
         def error_function(x: int) -> int:
             if x < 0:
                 raise ValueError("Negative numbers not allowed")
             return x * 2
-    
+
         # Test normal case
         result = llm.call_function("error_function", {"x": 5})
         assert result == 10
-    
+
         # Test error case
         with pytest.raises(ValueError):
             llm.call_function("error_function", {"x": -5})
-    
+
     @pytest.mark.asyncio
     async def test_function_schema_generation(self):
         """Test that function schemas are generated correctly."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Complex function")
         def complex_function(
-            name: str,
-            age: int,
-            is_active: bool = True,
-            tags: list = None
+            name: str, age: int, is_active: bool = True, tags: Optional[list] = None
         ) -> dict:
             """Complex function with various parameter types."""
             return {
                 "name": name,
                 "age": age,
                 "is_active": is_active,
-                "tags": tags or []
+                "tags": tags or [],
             }
-    
+
         schemas = llm.get_available_functions()
         assert len(schemas) == 1
-    
+
         schema = schemas[0]
         assert schema["name"] == "complex_function"
         assert schema["description"] == "Complex function"
         assert "parameters_schema" in schema
-    
+
         # Check parameter types
         params = schema["parameters_schema"]["properties"]
         assert "name" in params
         assert "age" in params
         assert "is_active" in params
         assert "tags" in params
-    
+
         # Check required parameters
         required = schema["parameters_schema"]["required"]
         assert "name" in required
@@ -474,87 +496,93 @@ def complex_function(
 
 
 class TestConcurrentToolExecution:
     """Test concurrent tool execution functionality."""
-    
+
     @pytest.mark.asyncio
     async def test_dedup_and_execute(self):
         """Test the _dedup_and_execute method."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             return x * 2
-    
+
         # Test with duplicate tool calls
         tool_calls = [
             {"id": "call1", "name": "test_func", "arguments_json": {"x": 5}},
-            {"id": "call2", "name": "test_func", "arguments_json": {"x": 5}},  # Duplicate
+            {
+                "id": "call2",
+                "name": "test_func",
+                "arguments_json": {"x": 5},
+            },  # Duplicate
             {"id": "call3", "name": "test_func", "arguments_json": {"x": 3}},
         ]
-    
+
         # This should deduplicate and only execute call1 and call3
         triples, seen = await llm._dedup_and_execute(tool_calls)
         # The deduplication should work, but let's check what actually happens
         # The key is based on (id, name, arguments_json), so different IDs = different keys
         assert len(triples) == 3  # All calls have different IDs, so all are executed
         assert len(seen) == 3  # 3 unique keys in seen set
-    
+
         # Check results
         results = [result for _, result, _ in triples]
         assert 10 in results  # 5 * 2 (appears twice)
-        assert 6 in results   # 3 * 2
-    
+        assert 6 in results  # 3 * 2
+
     @pytest.mark.asyncio
     async def test_tool_lifecycle_events(self):
         """Test that tool lifecycle events are emitted."""
         from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent
 
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             return x * 2
-    
+
         # Track emitted events
         start_events = []
         end_events = []
-    
+
         @llm.events.subscribe
         async def track_start_event(event: ToolStartEvent):
             start_events.append(event)
-    
+
         @llm.events.subscribe
         async def track_end_event(event: ToolEndEvent):
             end_events.append(event)
-    
+
         # Execute a tool call
-        await llm._run_one_tool({"id": "call1", "name": "test_func", "arguments_json": {"x": 5}}, 30.0)
+        await llm._run_one_tool(
+            {"id": "call1", "name": "test_func", "arguments_json": {"x": 5}}, 30.0
+        )
 
         # Wait for events
         await llm.events.wait(timeout=1.0)
-    
+
         # Check that events were emitted
         assert len(start_events) == 1
         assert len(end_events) == 1
         assert start_events[0].tool_name == "test_func"
         assert end_events[0].tool_name == "test_func"
         assert end_events[0].success is True
-    
+
     @pytest.mark.asyncio
     async def test_output_sanitization(self):
         """Test output sanitization for large responses."""
         llm = TestLLM()
-    
+
         # Test normal output
         normal_output = "Hello world"
         sanitized = llm._sanitize_tool_output(normal_output)
         assert sanitized == "Hello world"
-    
+
         # Test large output
         large_output = "x" * 70000  # Larger than default 60k limit
         sanitized = llm._sanitize_tool_output(large_output)
         assert len(sanitized) == 60001  # 60k + "…"
         assert sanitized.endswith("…")
-    
+
         # Test non-string output
         dict_output = {"key": "value"}
         sanitized = llm._sanitize_tool_output(dict_output)
-        assert sanitized == '{"key": "value"}'
\ No newline at end of file
+        assert sanitized == '{"key": "value"}'

From f42466cfa277ca97ff3e55cd408a5ec65d76b396 Mon Sep 17 00:00:00 2001
From: Tommaso Barbugli
Date: Sat, 1 Nov 2025 16:14:06 +0100
Subject: [PATCH 09/11] better shutdown for smart turn

---
 .../smart_turn/smart_turn_detection.py        | 53 +++++++++++--------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
index 06884440..2c32b5af 100644
--- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
+++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
@@ -174,29 +174,38 @@ async def _process_audio_loop(self):
         Background task that continuously processes audio from the queue.
         This is where the actual VAD and turn detection logic runs.
""" - while not self._shutdown_event.is_set(): - try: - # Wait for audio packet with timeout to allow shutdown - audio_data, participant, conversation = await asyncio.wait_for( - self._audio_queue.get(), timeout=1.0 - ) - - # Signal that we're actively processing - self._processing_active.set() - + try: + while not self._shutdown_event.is_set(): try: - # Process the audio packet - await self._process_audio_packet(audio_data, participant) - finally: - # If queue is empty, clear the processing flag - if self._audio_queue.empty(): - self._processing_active.clear() - - except asyncio.TimeoutError: - # Timeout is expected - continue loop to check shutdown - continue - except Exception as e: - logger.error(f"Error processing audio: {e}") + # Wait for audio packet with timeout to allow shutdown + audio_data, participant, conversation = await asyncio.wait_for( + self._audio_queue.get(), timeout=1.0 + ) + + # Signal that we're actively processing + self._processing_active.set() + + try: + # Process the audio packet + await self._process_audio_packet(audio_data, participant) + finally: + # If queue is empty, clear the processing flag + if self._audio_queue.empty(): + self._processing_active.clear() + + except asyncio.TimeoutError: + # Timeout is expected - continue loop to check shutdown + continue + except Exception as e: + logger.error(f"Error processing audio: {e}") + except asyncio.CancelledError: + # Task was cancelled - ensure clean shutdown + logger.debug("Audio processing loop cancelled") + raise + finally: + # Always clear flags on shutdown to allow proper lifecycle transitions + self._processing_active.clear() + self._shutdown_event.clear() async def _process_audio_packet( self, From 06aea3108375395a92d73c2d87c05498f508db6b Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Mon, 3 Nov 2025 11:24:56 +0100 Subject: [PATCH 10/11] better metrics --- .../grafana/dashboards/vision-agents.json | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json index 05da23ef..22374fcd 100644 --- a/observability/grafana/dashboards/vision-agents.json +++ b/observability/grafana/dashboards/vision-agents.json @@ -106,7 +106,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p50 - {{llm_class}}", "refId": "A" }, @@ -115,7 +115,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p95 - {{llm_class}}", "refId": "B" }, @@ -124,7 +124,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p99 - {{llm_class}}", "refId": "C" } @@ -216,7 +216,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p50 - {{stt_class}}", "refId": "A" 
}, @@ -225,7 +225,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p95 - {{stt_class}}", "refId": "B" }, @@ -234,7 +234,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p99 - {{stt_class}}", "refId": "C" } @@ -326,7 +326,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p50 - {{tts_class}}", "refId": "A" }, @@ -335,7 +335,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p95 - {{tts_class}}", "refId": "B" }, @@ -344,7 +344,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p99 - {{tts_class}}", "refId": "C" } @@ -436,7 +436,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p50 - {{provider}}", "refId": "A" }, @@ -445,7 +445,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p95 - {{provider}}", "refId": "B" }, @@ -454,7 +454,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p99 - {{provider}}", "refId": "C" } @@ -546,7 +546,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p50 - {{implementation}}", "refId": "A" }, @@ -555,7 +555,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p95 - {{implementation}}", "refId": "B" }, @@ -564,7 +564,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": 
"histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p99 - {{implementation}}", "refId": "C" } @@ -656,7 +656,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p50 - {{implementation}}", "refId": "A" }, @@ -665,7 +665,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p95 - {{implementation}}", "refId": "B" }, @@ -674,7 +674,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p99 - {{implementation}}", "refId": "C" } From 1193533eae7a791db7109dec50a92e022cdea49a Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Thu, 6 Nov 2025 13:22:33 +0100 Subject: [PATCH 11/11] fix tests --- tests/test_function_calling.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py index c8418f66..25a022c8 100644 --- a/tests/test_function_calling.py +++ b/tests/test_function_calling.py @@ -1,12 +1,15 @@ """ Tests for function calling functionality. 
""" +from typing import Optional, List, Any import pytest from unittest.mock import Mock, patch +from vision_agents.core.edge.types import Participant from vision_agents.core.llm import FunctionRegistry, function_registry -from vision_agents.core.llm.llm import LLM +from vision_agents.core.llm.llm import LLM, LLMResponseEvent +from vision_agents.core.processors import Processor from vision_agents.plugins.openai import LLM as OpenAILLM from vision_agents.plugins.anthropic import LLM as ClaudeLLM from vision_agents.plugins.gemini import LLM as GeminiLLM @@ -125,12 +128,21 @@ def global_test_func(x: int) -> int: assert result == 12 +class TestLLM(LLM): + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ) -> LLMResponseEvent[Any]: + return LLMResponseEvent(original=dict(), text="") + class TestLLMFunctionCalling: """Test LLM function calling functionality.""" async def test_llm_function_registration(self): """Test that LLM can register functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -143,7 +155,7 @@ def test_func(x: int) -> int: async def test_llm_get_available_functions(self): """Test getting available functions from LLM.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Function 1") def func1(x: int) -> int: @@ -417,7 +429,7 @@ class TestFunctionCallingIntegration: async def test_tool_call_processing(self): """Test processing tool calls with multiple functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Get weather") def get_weather(location: str) -> str: @@ -440,7 +452,7 @@ def calculate_sum(a: int, b: int) -> int: async def test_error_handling_in_function_calls(self): """Test error handling in function calls.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function that raises error") def error_function(x: int) -> int: @@ -458,7 +470,7 @@ def error_function(x: int) -> int: async def test_function_schema_generation(self): """Test that function schemas are generated correctly.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Complex function") def complex_function( @@ -500,7 +512,7 @@ class TestConcurrentToolExecution: async def test_dedup_and_execute(self): """Test the _dedup_and_execute method.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -533,7 +545,7 @@ async def test_tool_lifecycle_events(self): """Test that tool lifecycle events are emitted.""" from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -567,7 +579,7 @@ async def track_end_event(event: ToolEndEvent): async def test_output_sanitization(self): """Test output sanitization for large responses.""" - llm = LLM() + llm = TestLLM() # Test normal output normal_output = "Hello world"