From f8adce65df9e7d4f2243b6b87f604dbcb3521656 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 12:25:02 +0100 Subject: [PATCH 01/11] add timers to turn --- .../core/observability/metrics.py | 219 ++++++++++++---- .../core/turn_detection/turn_detection.py | 33 ++- .../plugins/krisp/turn_detection.py | 2 +- .../smart_turn/smart_turn_detection.py | 60 +++-- .../ultralytics/yolo_pose_processor.py | 36 ++- .../plugins/vogent/vogent_turn_detection.py | 234 ++++++++++-------- 6 files changed, 403 insertions(+), 181 deletions(-) diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index 86b7bd85..91bdedda 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -1,51 +1,14 @@ -"""OpenTelemetry observability instrumentation for vision-agents library. - -This module defines metrics and tracers for the vision-agents library. It does NOT -configure OpenTelemetry providers - that is the responsibility of applications using -this library. - -For applications using this library: - To enable telemetry, configure OpenTelemetry in your application before importing - vision-agents components: - - ```python - from opentelemetry import trace, metrics - from opentelemetry.sdk.trace import TracerProvider - from opentelemetry.sdk.metrics import MeterProvider - from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter - from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter - from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader - from opentelemetry.sdk.resources import Resource - - # Configure your service - resource = Resource.create({ - "service.name": "my-voice-app", - "service.version": "1.0.0", - }) - - # Setup trace provider - trace_provider = TracerProvider(resource=resource) - trace_provider.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317")) - ) - trace.set_tracer_provider(trace_provider) - - # Setup metrics provider - metric_reader = PeriodicExportingMetricReader( - OTLPMetricExporter(endpoint="http://localhost:4317") - ) - metrics_provider = MeterProvider(resource=resource, metric_readers=[metric_reader]) - metrics.set_meter_provider(metrics_provider) - - # Now import and use vision-agents - from vision_agents.core.tts import TTS - ``` - - If no providers are configured, metrics and traces will be no-ops. -""" +from __future__ import annotations + +import functools +import inspect +from typing import Dict, Any, Optional, Mapping, Callable, Awaitable, TypeVar, Union from opentelemetry import trace, metrics +from opentelemetry.metrics import Histogram +import time + +R = TypeVar("R") # Get tracer and meter using the library name # These will use whatever providers the application has configured @@ -75,3 +38,167 @@ inflight_ops = meter.create_up_down_counter( "voice.ops.inflight", description="Inflight voice ops" ) + +turn_detection_latency_ms = meter.create_histogram( + "turn.detection.latency.ms", + unit="ms", +) + + +class Timer: + """ + Can be used as: + done = Timer(hist, {"attr": 1}) + ... + done({"phase": "init"}) + + with Timer(hist, {"attr": 1}) as timer: + timer.attributes["dynamic_key"] = "dynamic_value" + ... + + @Timer(hist, {"route": "/join"}) + def handler(...): ... + + @Timer(hist) + async def async_handler(...): ... 
+ + If decorating a method, automatically adds {"class": } to attributes. + + When used as a context manager, you can add attributes dynamically via the + `attributes` property, which will be merged with base attributes when recording. + """ + + def __init__( + self, + hist: Histogram, + attributes: Optional[Mapping[str, Any]] = None, + *, + unit: str = "ms", + record_exceptions: bool = True, + ) -> None: + self._hist = hist + self._base_attrs: Dict[str, Any] = dict(attributes or {}) + self._unit = unit + self._record_exceptions = record_exceptions + + self._start_ns = time.perf_counter_ns() + self._stopped = False + self.last_elapsed_ms: Optional[float] = None + + # Public attributes dictionary that can be modified during context manager usage + self.attributes: Dict[str, Any] = {} + + def __call__(self, *args, **kwargs): + """If called with a function, act as a decorator; else record.""" + if args and callable(args[0]) and len(args) == 1 and not kwargs: + func = args[0] + return self._decorate(func) + extra_attrs = args[0] if args else None + return self.stop(extra_attrs) + + def __enter__(self) -> "Timer": + self._restart() + return self + + def __exit__(self, exc_type, exc, tb) -> None: + attrs: Dict[str, Any] = {} + if self._record_exceptions: + attrs["exception"] = "true" if exc_type else "false" + if exc_type: + attrs["exception_type"] = getattr(exc_type, "__name__", str(exc_type)) + self.stop(attrs) + + def stop(self, extra_attributes: Optional[Mapping[str, Any]] = None) -> float: + """Idempotent: records only once per start.""" + if not self._stopped: + self._stopped = True + elapsed = self.elapsed_ms() + self.last_elapsed_ms = elapsed + + attrs = {**self._base_attrs} + # Merge the dynamic attributes set during context manager usage + attrs.update(self.attributes) + if extra_attributes: + attrs.update(dict(extra_attributes)) + + value = elapsed if self._unit == "ms" else elapsed / 1000.0 + self._hist.record(value, attributes=attrs) + + return self.last_elapsed_ms or 0.0 + + def elapsed_ms(self) -> float: + return (time.perf_counter_ns() - self._start_ns) / 1_000_000.0 + + def _restart(self) -> None: + self._start_ns = time.perf_counter_ns() + self._stopped = False + self.last_elapsed_ms = None + self.attributes = {} # Reset dynamic attributes on restart + + def _decorate( + self, func: Union[Callable[..., R], Callable[..., Awaitable[R]]] + ) -> Union[Callable[..., R], Callable[..., Awaitable[R]]]: + """ + Decorate a function or method. + Automatically adds {"class": } if decorating a bound method. 
+ """ + + is_async = inspect.iscoroutinefunction(func) + + if is_async: + # Type-cast func as async for type checker + async_func: Callable[..., Awaitable[R]] = func # type: ignore[assignment] + + @functools.wraps(async_func) + async def async_wrapper(*args, **kwargs) -> R: + class_name = _get_class_name_from_args(async_func, args) + attrs = {**self._base_attrs} + if class_name: + attrs["class"] = class_name + with Timer( + self._hist, + attrs, + unit=self._unit, + record_exceptions=self._record_exceptions, + ): + return await async_func(*args, **kwargs) + + return async_wrapper + else: + # Type-cast func as sync for type checker + sync_func: Callable[..., R] = func # type: ignore[assignment] + + @functools.wraps(sync_func) + def sync_wrapper(*args, **kwargs) -> R: + class_name = _get_class_name_from_args(sync_func, args) + attrs = {**self._base_attrs} + if class_name: + attrs["class"] = class_name + with Timer( + self._hist, + attrs, + unit=self._unit, + record_exceptions=self._record_exceptions, + ): + return sync_func(*args, **kwargs) + + return sync_wrapper + + +def _get_class_name_from_args( + func: Callable[..., Any], args: tuple[Any, ...] +) -> Optional[str]: + """Return class name if first arg looks like a bound method (self or cls).""" + if not args: + return None + + first = args[0] + + if hasattr(first, "__class__") and func.__qualname__.startswith( + first.__class__.__name__ + "." + ): + return first.__class__.__name__ + + if inspect.isclass(first) and func.__qualname__.startswith(first.__name__ + "."): + return first.__name__ + return None diff --git a/agents-core/vision_agents/core/turn_detection/turn_detection.py b/agents-core/vision_agents/core/turn_detection/turn_detection.py index e61f507b..29105c99 100644 --- a/agents-core/vision_agents/core/turn_detection/turn_detection.py +++ b/agents-core/vision_agents/core/turn_detection/turn_detection.py @@ -8,6 +8,7 @@ from .events import TurnStartedEvent, TurnEndedEvent from ..agents.conversation import Conversation from ..edge.types import Participant +from ..observability.metrics import turn_detection_latency_ms, Timer class TurnEvent(Enum): @@ -17,14 +18,11 @@ class TurnEvent(Enum): TURN_ENDED = "turn_ended" - class TurnDetector(ABC): """Base implementation for turn detection with common functionality.""" def __init__( - self, - confidence_threshold: float = 0.5, - provider_name: Optional[str] = None + self, confidence_threshold: float = 0.5, provider_name: Optional[str] = None ) -> None: self._confidence_threshold = confidence_threshold self.is_active = False @@ -33,21 +31,17 @@ def __init__( self.events = EventManager() self.events.register_events_from_module(events, ignore_not_compatible=True) - def _emit_start_turn_event( - self, event: TurnStartedEvent - ) -> None: + def _emit_start_turn_event(self, event: TurnStartedEvent) -> None: event.session_id = self.session_id event.plugin_name = self.provider_name self.events.send(event) - def _emit_end_turn_event( - self, event: TurnEndedEvent - ) -> None: + def _emit_end_turn_event(self, event: TurnEndedEvent) -> None: event.session_id = self.session_id event.plugin_name = self.provider_name self.events.send(event) - @abstractmethod + @Timer(turn_detection_latency_ms) async def process_audio( self, audio_data: PcmData, @@ -62,6 +56,23 @@ async def process_audio( conversation: Transcription/ chat history, sometimes useful for turn detection """ + return await self.detect_turn(audio_data, participant, conversation) + + @abstractmethod + async def detect_turn( + self, + audio_data: 
PcmData, + participant: Participant, + conversation: Optional[Conversation], + ) -> None: + """Process the audio and trigger turn start or turn end events + + Args: + audio_data: PcmData object containing audio samples from Stream + participant: Participant that's speaking, includes user data + conversation: Transcription/ chat history, sometimes useful for turn detection + """ + ... async def start(self) -> None: diff --git a/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py b/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py index b62d15e5..6a1d2d1d 100644 --- a/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py +++ b/plugins/krisp/vision_agents/plugins/krisp/turn_detection.py @@ -75,7 +75,7 @@ def is_detecting(self) -> bool: """Check if turn detection is currently active.""" return self._is_detecting - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py index fd158546..411ea76f 100644 --- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py +++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py @@ -13,13 +13,14 @@ from vision_agents.core.agents import Conversation from vision_agents.core.agents.agents import default_agent_options, AgentOptions from vision_agents.core.edge.types import Participant +from vision_agents.core.observability import meter +from vision_agents.core.observability.metrics import Timer from vision_agents.core.turn_detection import ( TurnDetector, TurnStartedEvent, TurnEndedEvent, ) - import logging logger = logging.getLogger(__name__) @@ -41,6 +42,17 @@ ) +turn_silero_vad_latency_ms = meter.create_histogram( + "turn.silero.vad.latency.ms", + unit="ms", +) + +turn_smart_turn_detection_latency_ms = meter.create_histogram( + "turn.smart_turn.detection.latency.ms", + unit="ms", +) + + @dataclass class Silence: trailing_silence_chunks: int = 0 @@ -109,7 +121,9 @@ def __init__( self._audio_queue: asyncio.Queue[Any] = asyncio.Queue() self._processing_task: Optional[asyncio.Task[Any]] = None self._shutdown_event = asyncio.Event() - self._processing_active = asyncio.Event() # Tracks if background task is processing + self._processing_active = ( + asyncio.Event() + ) # Tracks if background task is processing if options is None: self.options = default_agent_options() @@ -149,7 +163,7 @@ async def _prepare_silero_vad(self): SileroVAD, path, reset_interval_seconds=self.vad_reset_interval_seconds ) - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, @@ -177,7 +191,7 @@ async def _process_audio_loop(self): # Signal that we're actively processing self._processing_active.set() - + try: # Process the audio packet await self._process_audio_packet(audio_data, participant) @@ -234,7 +248,9 @@ async def _process_audio_packet( # detect speech in small 512 chunks, gather to larger audio segments with speech for chunk in audio_chunks[:-1]: # predict if this segment has speech - speech_probability = await self.vad.predict_speech(chunk.samples) + with Timer(turn_silero_vad_latency_ms) as timer: + timer.attributes["samples"] = len(chunk.samples) + speech_probability = await self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold if self._active_segment is not None: @@ -252,7 +268,11 @@ async def 
_process_audio_packet( # TODO: make this testable trailing_silence_ms = ( - self._silence.trailing_silence_chunks * 512 / 16000 * 1000 * 5 #DTX correction + self._silence.trailing_silence_chunks + * 512 + / 16000 + * 1000 + * 5 # DTX correction ) long_silence = trailing_silence_ms > self._trailing_silence_ms max_duration_reached = ( @@ -269,7 +289,15 @@ async def _process_audio_packet( merged.append(self._active_segment) merged = merged.tail(8, True, "start") # see if we've completed the turn - prediction = await self._predict_turn_completed(merged, participant) + with Timer(turn_smart_turn_detection_latency_ms) as timer: + timer.attributes["audio_duration_ms"] = merged.duration_ms + timer.attributes["samples"] = len(merged.samples) + timer.attributes["trailing_silence_ms"] = trailing_silence_ms + prediction = await self._predict_turn_completed( + merged, participant + ) + timer.attributes["prediction"] = prediction + timer.attributes["turn_ended"] = prediction > 0.5 turn_ended = prediction > 0.5 if turn_ended: self._emit_end_turn_event( @@ -304,19 +332,19 @@ async def _process_audio_packet( async def wait_for_processing_complete(self, timeout: float = 5.0) -> None: """Wait for all queued audio to be processed. Useful for testing.""" start_time = time.time() - + # Wait for queue to be empty AND no active processing while (time.time() - start_time) < timeout: queue_empty = self._audio_queue.qsize() == 0 not_processing = not self._processing_active.is_set() - + if queue_empty and not_processing: # Give a small final buffer to ensure events are emitted await asyncio.sleep(0.05) return - + await asyncio.sleep(0.01) - + # Timeout reached logger.warning(f"wait_for_processing_complete timed out after {timeout}s") @@ -380,16 +408,16 @@ def _blocking_predict_turn_completed( def _build_smart_turn_session(self): path = os.path.join(self.options.model_dir, SMART_TURN_ONNX_FILENAME) - + # Load model into memory to avoid multi-worker file access issues with open(path, "rb") as f: model_bytes = f.read() - + so = ort.SessionOptions() so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL so.inter_op_num_threads = 1 so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL - + # Load from memory instead of file path return ort.InferenceSession(model_bytes, sess_options=so) @@ -408,10 +436,10 @@ def __init__(self, model_path: str, reset_interval_seconds: float = 5.0): # Load model into memory to avoid multi-worker file access issues with open(model_path, "rb") as f: model_bytes = f.read() - + opts = ort.SessionOptions() opts.inter_op_num_threads = 1 - + # Load from memory instead of file path self.session = ort.InferenceSession(model_bytes, sess_options=opts) self.context_size = 64 # Silero uses 64-sample context at 16 kHz diff --git a/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py b/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py index 0adbbd06..bf1e9274 100644 --- a/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py +++ b/plugins/ultralytics/vision_agents/plugins/ultralytics/yolo_pose_processor.py @@ -20,9 +20,15 @@ ) from vision_agents.core.utils.queue import LatestNQueue from vision_agents.core.utils.video_forwarder import VideoForwarder +from vision_agents.core.observability.metrics import Timer, meter logger = logging.getLogger(__name__) +# Metrics for YOLO pose detection +yolo_pose_inference_ms = meter.create_histogram( + "yolo.pose.inference.ms", unit="ms", description="YOLO pose inference latency" +) + 
DEFAULT_WIDTH = 640 DEFAULT_HEIGHT = 480 DEFAULT_WIDTH = 1920 @@ -310,16 +316,28 @@ def _process_pose_sync( ) # Run pose detection - yolo_start = time.perf_counter() - pose_results = self.pose_model( - frame_array, - verbose=False, - # imgsz=self.imgsz, - conf=self.conf_threshold, - device=self.device, + with Timer(yolo_pose_inference_ms) as timer: + timer.attributes["frame_width"] = frame_array.shape[1] + timer.attributes["frame_height"] = frame_array.shape[0] + timer.attributes["conf_threshold"] = self.conf_threshold + timer.attributes["device"] = str(self.device) + + pose_results = self.pose_model( + frame_array, + verbose=False, + # imgsz=self.imgsz, + conf=self.conf_threshold, + device=self.device, + ) + + # Add detected person count to metrics + timer.attributes["persons_detected"] = ( + len(pose_results) if pose_results else 0 + ) + + logger.debug( + f"🎯 YOLO inference completed in {timer.last_elapsed_ms:.1f}ms" ) - yolo_time = time.perf_counter() - yolo_start - logger.debug(f"🎯 YOLO inference completed in {yolo_time:.3f}s") if not pose_results: logger.debug("❌ No pose results detected") diff --git a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py index 5b902973..34b352ee 100644 --- a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py +++ b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py @@ -17,11 +17,27 @@ TurnStartedEvent, TurnEndedEvent, ) +from vision_agents.core.observability.metrics import Timer, meter import logging logger = logging.getLogger(__name__) +# Metrics for Vogent turn detection +vogent_vad_latency_ms = meter.create_histogram( + "vogent.vad.latency.ms", unit="ms", description="Vogent VAD prediction latency" +) +vogent_whisper_latency_ms = meter.create_histogram( + "vogent.whisper.latency.ms", + unit="ms", + description="Vogent Whisper transcription latency", +) +vogent_turn_prediction_latency_ms = meter.create_histogram( + "vogent.turn_prediction.latency.ms", + unit="ms", + description="Vogent turn completion prediction latency", +) + # Silero VAD model (reused from smart_turn) SILERO_ONNX_FILENAME = "silero_vad.onnx" SILERO_ONNX_URL = "https://github.com/snakers4/silero-vad/raw/master/src/silero_vad/data/silero_vad.onnx" @@ -40,15 +56,15 @@ class Silence: class VogentTurnDetection(TurnDetector): """ Vogent Turn Detection combines audio intonation and text context for accurate turn detection. - + This implementation: 1. Uses Silero VAD to detect when speech starts/stops 2. Uses faster-whisper to transcribe audio in real-time 3. Uses Vogent Turn model (multimodal) to detect turn completion - + Vogent operates on both audio features AND text context, making it more accurate than audio-only approaches, especially for handling incomplete thoughts. - + Reference: https://github.com/vogent/vogent-turn Blogpost: https://blog.vogent.ai/posts/voturn-80m-state-of-the-art-turn-detection-for-voice-agents """ @@ -66,7 +82,7 @@ def __init__( ): """ Initialize Vogent Turn Detection. 
- + Args: whisper_model_size: Faster-whisper model size (tiny, base, small, medium, large) vad_reset_interval_seconds: Reset VAD internal state every N seconds to prevent drift @@ -78,7 +94,7 @@ def __init__( model_dir: Directory to store model files """ super().__init__() - + # Configuration parameters self.whisper_model_size = whisper_model_size self.vad_reset_interval_seconds = vad_reset_interval_seconds @@ -88,7 +104,7 @@ def __init__( self.max_segment_duration_seconds = max_segment_duration_seconds self.vogent_threshold = vogent_threshold self.model_dir = model_dir - + # Audio buffering for processing self._audio_buffer = PcmData( sample_rate=RATE, channels=1, format=AudioFormat.F32 @@ -99,12 +115,12 @@ def __init__( ) self._active_segment: Optional[PcmData] = None self._trailing_silence_ms = self.silence_duration_ms - + # Producer-consumer pattern: audio packets go into buffer, background task processes them self._audio_queue: asyncio.Queue[Any] = asyncio.Queue() self._processing_task: Optional[asyncio.Task[Any]] = None self._shutdown_event = asyncio.Event() - + # Model instances (initialized in start()) self.vad = None self.whisper = None @@ -114,17 +130,17 @@ async def start(self): """Initialize models and prepare for turn detection.""" # Ensure model directory exists os.makedirs(self.model_dir, exist_ok=True) - + # Prepare models in parallel await asyncio.gather( self._prepare_silero_vad(), self._prepare_whisper(), self._prepare_vogent(), ) - + # Start background processing task self._processing_task = asyncio.create_task(self._process_audio_loop()) - + # Call parent start method await super().start() @@ -133,8 +149,10 @@ async def _prepare_silero_vad(self) -> None: path = os.path.join(self.model_dir, SILERO_ONNX_FILENAME) await ensure_model(path, SILERO_ONNX_URL) # Initialize VAD in thread pool to avoid blocking event loop - self.vad = await asyncio.to_thread( - lambda: SileroVAD(path, reset_interval_seconds=self.vad_reset_interval_seconds) # type: ignore + self.vad = await asyncio.to_thread( # type: ignore[func-returns-value] + lambda: SileroVAD( # type: ignore[arg-type] + path, reset_interval_seconds=self.vad_reset_interval_seconds + ) ) async def _prepare_whisper(self) -> None: @@ -142,7 +160,9 @@ async def _prepare_whisper(self) -> None: logger.info(f"Loading faster-whisper model: {self.whisper_model_size}") # Load whisper in thread pool to avoid blocking event loop self.whisper = await asyncio.to_thread( # type: ignore[func-returns-value] - lambda: WhisperModel(self.whisper_model_size, device="cpu", compute_type="int8") + lambda: WhisperModel( + self.whisper_model_size, device="cpu", compute_type="int8" + ) ) logger.info("Faster-whisper model loaded") @@ -162,7 +182,7 @@ async def _prepare_vogent(self) -> None: ) logger.info("Vogent turn detection model loaded") - async def process_audio( + async def detect_turn( self, audio_data: PcmData, participant: Participant, @@ -204,7 +224,7 @@ async def _process_audio_packet( ) -> None: """ Process audio packet through VAD -> Whisper -> Vogent pipeline. - + This method: 1. Buffers audio and processes in 512-sample chunks 2. Uses VAD to detect speech @@ -212,7 +232,7 @@ async def _process_audio_packet( 4. 
When reaching silence or max duration: - Transcribes segment with Whisper - Checks turn completion with Vogent (audio + text) - + Args: audio_data: PcmData object containing audio samples participant: Participant that's speaking @@ -239,8 +259,10 @@ async def _process_audio_packet( # Predict if this segment has speech if self.vad is None: continue - - speech_probability = self.vad.predict_speech(chunk.samples) + + with Timer(vogent_vad_latency_ms) as timer: + timer.attributes["samples"] = len(chunk.samples) + speech_probability = self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold if self._active_segment is not None: @@ -256,7 +278,11 @@ async def _process_audio_packet( self._silence.trailing_silence_chunks += 1 trailing_silence_ms = ( - self._silence.trailing_silence_chunks * CHUNK / RATE * 1000 * 5 # DTX correction + self._silence.trailing_silence_chunks + * CHUNK + / RATE + * 1000 + * 5 # DTX correction ) long_silence = trailing_silence_ms > self._trailing_silence_ms max_duration_reached = ( @@ -272,20 +298,20 @@ async def _process_audio_packet( merged.append(self._pre_speech_buffer) merged.append(self._active_segment) merged = merged.tail(8, True, "start") - + # Transcribe the segment with Whisper transcription = await self._transcribe_segment(merged) - + # Get previous line from conversation for context prev_line = self._get_previous_line(conversation) - + # Check if turn is complete using Vogent (multimodal: audio + text) is_complete = await self._predict_turn_completed( merged, prev_line=prev_line, curr_line=transcription, ) - + if is_complete: self._emit_end_turn_event( TurnEndedEvent( @@ -303,7 +329,7 @@ async def _process_audio_packet( ) self._pre_speech_buffer.append(merged) self._pre_speech_buffer = self._pre_speech_buffer.tail(8) - + elif is_speech and self._active_segment is None: self._emit_start_turn_event(TurnStartedEvent(participant=participant)) # Create a new segment @@ -342,103 +368,115 @@ async def stop(self): async def _transcribe_segment(self, pcm: PcmData) -> str: """ Transcribe audio segment using faster-whisper. 
- + Args: pcm: PcmData containing audio samples - + Returns: Transcribed text """ - # Ensure it's 16khz and f32 format - pcm = pcm.resample(16000).to_float32() - audio_array = pcm.samples - - if self.whisper is None: - return "" - - # Run transcription in thread pool to avoid blocking - segments, info = await asyncio.to_thread( - self.whisper.transcribe, - audio_array, - language="en", - beam_size=1, - vad_filter=False, # We already did VAD - ) - - # Collect all text segments - text_parts = [] - for segment in segments: - text_parts.append(segment.text.strip()) - - transcription = " ".join(text_parts).strip() + with Timer(vogent_whisper_latency_ms) as timer: + # Ensure it's 16khz and f32 format + pcm = pcm.resample(16000).to_float32() + audio_array = pcm.samples + timer.attributes["audio_duration_ms"] = pcm.duration_ms + timer.attributes["samples"] = len(audio_array) + + if self.whisper is None: + return "" + + # Run transcription in thread pool to avoid blocking + segments, info = await asyncio.to_thread( + self.whisper.transcribe, + audio_array, + language="en", + beam_size=1, + vad_filter=False, # We already did VAD + ) + + # Collect all text segments + text_parts = [] + for segment in segments: + text_parts.append(segment.text.strip()) + + transcription = " ".join(text_parts).strip() + timer.attributes["transcription_length"] = len(transcription) + return transcription async def _predict_turn_completed( - self, - pcm: PcmData, + self, + pcm: PcmData, prev_line: str, curr_line: str, ) -> bool: """ Predict whether the current turn is complete using Vogent. - + Args: pcm: PcmData containing audio samples prev_line: Previous speaker's text (for context) curr_line: Current speaker's text - + Returns: True if turn is complete, False otherwise """ - # Ensure it's 16khz and f32 format - pcm = pcm.resample(16000).to_float32() - - # Truncate to 8 seconds - audio_array = pcm.tail(8, False).samples - - if self.vogent is None: - return False - - # Run vogent prediction in thread pool - result = await asyncio.to_thread( - self.vogent.predict, - audio_array, - prev_line=prev_line, - curr_line=curr_line, - sample_rate=16000, - return_probs=True, - ) - - # Check if probability exceeds threshold - is_complete = result['prob_endpoint'] > self.vogent_threshold - logger.debug( - f"Vogent probability: {result['prob_endpoint']:.3f}, " - f"threshold: {self.vogent_threshold}, is_complete: {is_complete}" - ) - + with Timer(vogent_turn_prediction_latency_ms) as timer: + # Ensure it's 16khz and f32 format + pcm = pcm.resample(16000).to_float32() + + # Truncate to 8 seconds + audio_array = pcm.tail(8, False).samples + timer.attributes["audio_duration_ms"] = len(audio_array) / 16000 * 1000 + timer.attributes["prev_line_length"] = len(prev_line) + timer.attributes["curr_line_length"] = len(curr_line) + + if self.vogent is None: + return False + + # Run vogent prediction in thread pool + result = await asyncio.to_thread( + self.vogent.predict, + audio_array, + prev_line=prev_line, + curr_line=curr_line, + sample_rate=16000, + return_probs=True, + ) + + # Check if probability exceeds threshold + is_complete = result["prob_endpoint"] > self.vogent_threshold + timer.attributes["probability"] = result["prob_endpoint"] + timer.attributes["is_complete"] = is_complete + + logger.debug( + f"Vogent probability: {result['prob_endpoint']:.3f}, " + f"threshold: {self.vogent_threshold}, is_complete: {is_complete}" + ) + return is_complete def _get_previous_line(self, conversation: Optional[Conversation]) -> str: """ Extract the 
previous speaker's line from conversation history. - + Args: conversation: Conversation object with message history - + Returns: Previous line text, or empty string if not available """ if conversation is None or not conversation.messages: return "" - + # Get the last message that's not from the current speaker # Typically this would be the assistant or another user for message in reversed(conversation.messages): if message.content and message.content.strip(): # Remove terminal punctuation for better vogent performance - text = message.content.strip().rstrip('.!?') + text = message.content.strip().rstrip(".!?") return text - + return "" @@ -446,20 +484,20 @@ def _get_previous_line(self, conversation: Optional[Conversation]) -> str: class SileroVAD: """ Minimal Silero VAD ONNX wrapper for 16 kHz, mono, chunk=512. - + Reused from smart_turn implementation. """ def __init__(self, model_path: str, reset_interval_seconds: float = 5.0): """ Initialize Silero VAD. - + Args: model_path: Path to the ONNX model file reset_interval_seconds: Reset internal state every N seconds to prevent drift """ import onnxruntime as ort - + opts = ort.SessionOptions() opts.inter_op_num_threads = 1 self.session = ort.InferenceSession(model_path, sess_options=opts) @@ -512,43 +550,43 @@ def predict_speech(self, chunk_f32: np.ndarray) -> float: async def ensure_model(path: str, url: str) -> str: """ Download a model file asynchronously if it doesn't exist. - + Args: path: Local path where the model should be saved url: URL to download the model from - + Returns: The path to the model file """ if not os.path.exists(path): model_name = os.path.basename(path) logger.info(f"Downloading {model_name}...") - + try: - async with httpx.AsyncClient(timeout=300.0, follow_redirects=True) as client: + async with httpx.AsyncClient( + timeout=300.0, follow_redirects=True + ) as client: async with client.stream("GET", url) as response: response.raise_for_status() - + # Write file in chunks to avoid loading entire file in memory chunks = [] async for chunk in response.aiter_bytes(chunk_size=8192): chunks.append(chunk) - + # Write all chunks to file in thread to avoid blocking event loop def write_file(): with open(path, "wb") as f: for chunk in chunks: f.write(chunk) - + await asyncio.to_thread(write_file) - + logger.info(f"{model_name} downloaded.") except httpx.HTTPError as e: # Clean up partial download on error if os.path.exists(path): os.remove(path) raise RuntimeError(f"Failed to download {model_name}: {e}") - - return path - + return path From 744f50b8668e068cd6dd5376c870dfd1e9df1ad8 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 14:10:53 +0100 Subject: [PATCH 02/11] metrics for stt and built-in observability stack --- agents-core/tests/test_timer.py | 446 ++++++++++++++ .../core/observability/metrics.py | 25 +- agents-core/vision_agents/core/stt/stt.py | 110 +++- agents-core/vision_agents/core/tts/tts.py | 6 +- observability/.gitignore | 4 + observability/README.md | 173 ++++++ .../grafana/dashboards/vision-agents.json | 557 ++++++++++++++++++ observability/grafana/init-home-dashboard.sh | 33 ++ .../provisioning/dashboards/default.yml | 13 + .../provisioning/datasources/prometheus.yml | 12 + observability/prometheus/prometheus.yml | 21 + .../plugins/deepgram/deepgram_stt.py | 23 +- .../fish/vision_agents/plugins/fish/stt.py | 6 +- .../vision_agents/plugins/wizper/stt.py | 2 +- 14 files changed, 1382 insertions(+), 49 deletions(-) create mode 100644 agents-core/tests/test_timer.py create mode 100644 
observability/.gitignore create mode 100644 observability/README.md create mode 100644 observability/grafana/dashboards/vision-agents.json create mode 100755 observability/grafana/init-home-dashboard.sh create mode 100644 observability/grafana/provisioning/dashboards/default.yml create mode 100644 observability/grafana/provisioning/datasources/prometheus.yml create mode 100644 observability/prometheus/prometheus.yml diff --git a/agents-core/tests/test_timer.py b/agents-core/tests/test_timer.py new file mode 100644 index 00000000..81e83347 --- /dev/null +++ b/agents-core/tests/test_timer.py @@ -0,0 +1,446 @@ +"""Tests for the Timer class in observability metrics.""" + +import asyncio +import pytest +from unittest.mock import MagicMock +from vision_agents.core.observability.metrics import Timer + + +@pytest.fixture +def mock_histogram(): + """Create a mock histogram for testing.""" + return MagicMock() + + +class TestTimerContextManager: + """Tests for Timer used as a context manager.""" + + def test_context_manager_records_timing(self, mock_histogram): + """Test that Timer records elapsed time when used as context manager.""" + with Timer(mock_histogram) as timer: + pass # Do nothing, just measure overhead + + # Verify record was called + mock_histogram.record.assert_called_once() + call_args = mock_histogram.record.call_args + + # First argument should be elapsed time in ms + elapsed_ms = call_args[0][0] + assert isinstance(elapsed_ms, float) + assert elapsed_ms >= 0 + + # Should have recorded the elapsed time + assert timer.last_elapsed_ms is not None + assert timer.last_elapsed_ms >= 0 + + def test_context_manager_with_base_attributes(self, mock_histogram): + """Test that base attributes are included in recording.""" + base_attrs = {"provider": "test", "version": "1.0"} + + with Timer(mock_histogram, base_attrs): + pass + + # Verify attributes were passed + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert "provider" in recorded_attrs + assert recorded_attrs["provider"] == "test" + assert "version" in recorded_attrs + assert recorded_attrs["version"] == "1.0" + + def test_context_manager_with_dynamic_attributes(self, mock_histogram): + """Test that dynamic attributes can be added during execution.""" + with Timer(mock_histogram, {"base": "value"}) as timer: + timer.attributes["dynamic"] = "added" + timer.attributes["count"] = 42 + + # Verify both base and dynamic attributes were recorded + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["base"] == "value" + assert recorded_attrs["dynamic"] == "added" + assert recorded_attrs["count"] == 42 + + def test_context_manager_exception_tracking(self, mock_histogram): + """Test that exceptions are tracked in attributes.""" + try: + with Timer(mock_histogram, record_exceptions=True): + raise ValueError("test error") + except ValueError: + pass + + # Verify exception was recorded + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["exception"] == "true" + assert recorded_attrs["exception_type"] == "ValueError" + + def test_context_manager_no_exception(self, mock_histogram): + """Test that no exception is recorded when code succeeds.""" + with Timer(mock_histogram, record_exceptions=True): + pass + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["exception"] == "false" + assert "exception_type" not 
in recorded_attrs + + def test_direct_call_pattern(self, mock_histogram): + """Test Timer used with direct call pattern.""" + timer = Timer(mock_histogram, {"base": "attr"}) + + # Simulate some work + import time + + time.sleep(0.01) + + # Call with extra attributes + elapsed = timer({"phase": "init"}) + + # Verify recording + assert elapsed > 0 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["base"] == "attr" + assert recorded_attrs["phase"] == "init" + + def test_stop_is_idempotent(self, mock_histogram): + """Test that calling stop multiple times only records once.""" + timer = Timer(mock_histogram) + + timer.stop() + timer.stop() + timer.stop() + + # Should only be called once + assert mock_histogram.record.call_count == 1 + + +class TestTimerDecorator: + """Tests for Timer used as a decorator.""" + + def test_sync_function_decorator(self, mock_histogram): + """Test decorating a synchronous function.""" + + @Timer(mock_histogram, {"func": "test"}) + def my_function(x, y): + return x + y + + result = my_function(2, 3) + + assert result == 5 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + assert recorded_attrs["func"] == "test" + + async def test_async_function_decorator(self, mock_histogram): + """Test decorating an async function.""" + + @Timer(mock_histogram, {"func": "async_test"}) + async def my_async_function(x): + await asyncio.sleep(0.01) + return x * 2 + + result = await my_async_function(5) + + assert result == 10 + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + assert recorded_attrs["func"] == "async_test" + + def test_method_decorator_adds_class_name(self, mock_histogram): + """Test that decorating a method automatically adds class name.""" + + class MyClass: + @Timer(mock_histogram, {"method": "process"}) + def process(self): + return "processed" + + instance = MyClass() + result = instance.process() + + assert result == "processed" + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should automatically add fully qualified class path + assert "class" in recorded_attrs + # Check it ends with the class name (module path will vary) + assert recorded_attrs["class"].endswith(".MyClass") + assert recorded_attrs["method"] == "process" + + async def test_async_method_decorator_adds_class_name(self, mock_histogram): + """Test that decorating an async method adds class name.""" + + class MyAsyncClass: + @Timer(mock_histogram) + async def async_process(self): + await asyncio.sleep(0.01) + return "async_processed" + + instance = MyAsyncClass() + result = await instance.async_process() + + assert result == "async_processed" + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert "class" in recorded_attrs + assert recorded_attrs["class"].endswith(".MyAsyncClass") + + +class TestTimerInheritedMethods: + """Tests for Timer with inherited methods - the bug fix.""" + + def test_inherited_method_reports_subclass_name(self, mock_histogram): + """Test that inherited methods report the actual subclass name.""" + + class BaseClass: + @Timer(mock_histogram) + def process(self): + return "processed" + + class 
SubClassA(BaseClass): + pass + + class SubClassB(BaseClass): + pass + + # Test SubClassA + instance_a = SubClassA() + instance_a.process() + + # Test SubClassB + instance_b = SubClassB() + instance_b.process() + + # Should have been called twice + assert mock_histogram.record.call_count == 2 + + # Check first call (SubClassA) + first_call = mock_histogram.record.call_args_list[0] + first_attrs = first_call[1]["attributes"] + assert first_attrs["class"].endswith(".SubClassA") + + # Check second call (SubClassB) + second_call = mock_histogram.record.call_args_list[1] + second_attrs = second_call[1]["attributes"] + assert second_attrs["class"].endswith(".SubClassB") + + async def test_inherited_async_method_reports_subclass_name(self, mock_histogram): + """Test that inherited async methods report the actual subclass name.""" + + class AsyncBaseClass: + @Timer(mock_histogram) + async def process(self): + await asyncio.sleep(0.01) + return "processed" + + class AsyncSubClass(AsyncBaseClass): + pass + + instance = AsyncSubClass() + await instance.process() + + mock_histogram.record.assert_called_once() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the subclass path, not the base class + assert recorded_attrs["class"].endswith(".AsyncSubClass") + + def test_deeply_nested_inheritance(self, mock_histogram): + """Test that deep inheritance chains still report the correct class.""" + + class GrandParent: + @Timer(mock_histogram) + def process(self): + return "processed" + + class Parent(GrandParent): + pass + + class Child(Parent): + pass + + instance = Child() + instance.process() + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the most specific class path + assert recorded_attrs["class"].endswith(".Child") + + +class TestTimerUnits: + """Tests for Timer unit conversions.""" + + def test_millisecond_unit_default(self, mock_histogram): + """Test that default unit is milliseconds.""" + with Timer(mock_histogram): + pass + + call_args = mock_histogram.record.call_args + elapsed = call_args[0][0] + + # Value should be in milliseconds (small positive number) + assert elapsed >= 0 + assert elapsed < 1000 # Should be less than 1 second for this test + + def test_second_unit_conversion(self, mock_histogram): + """Test that seconds unit converts correctly.""" + with Timer(mock_histogram, unit="s"): + import time + + time.sleep(0.01) # Sleep 10ms + + call_args = mock_histogram.record.call_args + elapsed_seconds = call_args[0][0] + + # Should be approximately 0.01 seconds + assert 0.005 < elapsed_seconds < 0.05 + + +class TestTimerEdgeCases: + """Tests for edge cases and error conditions.""" + + def test_timer_without_stop_in_context_manager(self, mock_histogram): + """Test that __exit__ always calls stop.""" + with Timer(mock_histogram) as timer: + # Don't call stop manually + pass + + # Should have been called by __exit__ + mock_histogram.record.assert_called_once() + assert timer.last_elapsed_ms is not None + + def test_restart_clears_attributes(self, mock_histogram): + """Test that restart clears dynamic attributes.""" + timer = Timer(mock_histogram) + + # First use + timer.attributes["first"] = "value1" + timer.stop() + + # Restart and use again + timer._restart() + timer.attributes["second"] = "value2" + timer.stop({"extra": "attr"}) + + # Second call should only have "second" and "extra", not "first" + second_call = mock_histogram.record.call_args_list[1] + second_attrs = 
second_call[1]["attributes"] + + assert "second" in second_attrs + assert "extra" in second_attrs + assert "first" not in second_attrs + + def test_elapsed_ms_while_running(self, mock_histogram): + """Test that elapsed_ms can be called while timer is running.""" + with Timer(mock_histogram) as timer: + import time + + time.sleep(0.01) + elapsed = timer.elapsed_ms() + assert elapsed > 0 + + # Final elapsed should be >= interim elapsed + assert timer.last_elapsed_ms >= elapsed + + def test_callable_check_in_call(self, mock_histogram): + """Test that __call__ with callable argument triggers decoration.""" + + def my_func(): + return 42 + + timer = Timer(mock_histogram) + decorated = timer(my_func) + + # Should return a wrapped function + assert callable(decorated) + assert decorated() == 42 + mock_histogram.record.assert_called_once() + + +class TestTimerRealWorldScenarios: + """Tests simulating real-world usage patterns.""" + + async def test_stt_pattern(self, mock_histogram): + """Test the pattern used in STT base class.""" + + class STT: + async def process_audio(self, audio_data): + with Timer(mock_histogram) as timer: + timer.attributes["provider"] = self.__class__.__name__ + timer.attributes["samples"] = len(audio_data) + + # Simulate processing + await asyncio.sleep(0.01) + + class DeepgramSTT(STT): + pass + + stt = DeepgramSTT() + await stt.process_audio([1, 2, 3, 4, 5]) + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + assert recorded_attrs["provider"] == "DeepgramSTT" + assert recorded_attrs["samples"] == 5 + + def test_turn_detection_pattern(self, mock_histogram): + """Test the pattern used in TurnDetector base class.""" + + class TurnDetector: + @Timer(mock_histogram) + async def process_audio(self, audio_data): + await asyncio.sleep(0.01) + return "turn_detected" + + class SmartTurnDetection(TurnDetector): + pass + + detector = SmartTurnDetection() + result = asyncio.run(detector.process_audio([1, 2, 3])) + + assert result == "turn_detected" + + call_args = mock_histogram.record.call_args + recorded_attrs = call_args[1]["attributes"] + + # Should report the actual implementation class path + assert recorded_attrs["class"].endswith(".SmartTurnDetection") + + def test_multiple_nested_timers(self, mock_histogram): + """Test that nested timers work independently.""" + with Timer(mock_histogram, {"outer": "timer"}): + with Timer(mock_histogram, {"inner": "timer"}): + pass + + # Both should have recorded + assert mock_histogram.record.call_count == 2 + + # Check both calls had different attributes + first_call_attrs = mock_histogram.record.call_args_list[0][1]["attributes"] + second_call_attrs = mock_histogram.record.call_args_list[1][1]["attributes"] + + assert first_call_attrs["inner"] == "timer" + assert second_call_attrs["outer"] == "timer" diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index 91bdedda..c9aad222 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -188,17 +188,30 @@ def sync_wrapper(*args, **kwargs) -> R: def _get_class_name_from_args( func: Callable[..., Any], args: tuple[Any, ...] ) -> Optional[str]: - """Return class name if first arg looks like a bound method (self or cls).""" + """Return fully qualified class path if first arg looks like a bound method (self or cls). 
+ + For instance methods (self), we return the runtime class path (module.Class), not just + the class name. This provides better identification in metrics, especially when multiple + plugins use the same class name (e.g., TTS). + + Returns: + Fully qualified class path like "vision_agents.plugins.cartesia.tts.TTS" + or None if not a method call. + """ if not args: return None first = args[0] - if hasattr(first, "__class__") and func.__qualname__.startswith( - first.__class__.__name__ + "." - ): - return first.__class__.__name__ + # Check if this looks like an instance method call (self parameter) + if hasattr(first, "__class__") and not inspect.isclass(first): + # Verify it's actually a method by checking the function's qualname contains a dot + if "." in func.__qualname__: + # Return the fully qualified class path + return f"{first.__class__.__module__}.{first.__class__.__qualname__}" + # Check if this looks like a class method call (cls parameter) if inspect.isclass(first) and func.__qualname__.startswith(first.__name__ + "."): - return first.__name__ + return f"{first.__module__}.{first.__qualname__}" + return None diff --git a/agents-core/vision_agents/core/stt/stt.py b/agents-core/vision_agents/core/stt/stt.py index 0d4fce44..91f0a679 100644 --- a/agents-core/vision_agents/core/stt/stt.py +++ b/agents-core/vision_agents/core/stt/stt.py @@ -6,6 +6,7 @@ from ..edge.types import Participant from vision_agents.core.events.manager import EventManager +from ..observability.metrics import Timer, stt_latency_ms, stt_errors from . import events from .events import TranscriptResponse @@ -23,6 +24,7 @@ class STT(abc.ABC): process_audio is currently called every 20ms. The integration with turn keeping could be improved """ + closed: bool = False started: bool = False @@ -36,7 +38,6 @@ def __init__( self.events = EventManager() self.events.register_events_from_module(events, ignore_not_compatible=True) - def _emit_transcript_event( self, text: str, @@ -51,13 +52,15 @@ def _emit_transcript_event( participant: Participant metadata. response: Transcription response metadata. """ - self.events.send(events.STTTranscriptEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - text=text, - participant=participant, - response=response, - )) + self.events.send( + events.STTTranscriptEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + text=text, + participant=participant, + response=response, + ) + ) def _emit_partial_transcript_event( self, @@ -73,13 +76,15 @@ def _emit_partial_transcript_event( participant: Participant metadata. response: Transcription response metadata. """ - self.events.send(events.STTPartialTranscriptEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - text=text, - participant=participant, - response=response, - )) + self.events.send( + events.STTPartialTranscriptEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + text=text, + participant=participant, + response=response, + ) + ) def _emit_error_event( self, @@ -91,20 +96,73 @@ def _emit_error_event( Emit an error event. Note this should only be emitted for temporary errors. 
Permanent errors due to config etc should be directly raised """ - self.events.send(events.STTErrorEvent( - session_id=self.session_id, - plugin_name=self.provider_name, - error=error, - context=context, - participant=participant, - error_code=getattr(error, "error_code", None), - is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)), - )) + # Increment error counter + stt_errors.add( + 1, {"provider": self.provider_name, "error_type": type(error).__name__} + ) + + self.events.send( + events.STTErrorEvent( + session_id=self.session_id, + plugin_name=self.provider_name, + error=error, + context=context, + participant=participant, + error_code=getattr(error, "error_code", None), + is_recoverable=not isinstance(error, (SystemExit, KeyboardInterrupt)), + ) + ) - @abc.abstractmethod async def process_audio( - self, pcm_data: PcmData, participant: Optional[Participant] = None, + self, + pcm_data: PcmData, + participant: Optional[Participant] = None, + ): + """ + Process audio with automatic metrics tracking. + + This method wraps the actual processing with metrics collection + and delegates to the _process_audio method that subclasses implement. + + Args: + pcm_data: Audio data to process + participant: Optional participant metadata + """ + with Timer(stt_latency_ms) as timer: + # Use fully qualified class path for better identification + timer.attributes["stt_class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = self.provider_name + timer.attributes["sample_rate"] = pcm_data.sample_rate + timer.attributes["channels"] = pcm_data.channels + timer.attributes["samples"] = ( + len(pcm_data.samples) if pcm_data.samples is not None else 0 + ) + timer.attributes["duration_ms"] = pcm_data.duration_ms + + try: + await self._process_audio(pcm_data, participant) + except Exception as e: + timer.attributes["error"] = type(e).__name__ + raise + + @abc.abstractmethod + async def _process_audio( + self, + pcm_data: PcmData, + participant: Optional[Participant] = None, ): + """ + Process audio data and emit transcription events. + + Subclasses must implement this method to perform the actual STT processing. + The base class handles metrics collection automatically. 
+ + Args: + pcm_data: Audio data to process + participant: Optional participant metadata + """ pass async def start(self): diff --git a/agents-core/vision_agents/core/tts/tts.py b/agents-core/vision_agents/core/tts/tts.py index cf4761b7..dd28f83f 100644 --- a/agents-core/vision_agents/core/tts/tts.py +++ b/agents-core/vision_agents/core/tts/tts.py @@ -338,9 +338,9 @@ async def send( raise finally: elapsed_ms = (time.time() - start_time) * 1000.0 - tts_latency_ms.record( - elapsed_ms, attributes={"tts_class": self.__class__.__name__} - ) + # Use fully qualified class path for better identification + class_path = f"{self.__class__.__module__}.{self.__class__.__qualname__}" + tts_latency_ms.record(elapsed_ms, attributes={"tts_class": class_path}) async def close(self): """Close the TTS service and release any resources.""" diff --git a/observability/.gitignore b/observability/.gitignore new file mode 100644 index 00000000..d577e73b --- /dev/null +++ b/observability/.gitignore @@ -0,0 +1,4 @@ +# Ignore Docker volume data +data/ +*.tmp +*.log diff --git a/observability/README.md b/observability/README.md new file mode 100644 index 00000000..56414221 --- /dev/null +++ b/observability/README.md @@ -0,0 +1,173 @@ +# Vision Agents Observability Stack + +This directory contains the complete observability setup for Vision Agents, including: +- **Prometheus** for metrics collection +- **Jaeger** for distributed tracing +- **Grafana** for visualization with pre-configured dashboards + +## Quick Start + +### 1. Start the Observability Stack + +From the root of the Vision Agents repository: + +```bash +docker-compose up -d +``` + +This will start: +- **Jaeger UI**: http://localhost:16686 +- **Prometheus UI**: http://localhost:9090 +- **Grafana**: http://localhost:3000 (admin/admin) + +### 2. Run Your Vision Agents Application + +The example in `examples/01_simple_agent_example/simple_agent_example.py` already includes the `setup_telemetry()` function that: +- Exports traces to Jaeger (OTLP on port 4317) +- Exposes Prometheus metrics on port 9464 + +Run the example: + +```bash +cd examples/01_simple_agent_example +uv run python simple_agent_example.py +``` + +### 3. View Metrics in Grafana + +1. Open Grafana: http://localhost:3000 +2. Login with `admin` / `admin` +3. Navigate to **Dashboards** → **Vision Agents - Performance Metrics** + +The dashboard automatically displays: +- **STT Latency** (p50, p95, p99) by implementation +- **STT Errors** rate by provider and error type +- **TTS Latency** (p50, p95, p99) by implementation +- **TTS Errors** rate by provider and error type +- **Turn Detection Latency** (p50, p95, p99) by implementation + +### 4. View Traces in Jaeger + +1. Open Jaeger: http://localhost:16686 +2. Select service: `agents` +3. 
Click **Find Traces** to see distributed traces + +## Architecture + +### Metrics Flow + +``` +Vision Agents App (port 9464) + ↓ (scrape every 5s) +Prometheus (port 9090) + ↓ (datasource) +Grafana (port 3000) +``` + +### Traces Flow + +``` +Vision Agents App + ↓ (OTLP gRPC on port 4317) +Jaeger Collector + ↓ +Jaeger UI (port 16686) +``` + +## Available Metrics + +### STT Metrics +- `stt_latency_ms` - Histogram of STT processing latency + - Labels: `stt_class`, `provider`, `sample_rate`, `channels`, `samples`, `duration_ms` +- `stt_errors` - Counter of STT errors + - Labels: `provider`, `error_type` + +### TTS Metrics +- `tts_latency_ms` - Histogram of TTS synthesis latency + - Labels: `tts_class` +- `tts_errors` - Counter of TTS errors + - Labels: `provider`, `error_type` + +### Turn Detection Metrics +- `turn_detection_latency_ms` - Histogram of turn detection latency + - Labels: `class` + +## Configuration + +### Prometheus + +Edit `prometheus/prometheus.yml` to: +- Change scrape interval +- Add additional scrape targets +- Configure alerting rules + +### Grafana + +#### Add Custom Dashboards + +1. Place JSON dashboard files in `grafana/dashboards/` +2. They will be automatically loaded on startup + +#### Modify Datasources + +Edit `grafana/provisioning/datasources/prometheus.yml` + +### Jaeger + +Jaeger is configured with default settings. To customize, modify the `jaeger` service in `docker-compose.yml`. + +## Troubleshooting + +### Prometheus Can't Scrape Metrics + +**Issue**: Prometheus shows target as "down" + +**Solution**: Ensure `host.docker.internal` resolves correctly: +- **Linux**: Add `--add-host=host.docker.internal:host-gateway` to the prometheus service in docker-compose.yml +- **Mac/Windows**: Should work by default + +### No Data in Grafana + +1. Check Prometheus is scraping: http://localhost:9090/targets +2. Verify metrics are exposed: http://localhost:9464/metrics +3. Ensure your Vision Agents app is running with telemetry enabled + +### Jaeger Shows No Traces + +1. Verify OTLP receiver is running: `docker logs vision-agents-jaeger` +2. Check your app's trace exporter configuration +3. Ensure `endpoint="localhost:4317"` in your app + +## Stopping the Stack + +```bash +docker-compose down +``` + +To remove all data (metrics, dashboards, etc.): + +```bash +docker-compose down -v +``` + +## Production Considerations + +This setup is designed for development. For production: + +1. **Security**: + - Change default Grafana password + - Add authentication to Prometheus + - Use TLS for all connections + +2. **Persistence**: + - Configure external volumes for data persistence + - Set up regular backups + +3. **Scalability**: + - Use Prometheus remote write for long-term storage + - Consider Jaeger production deployment with Elasticsearch/Cassandra + - Deploy Grafana with a proper database backend + +4. **Monitoring**: + - Set up alerts in Prometheus/Grafana + - Configure notification channels (Slack, PagerDuty, etc.) 
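+
+## Example: Application-Side Telemetry Setup
+
+The stack above assumes the application exports OTLP traces to `localhost:4317` and serves Prometheus metrics on port `9464`. The sketch below shows one way to wire that up with the OpenTelemetry SDK; it mirrors what `setup_telemetry()` in the simple agent example is described to do, but the helper body here is illustrative rather than copied from the repository.
+
+```python
+from opentelemetry import metrics, trace
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.exporter.prometheus import PrometheusMetricReader
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from prometheus_client import start_http_server
+
+
+def setup_telemetry(service_name: str = "agents") -> None:
+    resource = Resource.create({"service.name": service_name})
+
+    # Traces -> Jaeger via OTLP gRPC (collector listens on port 4317).
+    tracer_provider = TracerProvider(resource=resource)
+    tracer_provider.add_span_processor(
+        BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True))
+    )
+    trace.set_tracer_provider(tracer_provider)
+
+    # Metrics -> Prometheus scrape endpoint on port 9464 (matches prometheus.yml).
+    start_http_server(port=9464)
+    metrics.set_meter_provider(
+        MeterProvider(resource=resource, metric_readers=[PrometheusMetricReader()])
+    )
+```
+
+Configure these providers before importing vision-agents components; if no providers are set, the library's meters and tracers fall back to no-ops.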
diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json new file mode 100644 index 00000000..3cc9e1d2 --- /dev/null +++ b/observability/grafana/dashboards/vision-agents.json @@ -0,0 +1,557 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p50 - {{stt_class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p95 - {{stt_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p99 - {{stt_class}}", + "refId": "C" + } + ], + "title": "STT Latency (by implementation)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(stt_errors_total[5m])", + "legendFormat": "{{provider}} - {{error_type}}", + "refId": "A" + } + ], + "title": "STT Errors Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p50 - {{tts_class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p95 - {{tts_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "legendFormat": "p99 - {{tts_class}}", + "refId": "C" + } + ], + "title": "TTS Latency (by implementation)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 
12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(tts_errors_total[5m])", + "legendFormat": "{{provider}} - {{error_type}}", + "refId": "A" + } + ], + "title": "TTS Errors Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p50 - {{class}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p95 - {{class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", + "legendFormat": "p99 - {{class}}", + "refId": "C" + } + ], + "title": "Turn Detection Latency (by implementation)", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["vision-agents", "observability"], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Vision Agents - Performance Metrics", + "uid": "vision-agents-metrics", + "version": 0, + "weekStart": "" +} diff --git a/observability/grafana/init-home-dashboard.sh b/observability/grafana/init-home-dashboard.sh new file mode 100755 index 00000000..cad54dda --- /dev/null +++ b/observability/grafana/init-home-dashboard.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Wait for Grafana to be ready +echo "Waiting for Grafana to be ready..." +until curl -s http://grafana:3000/api/health > /dev/null 2>&1; do + echo "Grafana not ready yet, waiting..." + sleep 2 +done + +echo "Grafana is ready!" 
+sleep 5 # Give it a bit more time for provisioning to complete + +# Get the dashboard UID +DASHBOARD_UID="vision-agents-metrics" + +# Set the home dashboard for the organization +echo "Setting org home dashboard to Vision Agents - Performance Metrics..." +curl -X PUT \ + -H "Content-Type: application/json" \ + -d "{\"homeDashboardUID\":\"${DASHBOARD_UID}\"}" \ + http://grafana:3000/api/org/preferences + +# Also set it as the default home dashboard for admin user (for when they log in) +echo "" +echo "Setting admin user home dashboard..." +curl -X PUT \ + -u "admin:admin" \ + -H "Content-Type: application/json" \ + -d "{\"homeDashboardUID\":\"${DASHBOARD_UID}\"}" \ + http://grafana:3000/api/user/preferences + +echo "" +echo "Home dashboard configured successfully!" diff --git a/observability/grafana/provisioning/dashboards/default.yml b/observability/grafana/provisioning/dashboards/default.yml new file mode 100644 index 00000000..ed949c18 --- /dev/null +++ b/observability/grafana/provisioning/dashboards/default.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Vision Agents' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true diff --git a/observability/grafana/provisioning/datasources/prometheus.yml b/observability/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..cfd90598 --- /dev/null +++ b/observability/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + uid: prometheus + isDefault: true + editable: true + jsonData: + timeInterval: 5s diff --git a/observability/prometheus/prometheus.yml b/observability/prometheus/prometheus.yml new file mode 100644 index 00000000..83a2693a --- /dev/null +++ b/observability/prometheus/prometheus.yml @@ -0,0 +1,21 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'vision-agents-monitor' + +scrape_configs: + # Scrape metrics from Vision Agents application + - job_name: 'vision-agents' + static_configs: + - targets: ['host.docker.internal:9464'] + labels: + service: 'vision-agents' + environment: 'development' + scrape_interval: 5s + scrape_timeout: 5s + + # Scrape Prometheus self-metrics + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py b/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py index 0c598f34..b5da2b80 100644 --- a/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py +++ b/plugins/deepgram/vision_agents/plugins/deepgram/deepgram_stt.py @@ -75,7 +75,7 @@ def __init__( self._connection_context: Optional[Any] = None self._listen_task: Optional[asyncio.Task[Any]] = None - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional[Participant] = None, @@ -127,20 +127,19 @@ async def start(self): "encoding": "linear16", "sample_rate": "16000", } - + # Add optional parameters if specified if self.eot_threshold is not None: connect_params["eot_threshold"] = str(self.eot_threshold) if self.eager_eot_threshold is not None: connect_params["eager_eot_threshold"] = str(self.eager_eot_threshold) - + # Connect to Deepgram v2 listen WebSocket with timeout self._connection_context = self.client.listen.v2.connect(**connect_params) - + # Add 
timeout for connection establishment self.connection = await asyncio.wait_for( - self._connection_context.__aenter__(), - timeout=10.0 + self._connection_context.__aenter__(), timeout=10.0 ) # Register event handlers @@ -149,7 +148,7 @@ async def start(self): self.connection.on(EventType.MESSAGE, self._on_message) self.connection.on(EventType.ERROR, self._on_error) self.connection.on(EventType.CLOSE, self._on_close) - + # Start listening for events self._listen_task = asyncio.create_task(self.connection.start_listening()) @@ -159,7 +158,7 @@ async def start(self): def _on_message(self, message): """ Event handler for messages from Deepgram. - + Args: message: The message object from Deepgram """ @@ -189,7 +188,9 @@ def _on_message(self, message): words = getattr(message, "words", []) if words: confidences = [w.confidence for w in words if hasattr(w, "confidence")] - avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0 + avg_confidence = ( + sum(confidences) / len(confidences) if confidences else 0.0 + ) else: avg_confidence = 0.0 @@ -207,7 +208,7 @@ def _on_message(self, message): "end_of_turn_confidence": end_of_turn_confidence, "turn_index": getattr(message, "turn_index", None), "event": event, - } + }, ) # Use the participant from the most recent process_audio call @@ -234,7 +235,7 @@ def _on_open(self, message): def _on_error(self, error): """ Event handler for errors from Deepgram. - + Args: error: The error from Deepgram """ diff --git a/plugins/fish/vision_agents/plugins/fish/stt.py b/plugins/fish/vision_agents/plugins/fish/stt.py index 7f3ae589..712b6ed0 100644 --- a/plugins/fish/vision_agents/plugins/fish/stt.py +++ b/plugins/fish/vision_agents/plugins/fish/stt.py @@ -49,7 +49,7 @@ def __init__( self.language = language - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional[Participant] = None, @@ -125,7 +125,9 @@ async def process_audio( ) if participant is not None: - self._emit_transcript_event(transcript_text, participant, response_metadata) + self._emit_transcript_event( + transcript_text, participant, response_metadata + ) except Exception as e: logger.error( diff --git a/plugins/wizper/vision_agents/plugins/wizper/stt.py b/plugins/wizper/vision_agents/plugins/wizper/stt.py index a0bd7c2f..50d5d65b 100644 --- a/plugins/wizper/vision_agents/plugins/wizper/stt.py +++ b/plugins/wizper/vision_agents/plugins/wizper/stt.py @@ -57,7 +57,7 @@ def __init__( self.target_language = target_language self._fal_client = client if client is not None else fal_client.AsyncClient() - async def process_audio( + async def _process_audio( self, pcm_data: PcmData, participant: Optional["Participant"] = None, From a8b0271392fb9b374fb62cbf2b4e2d065a151eea Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 16:29:53 +0100 Subject: [PATCH 03/11] llm metrics --- agents-core/vision_agents/core/llm/llm.py | 225 ++++++++----- .../core/observability/metrics.py | 14 + .../core/turn_detection/turn_detection.py | 18 +- observability/README.md | 10 +- .../grafana/dashboards/vision-agents.json | 317 ++++++++++++++++-- .../plugins/anthropic/anthropic_llm.py | 247 +++++++++----- .../aws/vision_agents/plugins/aws/aws_llm.py | 299 ++++++++++------- .../plugins/gemini/gemini_llm.py | 227 ++++++++----- .../plugins/openai/openai_llm.py | 3 +- .../plugins/openrouter/openrouter_llm.py | 7 +- .../smart_turn/smart_turn_detection.py | 24 +- .../plugins/vogent/vogent_turn_detection.py | 23 +- 
plugins/xai/vision_agents/plugins/xai/llm.py | 56 ++-- 13 files changed, 1000 insertions(+), 470 deletions(-) diff --git a/agents-core/vision_agents/core/llm/llm.py b/agents-core/vision_agents/core/llm/llm.py index a699218b..1b42cd50 100644 --- a/agents-core/vision_agents/core/llm/llm.py +++ b/agents-core/vision_agents/core/llm/llm.py @@ -3,7 +3,17 @@ import abc import asyncio import json -from typing import Optional, TYPE_CHECKING, Tuple, List, Dict, Any, TypeVar, Callable, Generic +from typing import ( + Optional, + TYPE_CHECKING, + Tuple, + List, + Dict, + Any, + TypeVar, + Callable, + Generic, +) from vision_agents.core.llm import events from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent @@ -56,6 +66,35 @@ async def simple_response( processors: Optional[List[Processor]] = None, participant: Optional[Participant] = None, ) -> LLMResponseEvent[Any]: + """ + Wrapper method that tracks metrics and delegates to _simple_response. + """ + from vision_agents.core.observability.metrics import Timer, llm_latency_ms + + with Timer(llm_latency_ms) as timer: + timer.attributes["llm_class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = getattr(self, "provider_name", "unknown") + + try: + result = await self._simple_response(text, processors, participant) + return result + except Exception as e: + timer.attributes["error"] = type(e).__name__ + raise + + @abc.abstractmethod + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ) -> LLMResponseEvent[Any]: + """ + Implementation-specific response generation. + Subclasses must implement this method. + """ raise NotImplementedError def _build_enhanced_instructions(self) -> Optional[str]: @@ -65,7 +104,7 @@ def _build_enhanced_instructions(self) -> Optional[str]: Returns: Enhanced instructions string with markdown file contents included, or None if no parsed instructions """ - if not hasattr(self, 'parsed_instructions') or not self.parsed_instructions: + if not hasattr(self, "parsed_instructions") or not self.parsed_instructions: return None parsed = self.parsed_instructions @@ -80,7 +119,9 @@ def _build_enhanced_instructions(self) -> Optional[str]: enhanced_instructions.append(content) else: enhanced_instructions.append(f"\n### {filename}") - enhanced_instructions.append("*(File not found or could not be read)*") + enhanced_instructions.append( + "*(File not found or could not be read)*" + ) return "\n".join(enhanced_instructions) @@ -88,64 +129,72 @@ def _get_tools_for_provider(self) -> List[Dict[str, Any]]: """ Get tools in provider-specific format. This method should be overridden by each LLM implementation. - + Returns: List of tools in the provider's expected format. """ tools = self.get_available_functions() return self._convert_tools_to_provider_format(tools) - - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to provider-specific format. This method should be overridden by each LLM implementation. 
- + Args: tools: List of ToolSchema objects - + Returns: List of tools in provider-specific format """ # Default implementation - should be overridden return [] - - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from provider-specific response. This method should be overridden by each LLM implementation. - + Args: response: Provider-specific response object - + Returns: List of normalized tool call items """ # Default implementation - should be overridden return [] - - def _extract_tool_calls_from_stream_chunk(self, chunk: Any) -> List[NormalizedToolCallItem]: + + def _extract_tool_calls_from_stream_chunk( + self, chunk: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from a streaming chunk. This method should be overridden by each LLM implementation. - + Args: chunk: Provider-specific streaming chunk - + Returns: List of normalized tool call items """ # Default implementation - should be overridden return [] - - def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]) -> List[Dict[str, Any]]: + + def _create_tool_result_message( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ) -> List[Dict[str, Any]]: """ Create tool result messages for the provider. This method should be overridden by each LLM implementation. - + Args: tool_calls: List of tool calls that were executed results: List of results from function execution - + Returns: List of tool result messages in provider format """ @@ -160,67 +209,67 @@ def _attach_agent(self, agent: Agent): self._conversation = agent.conversation self._set_instructions(agent.instructions) - def _set_instructions(self, instructions: str): self.instructions = instructions # Parse instructions to extract @ mentioned markdown files self.parsed_instructions = parse_instructions(instructions) - def register_function(self, - name: Optional[str] = None, - description: Optional[str] = None) -> Callable: + def register_function( + self, name: Optional[str] = None, description: Optional[str] = None + ) -> Callable: """ Decorator to register a function with the LLM's function registry. - + Args: name: Optional custom name for the function. If not provided, uses the function name. description: Optional description for the function. If not provided, uses the docstring. - + Returns: Decorator function. """ return self.function_registry.register(name, description) - + def get_available_functions(self) -> List[ToolSchema]: """Get a list of available function schemas.""" return self.function_registry.get_tool_schemas() - + def call_function(self, name: str, arguments: Dict[str, Any]) -> Any: """ Call a registered function with the given arguments. - + Args: name: Name of the function to call. arguments: Dictionary of arguments to pass to the function. - + Returns: Result of the function call. """ return self.function_registry.call_function(name, arguments) - def _tc_key(self, tc: Dict[str, Any]) -> Tuple[Optional[str], str, str]: """Generate a unique key for tool call deduplication. 
- + Args: tc: Tool call dictionary - + Returns: Tuple of (id, name, arguments_json) for deduplication """ return ( - tc.get("id"), - tc["name"], - json.dumps(tc.get("arguments_json", tc.get("arguments", {})), sort_keys=True) + tc.get("id"), + tc["name"], + json.dumps( + tc.get("arguments_json", tc.get("arguments", {})), sort_keys=True + ), ) async def _maybe_await(self, x): """Await if x is a coroutine, otherwise return x directly. - + Args: x: Value that might be a coroutine - + Returns: Awaited result if coroutine, otherwise x """ @@ -230,23 +279,23 @@ async def _maybe_await(self, x): async def _run_one_tool(self, tc: Dict[str, Any], timeout_s: float): """Run a single tool call with timeout. - + Args: tc: Tool call dictionary timeout_s: Timeout in seconds - + Returns: Tuple of (tool_call, result, error) """ import inspect import time - + args = tc.get("arguments_json", tc.get("arguments", {})) or {} start_time = time.time() - + async def _invoke(): # Get the actual function to check if it's async - if hasattr(self.function_registry, 'get_callable'): + if hasattr(self.function_registry, "get_callable"): fn = self.function_registry.get_callable(tc["name"]) if inspect.iscoroutinefunction(fn): return await fn(**args) @@ -257,62 +306,74 @@ async def _invoke(): # Fallback to existing call_function method res = self.call_function(tc["name"], args) return await self._maybe_await(res) - + try: # Send tool start event - self.events.send(ToolStartEvent( - plugin_name="llm", - tool_name=tc["name"], - arguments=args, - tool_call_id=tc.get("id") - )) - + self.events.send( + ToolStartEvent( + plugin_name="llm", + tool_name=tc["name"], + arguments=args, + tool_call_id=tc.get("id"), + ) + ) + res = await asyncio.wait_for(_invoke(), timeout=timeout_s) execution_time = (time.time() - start_time) * 1000 - + # Send tool end event (success) - self.events.send(ToolEndEvent( - plugin_name="llm", - tool_name=tc["name"], - success=True, - result=res, - tool_call_id=tc.get("id"), - execution_time_ms=execution_time - )) - + self.events.send( + ToolEndEvent( + plugin_name="llm", + tool_name=tc["name"], + success=True, + result=res, + tool_call_id=tc.get("id"), + execution_time_ms=execution_time, + ) + ) + return tc, res, None except Exception as e: execution_time = (time.time() - start_time) * 1000 - + # Send tool end event (error) - self.events.send(ToolEndEvent( - plugin_name="llm", - tool_name=tc["name"], - success=False, - error=str(e), - tool_call_id=tc.get("id"), - execution_time_ms=execution_time - )) - + self.events.send( + ToolEndEvent( + plugin_name="llm", + tool_name=tc["name"], + success=False, + error=str(e), + tool_call_id=tc.get("id"), + execution_time_ms=execution_time, + ) + ) + return tc, {"error": str(e)}, e - async def _execute_tools(self, calls: List[Dict[str, Any]], *, max_concurrency: int = 8, timeout_s: float = 30): + async def _execute_tools( + self, + calls: List[Dict[str, Any]], + *, + max_concurrency: int = 8, + timeout_s: float = 30, + ): """Execute multiple tool calls concurrently with timeout. 
- + Args: calls: List of tool call dictionaries max_concurrency: Maximum number of concurrent tool executions timeout_s: Timeout per tool execution in seconds - + Returns: List of tuples (tool_call, result, error) """ sem = asyncio.Semaphore(max_concurrency) - + async def _guarded(tc): async with sem: return await self._run_one_tool(tc, timeout_s) - + return await asyncio.gather(*[_guarded(tc) for tc in calls]) async def _dedup_and_execute( @@ -324,13 +385,13 @@ async def _dedup_and_execute( seen: Optional[set] = None, ): """De-duplicate (by id/name/args) then execute concurrently. - + Args: calls: List of tool call dictionaries max_concurrency: Maximum number of concurrent tool executions timeout_s: Timeout per tool execution in seconds seen: Set of seen tool call keys for deduplication - + Returns: Tuple of (triples, updated_seen_set) """ @@ -346,16 +407,18 @@ async def _dedup_and_execute( if not to_run: return [], seen # nothing new - triples = await self._execute_tools(to_run, max_concurrency=max_concurrency, timeout_s=timeout_s) + triples = await self._execute_tools( + to_run, max_concurrency=max_concurrency, timeout_s=timeout_s + ) return triples, seen def _sanitize_tool_output(self, value: Any, max_chars: int = 60_000) -> str: """Sanitize tool output to prevent oversized responses. - + Args: value: Tool output value max_chars: Maximum characters allowed - + Returns: Sanitized string output """ diff --git a/agents-core/vision_agents/core/observability/metrics.py b/agents-core/vision_agents/core/observability/metrics.py index c9aad222..b39ccd2a 100644 --- a/agents-core/vision_agents/core/observability/metrics.py +++ b/agents-core/vision_agents/core/observability/metrics.py @@ -43,6 +43,20 @@ "turn.detection.latency.ms", unit="ms", ) +turn_vad_latency_ms = meter.create_histogram( + "turn.vad.latency.ms", unit="ms", description="Turn detection VAD latency" +) +turn_end_detection_latency_ms = meter.create_histogram( + "turn.end_detection.latency.ms", + unit="ms", + description="Turn end detection latency (Vogent/Smart Turn model)", +) +turn_errors = meter.create_counter("turn.errors", description="Turn detection errors") + +llm_latency_ms = meter.create_histogram( + "llm.latency.ms", unit="ms", description="Total LLM latency" +) +llm_errors = meter.create_counter("llm.errors", description="LLM errors") class Timer: diff --git a/agents-core/vision_agents/core/turn_detection/turn_detection.py b/agents-core/vision_agents/core/turn_detection/turn_detection.py index 29105c99..faf14979 100644 --- a/agents-core/vision_agents/core/turn_detection/turn_detection.py +++ b/agents-core/vision_agents/core/turn_detection/turn_detection.py @@ -8,7 +8,7 @@ from .events import TurnStartedEvent, TurnEndedEvent from ..agents.conversation import Conversation from ..edge.types import Participant -from ..observability.metrics import turn_detection_latency_ms, Timer +from ..observability.metrics import turn_detection_latency_ms, turn_errors, Timer class TurnEvent(Enum): @@ -41,7 +41,6 @@ def _emit_end_turn_event(self, event: TurnEndedEvent) -> None: event.plugin_name = self.provider_name self.events.send(event) - @Timer(turn_detection_latency_ms) async def process_audio( self, audio_data: PcmData, @@ -55,8 +54,19 @@ async def process_audio( participant: Participant that's speaking, includes user data conversation: Transcription/ chat history, sometimes useful for turn detection """ - - return await self.detect_turn(audio_data, participant, conversation) + with Timer(turn_detection_latency_ms) as timer: + 
timer.attributes["class"] = ( + f"{self.__class__.__module__}.{self.__class__.__qualname__}" + ) + timer.attributes["provider"] = self.provider_name + try: + await self.detect_turn(audio_data, participant, conversation) + except Exception as e: + timer.attributes["error"] = type(e).__name__ + turn_errors.add( + 1, {"provider": self.provider_name, "error_type": type(e).__name__} + ) + raise @abstractmethod async def detect_turn( diff --git a/observability/README.md b/observability/README.md index 56414221..bcd44481 100644 --- a/observability/README.md +++ b/observability/README.md @@ -40,11 +40,11 @@ uv run python simple_agent_example.py 3. Navigate to **Dashboards** → **Vision Agents - Performance Metrics** The dashboard automatically displays: +- **LLM Latency** (p50, p95, p99) by implementation - **STT Latency** (p50, p95, p99) by implementation -- **STT Errors** rate by provider and error type - **TTS Latency** (p50, p95, p99) by implementation -- **TTS Errors** rate by provider and error type - **Turn Detection Latency** (p50, p95, p99) by implementation +- **All Errors Rate** - Combined view of LLM, STT, and TTS errors by provider and error type ### 4. View Traces in Jaeger @@ -92,6 +92,12 @@ Jaeger UI (port 16686) - `turn_detection_latency_ms` - Histogram of turn detection latency - Labels: `class` +### LLM Metrics +- `llm_latency_ms` - Histogram of LLM response latency + - Labels: `llm_class`, `provider` +- `llm_errors` - Counter of LLM errors + - Labels: `provider`, `error` + ## Configuration ### Prometheus diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json index 3cc9e1d2..05da23ef 100644 --- a/observability/grafana/dashboards/vision-agents.json +++ b/observability/grafana/dashboards/vision-agents.json @@ -106,8 +106,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p50 - {{stt_class}}", + "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p50 - {{llm_class}}", "refId": "A" }, { @@ -115,8 +115,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p95 - {{stt_class}}", + "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p95 - {{llm_class}}", "refId": "B" }, { @@ -124,12 +124,12 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", - "legendFormat": "p99 - {{stt_class}}", + "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "legendFormat": "p99 - {{llm_class}}", "refId": "C" } ], - "title": "STT Latency (by implementation)", + "title": "LLM Latency", "type": "timeseries" }, { @@ -186,7 +186,7 @@ } ] }, - "unit": "short" + "unit": "ms" }, "overrides": [] }, @@ -199,7 +199,7 @@ "id": 2, "options": { "legend": { - "calcs": ["sum"], + "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -216,12 +216,30 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(stt_errors_total[5m])", - "legendFormat": "{{provider}} - {{error_type}}", + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, 
stt_class))", + "legendFormat": "p50 - {{stt_class}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p95 - {{stt_class}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "legendFormat": "p99 - {{stt_class}}", + "refId": "C" } ], - "title": "STT Errors Rate", + "title": "STT Latency", "type": "timeseries" }, { @@ -331,7 +349,7 @@ "refId": "C" } ], - "title": "TTS Latency (by implementation)", + "title": "TTS Latency", "type": "timeseries" }, { @@ -388,7 +406,7 @@ } ] }, - "unit": "short" + "unit": "ms" }, "overrides": [] }, @@ -401,7 +419,7 @@ "id": 4, "options": { "legend": { - "calcs": ["sum"], + "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true @@ -418,12 +436,30 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "rate(tts_errors_total[5m])", - "legendFormat": "{{provider}} - {{error_type}}", + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p50 - {{provider}}", "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p95 - {{provider}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "legendFormat": "p99 - {{provider}}", + "refId": "C" } ], - "title": "TTS Errors Rate", + "title": "Turn Detection Latency", "type": "timeseries" }, { @@ -486,7 +522,7 @@ }, "gridPos": { "h": 8, - "w": 24, + "w": 8, "x": 0, "y": 16 }, @@ -510,8 +546,118 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p50 - {{class}}", + "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p50 - {{implementation}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p95 - {{implementation}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p99 - {{implementation}}", + "refId": "C" + } + ], + "title": "Turn VAD Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p50 - {{implementation}}", "refId": "A" }, { @@ -519,8 +665,8 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p95 - {{class}}", + "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p95 - {{implementation}}", "refId": "B" }, { @@ -528,12 +674,131 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, class))", - "legendFormat": "p99 - {{class}}", + "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "legendFormat": "p99 - {{implementation}}", "refId": "C" } ], - "title": "Turn Detection Latency (by implementation)", + "title": "Turn End Detection Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(llm_errors_total[5m])", + "legendFormat": "LLM - {{provider}} - {{error}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(stt_errors_total[5m])", + "legendFormat": "STT - {{provider}} - {{error_type}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "expr": "rate(tts_errors_total[5m])", + "legendFormat": "TTS - {{provider}} - {{error_type}}", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(turn_errors_total[5m])", + "legendFormat": "TURN - {{provider}} - {{error_type}}", + "refId": "D" + } + ], + "title": "All Errors Rate", "type": "timeseries" } ], diff --git a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py index 30691576..1364d5d6 100644 --- a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py +++ b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py @@ -14,7 +14,10 @@ from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from vision_agents.core.processors import Processor from . import events @@ -59,14 +62,17 @@ def __init__( super().__init__() self.events.register_events_from_module(events) self.model = model - self._pending_tool_uses_by_index: Dict[int, Dict[str, Any]] = {} # index -> {id, name, parts: []} + self._pending_tool_uses_by_index: Dict[ + int, Dict[str, Any] + ] = {} # index -> {id, name, parts: []} + self.provider_name = "anthropic" if client is not None: self.client = client else: self.client = anthropic.AsyncAnthropic(api_key=api_key) - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -107,7 +113,7 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # ensure the AI remembers the past conversation new_messages = kwargs["messages"] - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages # Add messages to conversation @@ -122,7 +128,7 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Extract text from Claude's response format - safely handle all text blocks text = self._concat_text_blocks(original.content) llm_response = LLMResponseEvent(original, text) - + # Multi-hop tool calling loop for non-streaming function_calls = self._extract_tool_calls_from_response(original) if function_calls: @@ -131,39 +137,53 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: rounds = 0 seen: set[tuple[str, str, str]] = set() current_calls = function_calls - + while current_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(current_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool_result user message assistant_content = [] tool_result_blocks = [] for tc, res, err in triples: - assistant_content.append({ - "type": "tool_use", - "id": tc["id"], - "name": tc["name"], - "input": tc["arguments_json"], - }) - + assistant_content.append( + { + "type": "tool_use", + "id": tc["id"], + "name": tc["name"], + "input": tc["arguments_json"], + } + ) + payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "type": 
"tool_result", - "tool_use_id": tc["id"], - "content": payload, - }) + tool_result_blocks.append( + { + "type": "tool_result", + "tool_use_id": tc["id"], + "content": payload, + } + ) assistant_msg = {"role": "assistant", "content": assistant_content} - user_tool_results_msg = {"role": "user", "content": tool_result_blocks} + user_tool_results_msg = { + "role": "user", + "content": tool_result_blocks, + } messages = messages + [assistant_msg, user_tool_results_msg] # Ask again WITH tools so Claude can do another hop tools_cfg = { - "tools": self._convert_tools_to_provider_format(self.get_available_functions()), + "tools": self._convert_tools_to_provider_format( + self.get_available_functions() + ), "tool_choice": {"type": "auto"}, "stream": False, "model": self.model, @@ -172,22 +192,29 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: } follow_up_response = await self.client.messages.create(**tools_cfg) - + # Extract new tool calls from follow-up response - current_calls = self._extract_tool_calls_from_response(follow_up_response) - llm_response = LLMResponseEvent(follow_up_response, self._concat_text_blocks(follow_up_response.content)) + current_calls = self._extract_tool_calls_from_response( + follow_up_response + ) + llm_response = LLMResponseEvent( + follow_up_response, + self._concat_text_blocks(follow_up_response.content), + ) rounds += 1 - + # Finalization pass: no tools so Claude must answer in text if current_calls or rounds > 0: # Only if we had tool calls final_response = await self.client.messages.create( model=self.model, - messages=messages, # includes assistant tool_use + user tool_result blocks + messages=messages, # includes assistant tool_use + user tool_result blocks stream=False, - max_tokens=1000 + max_tokens=1000, + ) + llm_response = LLMResponseEvent( + final_response, self._concat_text_blocks(final_response.content) ) - llm_response = LLMResponseEvent(final_response, self._concat_text_blocks(final_response.content)) - + elif isinstance(original, AsyncStream): stream: AsyncStream[RawMessageStreamEvent] = original text_parts: List[str] = [] @@ -195,7 +222,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # 1) First round: read stream, gather initial tool_use calls async for event in stream: - llm_response_optional = self._standardize_and_emit_event(event, text_parts) + llm_response_optional = self._standardize_and_emit_event( + event, text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional # Collect tool_use calls as they complete (your helper already reconstructs args) @@ -213,7 +242,12 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: last_followup_stream = None while accumulated_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(accumulated_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] + triples, seen = await self._dedup_and_execute( + accumulated_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) # Build tool_result user message # Also reconstruct the assistant tool_use message that triggered these calls @@ -221,22 +255,26 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: executed_calls: List[NormalizedToolCallItem] = [] for tc, res, err in triples: executed_calls.append(tc) - assistant_content.append({ - "type": "tool_use", - "id": tc["id"], - "name": tc["name"], - "input": 
tc["arguments_json"], - }) + assistant_content.append( + { + "type": "tool_use", + "id": tc["id"], + "name": tc["name"], + "input": tc["arguments_json"], + } + ) # tool_result blocks (sanitize to keep payloads safe) tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "type": "tool_result", - "tool_use_id": tc["id"], - "content": payload, - }) + tool_result_blocks.append( + { + "type": "tool_result", + "tool_use_id": tc["id"], + "content": payload, + } + ) assistant_msg = {"role": "assistant", "content": assistant_content} user_tool_results_msg = {"role": "user", "content": tool_result_blocks} @@ -244,7 +282,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ask again WITH tools so Claude can do another hop tools_cfg = { - "tools": self._convert_tools_to_provider_format(self.get_available_functions()), + "tools": self._convert_tools_to_provider_format( + self.get_available_functions() + ), "tool_choice": {"type": "auto"}, "stream": True, "model": self.model, @@ -259,7 +299,9 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: accumulated_calls = [] # reset; we'll refill with new calls async for ev in follow_up_stream: last_followup_stream = ev - llm_response_optional = self._standardize_and_emit_event(ev, follow_up_text_parts) + llm_response_optional = self._standardize_and_emit_event( + ev, follow_up_text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional new_calls, _ = self._extract_tool_calls_from_stream_chunk(ev, None) @@ -276,14 +318,16 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: if accumulated_calls or rounds > 0: # Only if we had tool calls final_stream = await self.client.messages.create( model=self.model, - messages=messages, # includes assistant tool_use + user tool_result blocks + messages=messages, # includes assistant tool_use + user tool_result blocks stream=True, - max_tokens=1000 + max_tokens=1000, ) final_text_parts: List[str] = [] async for ev in final_stream: last_followup_stream = ev - llm_response_optional = self._standardize_and_emit_event(ev, final_text_parts) + llm_response_optional = self._standardize_and_emit_event( + ev, final_text_parts + ) if llm_response_optional is not None: llm_response = llm_response_optional if final_text_parts: @@ -291,8 +335,17 @@ async def create_message(self, *args, **kwargs) -> LLMResponseEvent[Any]: # 4) Done -> return all collected text total_text = "".join(text_parts) - llm_response = LLMResponseEvent(last_followup_stream or original, total_text) # type: ignore - self.events.send(LLMResponseCompletedEvent(original=last_followup_stream or original, text=total_text, plugin_name="anthropic")) + llm_response = LLMResponseEvent( + last_followup_stream or original, # type: ignore[arg-type] + total_text, + ) + self.events.send( + LLMResponseCompletedEvent( + original=last_followup_stream or original, + text=total_text, + plugin_name="anthropic", + ) + ) return llm_response @@ -303,10 +356,9 @@ def _standardize_and_emit_event( Forwards the events and also send out a standardized version (the agent class hooks into that) """ # forward the native event - self.events.send(events.ClaudeStreamEvent( - plugin_name="anthropic", - event_data=event - )) + self.events.send( + events.ClaudeStreamEvent(plugin_name="anthropic", event_data=event) + ) # send a standardized version for delta and response if event.type == "content_block_delta": @@ -314,14 +366,16 
@@ def _standardize_and_emit_event( if hasattr(delta_event.delta, "text") and delta_event.delta.text: text_parts.append(delta_event.delta.text) - self.events.send(LLMResponseChunkEvent( - plugin_name="antrhopic", - content_index=delta_event.index, - item_id="", - output_index=0, - sequence_number=0, - delta=delta_event.delta.text, - )) + self.events.send( + LLMResponseChunkEvent( + plugin_name="antrhopic", + content_index=delta_event.index, + item_id="", + output_index=0, + sequence_number=0, + delta=delta_event.delta.text, + ) + ) elif event.type == "message_stop": stop_event: RawMessageStopEvent = event total_text = "".join(text_parts) @@ -354,13 +408,15 @@ def _normalize_message(claude_messages: Any) -> List["Message"]: return messages - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to Anthropic format. - + Args: tools: List of ToolSchema objects - + Returns: List of tools in Anthropic format """ @@ -369,37 +425,42 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic anthropic_tool = { "name": tool["name"], "description": tool.get("description", ""), - "input_schema": tool["parameters_schema"] + "input_schema": tool["parameters_schema"], } anthropic_tools.append(anthropic_tool) return anthropic_tools - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Anthropic response. - + Args: response: Anthropic response object - + Returns: List of normalized tool call items """ tool_calls = [] - - if hasattr(response, 'content') and response.content: + + if hasattr(response, "content") and response.content: for content_block in response.content: - if hasattr(content_block, 'type') and content_block.type == "tool_use": + if hasattr(content_block, "type") and content_block.type == "tool_use": tool_call: NormalizedToolCallItem = { "type": "tool_call", "id": content_block.id, # Critical: capture the id for tool_result "name": content_block.name, - "arguments_json": content_block.input or {} # normalize to arguments_json + "arguments_json": content_block.input + or {}, # normalize to arguments_json } tool_calls.append(tool_call) - + return tool_calls - def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: Optional[NormalizedToolCallItem] = None) -> tuple[List[NormalizedToolCallItem], Optional[NormalizedToolCallItem]]: # type: ignore[override] + def _extract_tool_calls_from_stream_chunk( # type: ignore[override] + self, chunk: Any, current_tool_call: Optional[NormalizedToolCallItem] = None + ) -> tuple[List[NormalizedToolCallItem], Optional[NormalizedToolCallItem]]: """ Extract tool calls from Anthropic streaming chunk using index-keyed accumulation. 
Args: @@ -409,22 +470,22 @@ def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: O Tuple of (completed tool calls, current tool call being accumulated) """ tool_calls = [] - t = getattr(chunk, 'type', None) + t = getattr(chunk, "type", None) if t == "content_block_start": - cb = getattr(chunk, 'content_block', None) - if getattr(cb, 'type', None) == "tool_use": + cb = getattr(chunk, "content_block", None) + if getattr(cb, "type", None) == "tool_use": if cb is not None: self._pending_tool_uses_by_index[chunk.index] = { "id": cb.id, "name": cb.name, - "parts": [] + "parts": [], } elif t == "content_block_delta": - d = getattr(chunk, 'delta', None) - if getattr(d, 'type', None) == "input_json_delta": - pj = getattr(d, 'partial_json', None) + d = getattr(chunk, "delta", None) + if getattr(d, "type", None) == "input_json_delta": + pj = getattr(d, "partial_json", None) if pj is not None and chunk.index in self._pending_tool_uses_by_index: self._pending_tool_uses_by_index[chunk.index]["parts"].append(pj) @@ -440,12 +501,14 @@ def _extract_tool_calls_from_stream_chunk(self, chunk: Any, current_tool_call: O "type": "tool_call", "id": pending["id"], "name": pending["name"], - "arguments_json": args + "arguments_json": args, } tool_calls.append(tool_call_item) return tool_calls, None - def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]) -> List[Dict[str, Any]]: + def _create_tool_result_message( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ) -> List[Dict[str, Any]]: """ Create tool result messages for Anthropic. tool_calls: List of tool calls that were executed @@ -461,17 +524,19 @@ def _create_tool_result_message(self, tool_calls: List[NormalizedToolCallItem], payload = str(result) else: payload = json.dumps(result) - blocks.append({ - "type": "tool_result", - "tool_use_id": tool_call["id"], # Critical: must match tool_use.id - "content": payload - }) + blocks.append( + { + "type": "tool_result", + "tool_use_id": tool_call["id"], # Critical: must match tool_use.id + "content": payload, + } + ) return [{"role": "user", "content": blocks}] def _concat_text_blocks(self, content): """Safely extract text from all text blocks in content.""" out = [] for b in content or []: - if getattr(b, 'type', None) == "text" and getattr(b, 'text', None): + if getattr(b, "type", None) == "text" and getattr(b, "text", None): out.append(b.text) return "".join(out) diff --git a/plugins/aws/vision_agents/plugins/aws/aws_llm.py b/plugins/aws/vision_agents/plugins/aws/aws_llm.py index b347720d..0d504eaf 100644 --- a/plugins/aws/vision_agents/plugins/aws/aws_llm.py +++ b/plugins/aws/vision_agents/plugins/aws/aws_llm.py @@ -8,7 +8,10 @@ from vision_agents.core.llm.llm_types import ToolSchema, NormalizedToolCallItem -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from vision_agents.core.processors import Processor from . import events from vision_agents.core.edge.types import Participant @@ -25,9 +28,9 @@ class BedrockLLM(LLM): https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/bedrock-runtime/client/converse.html Chat history has to be manually passed, there is no conversation storage. 
- + Examples: - + from vision_agents.plugins import aws llm = aws.LLM( model="anthropic.claude-3-5-sonnet-20241022-v2:0", @@ -45,7 +48,7 @@ def __init__( ): """ Initialize the BedrockLLM class. - + Args: model: The Bedrock model ID (e.g., "anthropic.claude-3-5-sonnet-20241022-v2:0") region_name: AWS region name (default: "us-east-1") @@ -57,7 +60,8 @@ def __init__( self.events.register_events_from_module(events) self.model = model self._pending_tool_uses_by_index: Dict[int, Dict[str, Any]] = {} - + self.provider_name = "aws" + # Initialize boto3 bedrock-runtime client session_kwargs = {"region_name": region_name} if aws_access_key_id: @@ -69,12 +73,12 @@ def __init__( if os.environ.get("AWS_BEDROCK_API_KEY"): session_kwargs["aws_session_token"] = os.environ["AWS_BEDROCK_API_KEY"] - + self.client = boto3.client("bedrock-runtime", **session_kwargs) self.region_name = region_name - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -82,14 +86,14 @@ async def simple_response( ): """ Simple response is a standardized way to create a response. - + Args: text: The text to respond to processors: list of processors (which contain state) about the video/voice AI participant: optionally the participant object - + Examples: - + await llm.simple_response("say hi to the user") """ return await self.converse_stream( @@ -118,7 +122,7 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ensure the AI remembers the past conversation new_messages = kwargs.get("messages", []) - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages # Add messages to conversation @@ -128,11 +132,11 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: try: response = self.client.converse(**kwargs) - + # Extract text from response text = self._extract_text_from_response(response) llm_response = LLMResponseEvent(response, text) - + # Handle tool calls if present function_calls = self._extract_tool_calls_from_response(response) if function_calls: @@ -141,22 +145,35 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: rounds = 0 seen: set[tuple[str, str, str]] = set() current_calls = function_calls - + while current_calls and rounds < MAX_ROUNDS: # Execute calls concurrently with dedup - triples, seen = await self._dedup_and_execute(current_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool result message tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "toolUseId": tc["id"], - "content": [{"json": payload if isinstance(payload, dict) else {"result": payload}}], - }) + tool_result_blocks.append( + { + "toolUseId": tc["id"], + "content": [ + { + "json": payload + if isinstance(payload, dict) + else {"result": payload} + } + ], + } + ) # Add assistant message with tool use and user message with tool results assistant_msg = { @@ -170,11 +187,11 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: } } for tc, _, _ in triples - ] + ], } user_tool_results_msg = { "role": "user", - "content": [{"toolResult": tr} for tr in 
tool_result_blocks] + "content": [{"toolResult": tr} for tr in tool_result_blocks], } messages = messages + [assistant_msg, user_tool_results_msg] @@ -184,26 +201,37 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]: messages=messages, toolConfig=kwargs.get("toolConfig", {}), ) - + # Extract new tool calls - current_calls = self._extract_tool_calls_from_response(follow_up_response) - llm_response = LLMResponseEvent(follow_up_response, self._extract_text_from_response(follow_up_response)) + current_calls = self._extract_tool_calls_from_response( + follow_up_response + ) + llm_response = LLMResponseEvent( + follow_up_response, + self._extract_text_from_response(follow_up_response), + ) rounds += 1 - + # Final pass without tools if current_calls or rounds > 0: final_response = self.client.converse( modelId=self.model, messages=messages, ) - llm_response = LLMResponseEvent(final_response, self._extract_text_from_response(final_response)) - - self.events.send(LLMResponseCompletedEvent(original=response, text=text, plugin_name="aws")) + llm_response = LLMResponseEvent( + final_response, self._extract_text_from_response(final_response) + ) + + self.events.send( + LLMResponseCompletedEvent( + original=response, text=text, plugin_name="aws" + ) + ) except ClientError as e: error_msg = f"AWS Bedrock API error: {str(e)}" - llm_response = LLMResponseEvent(None, error_msg, exception = e) - + llm_response = LLMResponseEvent(None, error_msg, exception=e) + return llm_response async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: @@ -222,7 +250,7 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: # Ensure the AI remembers the past conversation new_messages = kwargs.get("messages", []) - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["messages"] = old_messages + new_messages normalized_messages = self._normalize_message(new_messages) @@ -236,37 +264,50 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: try: response = self.client.converse_stream(**kwargs) - stream = response.get('stream') - + stream = response.get("stream") + text_parts: List[str] = [] accumulated_calls: List[NormalizedToolCallItem] = [] last_event = None - + # Process stream for event in stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + # Handle multi-hop tool calling messages = kwargs["messages"][:] MAX_ROUNDS = 3 rounds = 0 seen: set[tuple[str, str, str]] = set() - + while accumulated_calls and rounds < MAX_ROUNDS: - triples, seen = await self._dedup_and_execute(accumulated_calls, seen=seen, max_concurrency=8, timeout_s=30) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + accumulated_calls, # type: ignore[arg-type] + seen=seen, + max_concurrency=8, + timeout_s=30, + ) + if not triples: break - + # Build tool result messages tool_result_blocks = [] for tc, res, err in triples: payload = self._sanitize_tool_output(res) - tool_result_blocks.append({ - "toolUseId": tc["id"], - "content": [{"json": payload if isinstance(payload, dict) else {"result": payload}}], - }) + tool_result_blocks.append( + { + "toolUseId": tc["id"], + "content": [ + { + "json": payload + if isinstance(payload, dict) + else {"result": payload} + } + ], + } + ) assistant_msg = { "role": "assistant", @@ -279,11 +320,11 @@ async def converse_stream(self, *args, 
**kwargs) -> LLMResponseEvent[Any]: } } for tc, _, _ in triples - ] + ], } user_tool_results_msg = { "role": "user", - "content": [{"toolResult": tr} for tr in tool_result_blocks] + "content": [{"toolResult": tr} for tr in tool_result_blocks], } messages = messages + [assistant_msg, user_tool_results_msg] @@ -293,85 +334,90 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]: messages=messages, toolConfig=kwargs.get("toolConfig", {}), ) - + accumulated_calls = [] - follow_up_stream = follow_up_response.get('stream') + follow_up_stream = follow_up_response.get("stream") for event in follow_up_stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + rounds += 1 - + # Final pass without tools if accumulated_calls or rounds > 0: final_response = self.client.converse_stream( modelId=self.model, messages=messages, ) - final_stream = final_response.get('stream') + final_stream = final_response.get("stream") for event in final_stream: last_event = event self._process_stream_event(event, text_parts, accumulated_calls) - + total_text = "".join(text_parts) llm_response = LLMResponseEvent(last_event, total_text) - self.events.send(LLMResponseCompletedEvent(original=last_event, text=total_text, plugin_name="aws")) - + self.events.send( + LLMResponseCompletedEvent( + original=last_event, text=total_text, plugin_name="aws" + ) + ) + except ClientError as e: error_msg = f"AWS Bedrock streaming error: {str(e)}" llm_response = LLMResponseEvent(None, error_msg) - + return llm_response def _process_stream_event( - self, - event: Dict[str, Any], + self, + event: Dict[str, Any], text_parts: List[str], - accumulated_calls: List[NormalizedToolCallItem] + accumulated_calls: List[NormalizedToolCallItem], ): """Process a streaming event from AWS.""" # Forward the native event - self.events.send(events.AWSStreamEvent( - plugin_name="aws", - event_data=event - )) - + self.events.send(events.AWSStreamEvent(plugin_name="aws", event_data=event)) + # Handle content block delta (text) - if 'contentBlockDelta' in event: - delta = event['contentBlockDelta']['delta'] - if 'text' in delta: - text_parts.append(delta['text']) - self.events.send(LLMResponseChunkEvent( - plugin_name="aws", - content_index=event['contentBlockDelta'].get('contentBlockIndex', 0), - item_id="", - output_index=0, - sequence_number=0, - delta=delta['text'], - )) - + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"]["delta"] + if "text" in delta: + text_parts.append(delta["text"]) + self.events.send( + LLMResponseChunkEvent( + plugin_name="aws", + content_index=event["contentBlockDelta"].get( + "contentBlockIndex", 0 + ), + item_id="", + output_index=0, + sequence_number=0, + delta=delta["text"], + ) + ) + # Handle tool use - if 'contentBlockStart' in event: - start = event['contentBlockStart'].get('start', {}) - if 'toolUse' in start: - tool_use = start['toolUse'] - idx = event['contentBlockStart'].get('contentBlockIndex', 0) + if "contentBlockStart" in event: + start = event["contentBlockStart"].get("start", {}) + if "toolUse" in start: + tool_use = start["toolUse"] + idx = event["contentBlockStart"].get("contentBlockIndex", 0) self._pending_tool_uses_by_index[idx] = { - "id": tool_use.get('toolUseId', ''), - "name": tool_use.get('name', ''), - "parts": [] + "id": tool_use.get("toolUseId", ""), + "name": tool_use.get("name", ""), + "parts": [], } - - if 'contentBlockDelta' in event: - delta = event['contentBlockDelta']['delta'] - if 'toolUse' in delta: - idx = 
event['contentBlockDelta'].get('contentBlockIndex', 0) + + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"]["delta"] + if "toolUse" in delta: + idx = event["contentBlockDelta"].get("contentBlockIndex", 0) if idx in self._pending_tool_uses_by_index: - input_data = delta['toolUse'].get('input', '') - self._pending_tool_uses_by_index[idx]['parts'].append(input_data) - - if 'contentBlockStop' in event: - idx = event['contentBlockStop'].get('contentBlockIndex', 0) + input_data = delta["toolUse"].get("input", "") + self._pending_tool_uses_by_index[idx]["parts"].append(input_data) + + if "contentBlockStop" in event: + idx = event["contentBlockStop"].get("contentBlockIndex", 0) pending = self._pending_tool_uses_by_index.pop(idx, None) if pending: buf = "".join(pending["parts"]).strip() or "{}" @@ -383,51 +429,55 @@ def _process_stream_event( "type": "tool_call", "id": pending["id"], "name": pending["name"], - "arguments_json": args + "arguments_json": args, } accumulated_calls.append(tool_call_item) def _extract_text_from_response(self, response: Dict[str, Any]) -> str: """Extract text content from AWS response.""" - output = response.get('output', {}) - message = output.get('message', {}) - content = message.get('content', []) - + output = response.get("output", {}) + message = output.get("message", {}) + content = message.get("content", []) + text_parts = [] for item in content: - if 'text' in item: - text_parts.append(item['text']) - + if "text" in item: + text_parts.append(item["text"]) + return "".join(text_parts) - def _extract_tool_calls_from_response(self, response: Dict[str, Any]) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Dict[str, Any] + ) -> List[NormalizedToolCallItem]: """Extract tool calls from AWS response.""" tool_calls = [] - - output = response.get('output', {}) - message = output.get('message', {}) - content = message.get('content', []) - + + output = response.get("output", {}) + message = output.get("message", {}) + content = message.get("content", []) + for item in content: - if 'toolUse' in item: - tool_use = item['toolUse'] + if "toolUse" in item: + tool_use = item["toolUse"] tool_call: NormalizedToolCallItem = { "type": "tool_call", - "id": tool_use.get('toolUseId', ''), - "name": tool_use.get('name', ''), - "arguments_json": tool_use.get('input', {}) + "id": tool_use.get("toolUseId", ""), + "name": tool_use.get("name", ""), + "arguments_json": tool_use.get("input", {}), } tool_calls.append(tool_call) - + return tool_calls - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to AWS Bedrock format. 
- + Args: tools: List of ToolSchema objects - + Returns: List of tools in AWS Bedrock format """ @@ -437,9 +487,7 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic "toolSpec": { "name": tool["name"], "description": tool.get("description", ""), - "inputSchema": { - "json": tool["parameters_schema"] - } + "inputSchema": {"json": tool["parameters_schema"]}, } } aws_tools.append(aws_tool) @@ -451,9 +499,7 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: from vision_agents.core.agents.conversation import Message if isinstance(aws_messages, str): - aws_messages = [ - {"content": [{"text": aws_messages}], "role": "user"} - ] + aws_messages = [{"content": [{"text": aws_messages}], "role": "user"}] if not isinstance(aws_messages, (List, tuple)): aws_messages = [aws_messages] @@ -465,8 +511,8 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: # Extract text from content blocks text_parts = [] for item in content_items: - if isinstance(item, dict) and 'text' in item: - text_parts.append(item['text']) + if isinstance(item, dict) and "text" in item: + text_parts.append(item["text"]) elif isinstance(item, str): text_parts.append(item) content = " ".join(text_parts) @@ -478,4 +524,3 @@ def _normalize_message(aws_messages: Any) -> List["Message"]: messages.append(message) return messages - diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py index fd0d9596..3661692d 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_llm.py @@ -8,7 +8,10 @@ from vision_agents.core.llm.llm import LLM, LLMResponseEvent from vision_agents.core.llm.llm_types import ToolSchema, NormalizedToolCallItem -from vision_agents.core.llm.events import LLMResponseCompletedEvent, LLMResponseChunkEvent +from vision_agents.core.llm.events import ( + LLMResponseCompletedEvent, + LLMResponseChunkEvent, +) from . import events @@ -20,24 +23,30 @@ class GeminiLLM(LLM): """ - The GeminiLLM class provides full/native access to the gemini SDK methods. - It only standardized the minimal feature set that's needed for the agent integration. + The GeminiLLM class provides full/native access to the gemini SDK methods. + It only standardized the minimal feature set that's needed for the agent integration. - The agent requires that we standardize: - - sharing instructions - - keeping conversation history - - response normalization + The agent requires that we standardize: + - sharing instructions + - keeping conversation history + - response normalization - Notes on the Gemini integration: - - the native method is called send_message (maps 1-1 to chat.send_message_stream) - - history is maintained in the gemini sdk (with the usage of client.chats.create(model=self.model)) + Notes on the Gemini integration: + - the native method is called send_message (maps 1-1 to chat.send_message_stream) + - history is maintained in the gemini sdk (with the usage of client.chats.create(model=self.model)) - Examples: + Examples: - from vision_agents.plugins import gemini - llm = gemini.LLM() - """ - def __init__(self, model: str, api_key: Optional[str] = None, client: Optional[genai.Client] = None): + from vision_agents.plugins import gemini + llm = gemini.LLM() + """ + + def __init__( + self, + model: str, + api_key: Optional[str] = None, + client: Optional[genai.Client] = None, + ): """ Initialize the GeminiLLM class. 
@@ -50,13 +59,19 @@ def __init__(self, model: str, api_key: Optional[str] = None, client: Optional[g self.events.register_events_from_module(events) self.model = model self.chat: Optional[Any] = None + self.provider_name = "gemini" if client is not None: self.client = client else: self.client = genai.Client(api_key=api_key) - async def simple_response(self, text: str, processors: Optional[List[Processor]] = None, participant: Optional[Any] = None) -> LLMResponseEvent[Any]: + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Any] = None, + ) -> LLMResponseEvent[Any]: """ simple_response is a standardized way (across openai, claude, gemini etc.) to create a response. @@ -68,9 +83,7 @@ async def simple_response(self, text: str, processors: Optional[List[Processor]] llm.simple_response("say hi to the user, be mean") """ - return await self.send_message( - message=text - ) + return await self.send_message(message=text) async def send_message(self, *args, **kwargs): """ @@ -78,7 +91,7 @@ async def send_message(self, *args, **kwargs): under the hood it calls chat.send_message_stream(*args, **kwargs) this method wraps and ensures we broadcast an event which the agent class hooks into """ - #if "model" not in kwargs: + # if "model" not in kwargs: # kwargs["model"] = self.model # initialize chat if needed @@ -91,6 +104,7 @@ async def send_message(self, *args, **kwargs): tools_spec = self.get_available_functions() if tools_spec: from google.genai import types + conv_tools = self._convert_tools_to_provider_format(tools_spec) cfg = kwargs.get("config") if not isinstance(cfg, types.GenerateContentConfig): @@ -100,7 +114,7 @@ async def send_message(self, *args, **kwargs): # Generate content using the client iterator = self.chat.send_message_stream(*args, **kwargs) - text_parts : List[str] = [] + text_parts: List[str] = [] final_chunk = None pending_calls: List[NormalizedToolCallItem] = [] @@ -126,12 +140,17 @@ async def send_message(self, *args, **kwargs): rounds = 0 current_calls = pending_calls cfg_with_tools = kwargs.get("config") - + seen: set[str] = set() while current_calls and rounds < MAX_ROUNDS: # Execute tools concurrently with deduplication - triples, seen = await self._dedup_and_execute(current_calls, max_concurrency=8, timeout_s=30, seen=seen) # type: ignore[arg-type] - + triples, seen = await self._dedup_and_execute( + current_calls, # type: ignore[arg-type] + max_concurrency=8, + timeout_s=30, + seen=seen, + ) + executed = [] parts = [] for tc, res, err in triples: @@ -143,19 +162,28 @@ async def send_message(self, *args, **kwargs): sanitized_res = {} for k, v in res.items(): sanitized_res[k] = self._sanitize_tool_output(v) - parts.append(types.Part.from_function_response(name=tc["name"], response=sanitized_res)) - + parts.append( + types.Part.from_function_response( + name=tc["name"], response=sanitized_res + ) + ) + # Send function responses with tools config - follow_up_iter = self.chat.send_message_stream(parts, config=cfg_with_tools) # type: ignore[arg-type] - + follow_up_iter = self.chat.send_message_stream( + parts, # type: ignore[arg-type] + config=cfg_with_tools, + ) + follow_up_text_parts: List[str] = [] follow_up_last = None next_calls = [] - + for idx, chk in enumerate(follow_up_iter): follow_up_last = chk # TODO: unclear if this is correct (item_id and idx) - self._standardize_and_emit_event(chk, follow_up_text_parts, item_id, idx) + self._standardize_and_emit_event( + chk, follow_up_text_parts, item_id, 
idx + ) # Check for new function calls try: @@ -163,7 +191,7 @@ async def send_message(self, *args, **kwargs): next_calls.extend(chunk_calls) except Exception: pass - + current_calls = next_calls rounds += 1 @@ -173,12 +201,14 @@ async def send_message(self, *args, **kwargs): total_text = "".join(text_parts) llm_response = LLMResponseEvent(final_chunk, total_text) - self.events.send(LLMResponseCompletedEvent( - plugin_name="gemini", - original=llm_response.original, - text=llm_response.text, - item_id=item_id, - )) + self.events.send( + LLMResponseCompletedEvent( + plugin_name="gemini", + original=llm_response.original, + text=llm_response.text, + item_id=item_id, + ) + ) # Return the LLM response return llm_response @@ -186,12 +216,10 @@ async def send_message(self, *args, **kwargs): @staticmethod def _normalize_message(gemini_input) -> List["Message"]: from vision_agents.core.agents.conversation import Message - + # standardize on input if isinstance(gemini_input, str): - gemini_input = [ - gemini_input - ] + gemini_input = [gemini_input] if not isinstance(gemini_input, List): gemini_input = [gemini_input] @@ -203,29 +231,38 @@ def _normalize_message(gemini_input) -> List["Message"]: return messages - def _standardize_and_emit_event(self, chunk: GenerateContentResponse, text_parts: List[str], item_id: str, idx: int) -> Optional[LLMResponseEvent[Any]]: + def _standardize_and_emit_event( + self, + chunk: GenerateContentResponse, + text_parts: List[str], + item_id: str, + idx: int, + ) -> Optional[LLMResponseEvent[Any]]: """ Forwards the events and also send out a standardized version (the agent class hooks into that) """ # forward the native event - self.events.send(events.GeminiResponseEvent( - plugin_name="gemini", - response_chunk=chunk - )) + self.events.send( + events.GeminiResponseEvent(plugin_name="gemini", response_chunk=chunk) + ) # Check if response has text content - if hasattr(chunk, 'text') and chunk.text: - self.events.send(LLMResponseChunkEvent( - plugin_name="gemini", - content_index=idx, - item_id=item_id, - delta=chunk.text, - )) + if hasattr(chunk, "text") and chunk.text: + self.events.send( + LLMResponseChunkEvent( + plugin_name="gemini", + content_index=idx, + item_id=item_id, + delta=chunk.text, + ) + ) text_parts.append(chunk.text) return None - def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dict[str, Any]]: + def _convert_tools_to_provider_format( + self, tools: List[ToolSchema] + ) -> List[Dict[str, Any]]: """ Convert ToolSchema objects to Gemini format. Args: @@ -235,75 +272,93 @@ def _convert_tools_to_provider_format(self, tools: List[ToolSchema]) -> List[Dic """ function_declarations = [] for tool in tools: - function_declarations.append({ - "name": tool["name"], - "description": tool.get("description", ""), - "parameters": tool["parameters_schema"] - }) - + function_declarations.append( + { + "name": tool["name"], + "description": tool.get("description", ""), + "parameters": tool["parameters_schema"], + } + ) + # Return as dict with function_declarations (SDK accepts dicts) return [{"function_declarations": function_declarations}] - def _extract_tool_calls_from_response(self, response: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_response( + self, response: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Gemini response. 
- + Args: response: Gemini response object - + Returns: List of normalized tool call items """ calls: List[NormalizedToolCallItem] = [] - + try: # Prefer the top-level convenience list if available function_calls = getattr(response, "function_calls", []) or [] for fc in function_calls: - calls.append({ - "type": "tool_call", - "name": getattr(fc, "name", "unknown"), - "arguments_json": getattr(fc, "args", {}) - }) + calls.append( + { + "type": "tool_call", + "name": getattr(fc, "name", "unknown"), + "arguments_json": getattr(fc, "args", {}), + } + ) if not calls and getattr(response, "candidates", None): for c in response.candidates: if getattr(c, "content", None): for part in c.content.parts: if getattr(part, "function_call", None): - calls.append({ - "type": "tool_call", - "name": getattr(part.function_call, "name", "unknown"), - "arguments_json": getattr(part.function_call, "args", {}), - }) + calls.append( + { + "type": "tool_call", + "name": getattr( + part.function_call, "name", "unknown" + ), + "arguments_json": getattr( + part.function_call, "args", {} + ), + } + ) except Exception: pass # Ignore extraction errors - + return calls - def _extract_tool_calls_from_stream_chunk(self, chunk: Any) -> List[NormalizedToolCallItem]: + def _extract_tool_calls_from_stream_chunk( + self, chunk: Any + ) -> List[NormalizedToolCallItem]: """ Extract tool calls from Gemini streaming chunk. - + Args: chunk: Gemini streaming event - + Returns: List of normalized tool call items """ try: - return self._extract_tool_calls_from_response(chunk) # chunks use same shape + return self._extract_tool_calls_from_response( + chunk + ) # chunks use same shape except Exception: return [] # Ignore extraction errors - def _create_tool_result_parts(self, tool_calls: List[NormalizedToolCallItem], results: List[Any]): + def _create_tool_result_parts( + self, tool_calls: List[NormalizedToolCallItem], results: List[Any] + ): """ Create function_response parts for Gemini. 
- + Args: tool_calls: List of tool calls that were executed results: List of results from function execution - + Returns: List of function_response parts """ @@ -315,9 +370,13 @@ def _create_tool_result_parts(self, tool_calls: List[NormalizedToolCallItem], re response_data = res else: response_data = {"result": res} - + # res may be dict/list/str; pass directly; SDK serializes - parts.append(types.Part.from_function_response(name=tc["name"], response=response_data)) + parts.append( + types.Part.from_function_response( + name=tc["name"], response=response_data + ) + ) except Exception: # Fallback: create a simple text part parts.append(types.Part(text=f"Function {tc['name']} returned: {res}")) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_llm.py b/plugins/openai/vision_agents/plugins/openai/openai_llm.py index 06a19940..c2fbe66d 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_llm.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_llm.py @@ -69,6 +69,7 @@ def __init__( self.model = model self.openai_conversation: Optional[Any] = None self.conversation = None + self.provider_name = "openai" if client is not None: self.client = client @@ -77,7 +78,7 @@ def __init__( else: self.client = AsyncOpenAI(base_url=base_url) - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, diff --git a/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py b/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py index 52664191..730f968e 100644 --- a/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py +++ b/plugins/openrouter/vision_agents/plugins/openrouter/openrouter_llm.py @@ -1,4 +1,5 @@ """OpenRouter LLM implementation using OpenAI-compatible API.""" + import os from typing import Any @@ -24,7 +25,7 @@ def __init__( **kwargs: Any, ) -> None: """Initialize OpenRouter LLM. - + Args: api_key: OpenRouter API key. If not provided, uses OPENROUTER_API_KEY env var. base_url: OpenRouter API base URL. 
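The two hunks around this point cover the OpenRouter constructor: an OpenAI-compatible client configured with `api_key` (falling back to the `OPENROUTER_API_KEY` env var), a `base_url`, a `model`, and, in the next hunk, a `provider_name` identifier. A minimal usage sketch, assuming the plugin is exported as `openrouter.LLM` in the style of the other plugins in this series and using a purely illustrative model id:

```python
import os

# Assumed import path and export name, mirroring the aws/gemini plugin pattern in this series.
from vision_agents.plugins import openrouter

# api_key falls back to the OPENROUTER_API_KEY env var when omitted, per the Args
# section above; the model id below is illustrative, not taken from this patch.
llm = openrouter.LLM(
    model="anthropic/claude-3.5-sonnet",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
)
```
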
@@ -39,6 +40,7 @@ def __init__( model=model, **kwargs, ) + self.provider_name = "openrouter" async def create_conversation(self): # Do nothing, dont call super @@ -51,11 +53,10 @@ def add_conversation_history(self, kwargs): new_messages = kwargs["input"] if not isinstance(new_messages, list): new_messages = [dict(content=new_messages, role="user", type="message")] - if hasattr(self, '_conversation') and self._conversation: + if hasattr(self, "_conversation") and self._conversation: old_messages = [m.original for m in self._conversation.messages] kwargs["input"] = old_messages + new_messages # Add messages to conversation normalized_messages = self._normalize_message(new_messages) for msg in normalized_messages: self._conversation.messages.append(msg) - diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py index 411ea76f..06884440 100644 --- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py +++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py @@ -13,8 +13,11 @@ from vision_agents.core.agents import Conversation from vision_agents.core.agents.agents import default_agent_options, AgentOptions from vision_agents.core.edge.types import Participant -from vision_agents.core.observability import meter -from vision_agents.core.observability.metrics import Timer +from vision_agents.core.observability.metrics import ( + Timer, + turn_vad_latency_ms, + turn_end_detection_latency_ms, +) from vision_agents.core.turn_detection import ( TurnDetector, @@ -42,17 +45,6 @@ ) -turn_silero_vad_latency_ms = meter.create_histogram( - "turn.silero.vad.latency.ms", - unit="ms", -) - -turn_smart_turn_detection_latency_ms = meter.create_histogram( - "turn.smart_turn.detection.latency.ms", - unit="ms", -) - - @dataclass class Silence: trailing_silence_chunks: int = 0 @@ -248,8 +240,9 @@ async def _process_audio_packet( # detect speech in small 512 chunks, gather to larger audio segments with speech for chunk in audio_chunks[:-1]: # predict if this segment has speech - with Timer(turn_silero_vad_latency_ms) as timer: + with Timer(turn_vad_latency_ms) as timer: timer.attributes["samples"] = len(chunk.samples) + timer.attributes["implementation"] = "smart_turn" speech_probability = await self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold @@ -289,7 +282,8 @@ async def _process_audio_packet( merged.append(self._active_segment) merged = merged.tail(8, True, "start") # see if we've completed the turn - with Timer(turn_smart_turn_detection_latency_ms) as timer: + with Timer(turn_end_detection_latency_ms) as timer: + timer.attributes["implementation"] = "smart_turn" timer.attributes["audio_duration_ms"] = merged.duration_ms timer.attributes["samples"] = len(merged.samples) timer.attributes["trailing_silence_ms"] = trailing_silence_ms diff --git a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py index 34b352ee..e955d055 100644 --- a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py +++ b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py @@ -17,26 +17,23 @@ TurnStartedEvent, TurnEndedEvent, ) -from vision_agents.core.observability.metrics import Timer, meter +from vision_agents.core.observability.metrics import ( + Timer, + meter, + turn_vad_latency_ms, + 
turn_end_detection_latency_ms, +) import logging logger = logging.getLogger(__name__) -# Metrics for Vogent turn detection -vogent_vad_latency_ms = meter.create_histogram( - "vogent.vad.latency.ms", unit="ms", description="Vogent VAD prediction latency" -) +# Vogent-specific metric for Whisper transcription vogent_whisper_latency_ms = meter.create_histogram( "vogent.whisper.latency.ms", unit="ms", description="Vogent Whisper transcription latency", ) -vogent_turn_prediction_latency_ms = meter.create_histogram( - "vogent.turn_prediction.latency.ms", - unit="ms", - description="Vogent turn completion prediction latency", -) # Silero VAD model (reused from smart_turn) SILERO_ONNX_FILENAME = "silero_vad.onnx" @@ -260,8 +257,9 @@ async def _process_audio_packet( if self.vad is None: continue - with Timer(vogent_vad_latency_ms) as timer: + with Timer(turn_vad_latency_ms) as timer: timer.attributes["samples"] = len(chunk.samples) + timer.attributes["implementation"] = "vogent" speech_probability = self.vad.predict_speech(chunk.samples) is_speech = speech_probability > self.speech_probability_threshold @@ -421,12 +419,13 @@ async def _predict_turn_completed( Returns: True if turn is complete, False otherwise """ - with Timer(vogent_turn_prediction_latency_ms) as timer: + with Timer(turn_end_detection_latency_ms) as timer: # Ensure it's 16khz and f32 format pcm = pcm.resample(16000).to_float32() # Truncate to 8 seconds audio_array = pcm.tail(8, False).samples + timer.attributes["implementation"] = "vogent" timer.attributes["audio_duration_ms"] = len(audio_array) / 16000 * 1000 timer.attributes["prev_line_length"] = len(prev_line) timer.attributes["curr_line_length"] = len(curr_line) diff --git a/plugins/xai/vision_agents/plugins/xai/llm.py b/plugins/xai/vision_agents/plugins/xai/llm.py index 5392ab64..4ed18bf0 100644 --- a/plugins/xai/vision_agents/plugins/xai/llm.py +++ b/plugins/xai/vision_agents/plugins/xai/llm.py @@ -5,7 +5,10 @@ from vision_agents.core.llm.llm import LLM, LLMResponseEvent from vision_agents.core.processors import Processor -from vision_agents.core.llm.events import LLMResponseChunkEvent, LLMResponseCompletedEvent +from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, +) from . 
import events if TYPE_CHECKING: @@ -56,6 +59,7 @@ def __init__( self.model = model self.xai_chat: Optional["Chat"] = None self.conversation = None + self.provider_name = "xai" if client is not None: self.client = client @@ -64,7 +68,7 @@ def __init__( else: self.client = AsyncClient() - async def simple_response( + async def _simple_response( self, text: str, processors: Optional[List[Processor]] = None, @@ -91,7 +95,9 @@ async def simple_response( instructions=instructions, ) - async def create_response(self, *args: Any, **kwargs: Any) -> LLMResponseEvent[Response]: + async def create_response( + self, *args: Any, **kwargs: Any + ) -> LLMResponseEvent[Response]: """ create_response gives you full support/access to the native xAI chat.sample() and chat.stream() methods this method wraps the xAI method and ensures we broadcast an event which the agent class hooks into @@ -139,10 +145,11 @@ async def create_response(self, *args: Any, **kwargs: Any) -> LLMResponseEvent[R self.xai_chat.append(response) if llm_response is not None: - self.events.send(LLMResponseCompletedEvent( - original=llm_response.original, - text=llm_response.text - )) + self.events.send( + LLMResponseCompletedEvent( + original=llm_response.original, text=llm_response.text + ) + ) return llm_response or LLMResponseEvent[Response]( Response(chat_pb2.GetChatCompletionResponse(), 0), "" @@ -170,31 +177,32 @@ def _standardize_and_emit_chunk( Forwards the chunk events and also send out a standardized version (the agent class hooks into that) """ # Emit the raw chunk event - self.events.send(events.XAIChunkEvent( - plugin_name="xai", - chunk=chunk - )) + self.events.send(events.XAIChunkEvent(plugin_name="xai", chunk=chunk)) # Emit standardized delta events for content if chunk.content: - self.events.send(LLMResponseChunkEvent( - content_index=0, # xAI doesn't have content_index - item_id=chunk.proto.id if hasattr(chunk.proto, "id") else "", - output_index=0, # xAI doesn't have output_index - sequence_number=0, # xAI doesn't have sequence_number - delta=chunk.content, - plugin_name="xai", - )) + self.events.send( + LLMResponseChunkEvent( + content_index=0, # xAI doesn't have content_index + item_id=chunk.proto.id if hasattr(chunk.proto, "id") else "", + output_index=0, # xAI doesn't have output_index + sequence_number=0, # xAI doesn't have sequence_number + delta=chunk.content, + plugin_name="xai", + ) + ) # Check if this is the final chunk (finish_reason indicates completion) if chunk.choices and chunk.choices[0].finish_reason: # This is the final chunk, return the complete response llm_response = LLMResponseEvent[Response](response, response.content) - self.events.send(LLMResponseCompletedEvent( - plugin_name="xai", - text=llm_response.text, - original=llm_response.original - )) + self.events.send( + LLMResponseCompletedEvent( + plugin_name="xai", + text=llm_response.text, + original=llm_response.original, + ) + ) return llm_response return None From 2755b9d78f65101807b7e040919a86304bd8c940 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Fri, 31 Oct 2025 16:31:49 +0100 Subject: [PATCH 04/11] eable tracing for now --- .../simple_agent_example.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/01_simple_agent_example/simple_agent_example.py b/examples/01_simple_agent_example/simple_agent_example.py index 263741b0..4dfdc634 100644 --- a/examples/01_simple_agent_example/simple_agent_example.py +++ b/examples/01_simple_agent_example/simple_agent_example.py @@ -3,7 +3,7 @@ from 
dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import cartesia, deepgram, getstream, gemini, vogent +from vision_agents.plugins import cartesia, deepgram, getstream, gemini, smart_turn load_dotenv() @@ -22,7 +22,7 @@ async def start_agent() -> None: llm=llm, tts=cartesia.TTS(), stt=deepgram.STT(), - turn_detection=vogent.TurnDetection(), + turn_detection=smart_turn.TurnDetection(), # realtime version (vad, tts and stt not needed) # llm=openai.Realtime() ) @@ -59,11 +59,14 @@ async def start_agent() -> None: def setup_telemetry(): import atexit - from opentelemetry import trace + from opentelemetry import trace, metrics from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.metrics import MeterProvider + from opentelemetry.exporter.prometheus import PrometheusMetricReader + from prometheus_client import start_http_server resource = Resource.create( { @@ -76,6 +79,13 @@ def setup_telemetry(): tp.add_span_processor(BatchSpanProcessor(exporter)) trace.set_tracer_provider(tp) + reader = PrometheusMetricReader() + metrics.set_meter_provider( + MeterProvider(resource=resource, metric_readers=[reader]) + ) + + start_http_server(port=9464) + def _flush_and_shutdown(): tp.force_flush() tp.shutdown() @@ -84,5 +94,5 @@ def _flush_and_shutdown(): if __name__ == "__main__": - # setup_telemetry() + setup_telemetry() asyncio.run(start_agent()) From 94f544a2d1ee717f6e1cddaa1c995ed02da71484 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 15:20:54 +0100 Subject: [PATCH 05/11] fix tests --- .../plugins/gemini/gemini_realtime.py | 35 ++++++++++++++ .../plugins/openai/openai_realtime.py | 35 ++++++++++++++ tests/test_function_calling.py | 46 +++++++++++++++---- 3 files changed, 107 insertions(+), 9 deletions(-) diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index 019a2c22..dcfbbfae 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -128,6 +128,41 @@ async def simple_response( self.logger.info("Simple response called with text: %s", text) await self.send_realtime_input(text=text) + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """ + Internal simple response implementation required by LLM base class. + + Note: Gemini Realtime is event-driven and doesn't return responses directly. + This implementation sends the text and returns a placeholder. + """ + await self.send_realtime_input(text=text) + return "" # Realtime API doesn't return text synchronously + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ): + """ + Internal simple response stream implementation required by LLM base class. + + Note: Gemini Realtime is event-driven and doesn't stream responses in this manner. + This implementation sends the text but yields nothing. 
+ """ + await self.send_realtime_input(text=text) + return + yield # Make this a generator + async def simple_audio_response( self, pcm: PcmData, participant: Optional[Participant] = None ): diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index a9074ae7..2672188e 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -118,6 +118,41 @@ async def simple_response( """ await self.rtc.send_text(text) + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """ + Internal simple response implementation required by LLM base class. + + Note: OpenAI Realtime is event-driven and doesn't return responses directly. + This implementation sends the text and returns a placeholder. + """ + await self.rtc.send_text(text) + return "" # Realtime API doesn't return text synchronously + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ): + """ + Internal simple response stream implementation required by LLM base class. + + Note: OpenAI Realtime is event-driven and doesn't stream responses in this manner. + This implementation sends the text but yields nothing. + """ + await self.rtc.send_text(text) + return + yield # Make this a generator + async def simple_audio_response( self, audio: PcmData, participant: Optional[Participant] = None ): diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py index 54dd3912..fabedf8a 100644 --- a/tests/test_function_calling.py +++ b/tests/test_function_calling.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import Mock, patch +from typing import Any, Dict, Optional, AsyncIterator from vision_agents.core.llm import FunctionRegistry, function_registry from vision_agents.core.llm.llm import LLM @@ -12,6 +13,33 @@ from vision_agents.plugins.gemini import LLM as GeminiLLM +# Test implementation of LLM for unit tests +class TestLLM(LLM): + """Concrete implementation of LLM for testing.""" + + async def _simple_response( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> str: + """Mock simple response.""" + return f"Mock response to: {text}" + + async def _simple_response_stream( + self, + text: str, + system_prompt: Optional[str] = None, + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs: Any, + ) -> AsyncIterator[str]: + """Mock simple response stream.""" + yield f"Mock response to: {text}" + + class TestFunctionRegistry: """Test the FunctionRegistry class.""" @@ -131,7 +159,7 @@ class TestLLMFunctionCalling: @pytest.mark.asyncio async def test_llm_function_registration(self): """Test that LLM can register functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -145,7 +173,7 @@ def test_func(x: int) -> int: @pytest.mark.asyncio async def test_llm_get_available_functions(self): """Test getting available functions from LLM.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Function 1") def func1(x: int) -> int: @@ -361,7 +389,7 @@ class TestFunctionCallingIntegration: @pytest.mark.asyncio async def 
test_tool_call_processing(self): """Test processing tool calls with multiple functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Get weather") def get_weather(location: str) -> str: @@ -385,7 +413,7 @@ def calculate_sum(a: int, b: int) -> int: @pytest.mark.asyncio async def test_error_handling_in_function_calls(self): """Test error handling in function calls.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function that raises error") def error_function(x: int) -> int: @@ -404,7 +432,7 @@ def error_function(x: int) -> int: @pytest.mark.asyncio async def test_function_schema_generation(self): """Test that function schemas are generated correctly.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Complex function") def complex_function( @@ -450,7 +478,7 @@ class TestConcurrentToolExecution: @pytest.mark.asyncio async def test_dedup_and_execute(self): """Test the _dedup_and_execute method.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -479,8 +507,8 @@ def test_func(x: int) -> int: async def test_tool_lifecycle_events(self): """Test that tool lifecycle events are emitted.""" from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent - - llm = LLM() + + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -513,7 +541,7 @@ async def track_end_event(event: ToolEndEvent): @pytest.mark.asyncio async def test_output_sanitization(self): """Test output sanitization for large responses.""" - llm = LLM() + llm = TestLLM() # Test normal output normal_output = "Hello world" From 3646cd05f02d65e24e1ca05a2552351f40fa1c1c Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 15:21:16 +0100 Subject: [PATCH 06/11] missing docker compose file --- docker-compose.yml | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 docker-compose.yml diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..cb2ce184 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,74 @@ +services: + # Jaeger for distributed tracing + jaeger: + image: jaegertracing/all-in-one:latest + container_name: vision-agents-jaeger + ports: + - "16686:16686" # Jaeger UI + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + environment: + - COLLECTOR_OTLP_ENABLED=true + networks: + - observability + + # Prometheus for metrics collection + prometheus: + image: prom/prometheus:latest + container_name: vision-agents-prometheus + ports: + - "9090:9090" + volumes: + - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + networks: + - observability + + # Grafana for visualization + grafana: + image: grafana/grafana:latest + container_name: vision-agents-grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + volumes: + - ./observability/grafana/provisioning:/etc/grafana/provisioning + - 
./observability/grafana/dashboards:/var/lib/grafana/dashboards + - grafana-data:/var/lib/grafana + depends_on: + - prometheus + networks: + - observability + + # Init service to set home dashboard + grafana-init: + image: curlimages/curl:latest + container_name: vision-agents-grafana-init + volumes: + - ./observability/grafana/init-home-dashboard.sh:/init-home-dashboard.sh:ro + command: sh /init-home-dashboard.sh + depends_on: + - grafana + networks: + - observability + restart: "no" + +volumes: + prometheus-data: + grafana-data: + +networks: + observability: + driver: bridge From ed61f4df98496d4496e254258bad53650834c4e8 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 16:01:23 +0100 Subject: [PATCH 07/11] Update plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../anthropic/vision_agents/plugins/anthropic/anthropic_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py index 1364d5d6..91012809 100644 --- a/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py +++ b/plugins/anthropic/vision_agents/plugins/anthropic/anthropic_llm.py @@ -368,7 +368,7 @@ def _standardize_and_emit_event( self.events.send( LLMResponseChunkEvent( - plugin_name="antrhopic", + plugin_name="anthropic", content_index=delta_event.index, item_id="", output_index=0, From 16d58f74d6f62dbe5b960bc2289a6505488cd644 Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Sat, 1 Nov 2025 16:08:51 +0100 Subject: [PATCH 08/11] ruffit --- .../plugins/gemini/gemini_realtime.py | 37 +-- .../plugins/openai/openai_realtime.py | 34 +- tests/test_function_calling.py | 306 ++++++++++-------- 3 files changed, 186 insertions(+), 191 deletions(-) diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index dcfbbfae..e9b28e50 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -131,37 +131,20 @@ async def simple_response( async def _simple_response( self, text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ) -> str: + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ): """ Internal simple response implementation required by LLM base class. Note: Gemini Realtime is event-driven and doesn't return responses directly. - This implementation sends the text and returns a placeholder. - """ - await self.send_realtime_input(text=text) - return "" # Realtime API doesn't return text synchronously - - async def _simple_response_stream( - self, - text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ): + This implementation sends the text via the public simple_response method. """ - Internal simple response stream implementation required by LLM base class. + from vision_agents.core.llm.llm import LLMResponseEvent - Note: Gemini Realtime is event-driven and doesn't stream responses in this manner. - This implementation sends the text but yields nothing. 
- """ - await self.send_realtime_input(text=text) - return - yield # Make this a generator + await self.simple_response(text, processors, participant) + # Return empty LLMResponseEvent since Realtime API is event-driven + return LLMResponseEvent(original=None, text="") async def simple_audio_response( self, pcm: PcmData, participant: Optional[Participant] = None @@ -376,7 +359,9 @@ async def _receive_loop(self): ) await self._handle_tool_call(server_message.tool_call) else: - self.logger.warning("Unrecognized event structure for gemini %s", server_message) + self.logger.warning( + "Unrecognized event structure for gemini %s", server_message + ) except CancelledError: logger.error("Stop async iteration exception") return diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index 2672188e..15bec56d 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -121,37 +121,20 @@ async def simple_response( async def _simple_response( self, text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ) -> str: + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ): """ Internal simple response implementation required by LLM base class. Note: OpenAI Realtime is event-driven and doesn't return responses directly. - This implementation sends the text and returns a placeholder. - """ - await self.rtc.send_text(text) - return "" # Realtime API doesn't return text synchronously - - async def _simple_response_stream( - self, - text: str, - system_prompt: Optional[str] = None, - temperature: float = 0.7, - max_tokens: Optional[int] = None, - **kwargs: Any, - ): + This implementation sends the text via the public simple_response method. """ - Internal simple response stream implementation required by LLM base class. + from vision_agents.core.llm.llm import LLMResponseEvent - Note: OpenAI Realtime is event-driven and doesn't stream responses in this manner. - This implementation sends the text but yields nothing. - """ - await self.rtc.send_text(text) - return - yield # Make this a generator + await self.simple_response(text, processors, participant) + # Return empty LLMResponseEvent since Realtime API is event-driven + return LLMResponseEvent(original=None, text="") async def simple_audio_response( self, audio: PcmData, participant: Optional[Participant] = None @@ -180,7 +163,6 @@ async def request_session_info(self) -> None: async def close(self): await self.rtc.close() - async def _handle_openai_event(self, event: dict) -> None: """Process events received from the OpenAI Realtime API. 
diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py
index fabedf8a..ffd1db5d 100644
--- a/tests/test_function_calling.py
+++ b/tests/test_function_calling.py
@@ -4,7 +4,7 @@
 
 import pytest
 from unittest.mock import Mock, patch
-from typing import Any, Dict, Optional, AsyncIterator
+from typing import Any, Optional
 
 from vision_agents.core.llm import FunctionRegistry, function_registry
 from vision_agents.core.llm.llm import LLM
@@ -20,134 +20,123 @@ class TestLLM(LLM):
     async def _simple_response(
         self,
         text: str,
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
-        max_tokens: Optional[int] = None,
-        **kwargs: Any,
-    ) -> str:
+        processors: Optional[Any] = None,
+        participant: Optional[Any] = None,
+    ):
         """Mock simple response."""
-        return f"Mock response to: {text}"
+        from vision_agents.core.llm.llm import LLMResponseEvent
 
-    async def _simple_response_stream(
-        self,
-        text: str,
-        system_prompt: Optional[str] = None,
-        temperature: float = 0.7,
-        max_tokens: Optional[int] = None,
-        **kwargs: Any,
-    ) -> AsyncIterator[str]:
-        """Mock simple response stream."""
-        yield f"Mock response to: {text}"
+        return LLMResponseEvent(original=None, text=f"Mock response to: {text}")
 
 
 class TestFunctionRegistry:
     """Test the FunctionRegistry class."""
-    
+
     def test_register_function(self):
         """Test registering a function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int = 5) -> int:
             """Test function with default parameter."""
             return x + y
-    
+
         assert "test_func" in registry._functions
         assert registry._functions["test_func"].description == "Test function"
         assert len(registry._functions["test_func"].parameters) == 2
-    
+
     def test_call_function(self):
         """Test calling a registered function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Add two numbers")
         def add_numbers(a: int, b: int) -> int:
             """Add two numbers."""
             return a + b
-    
+
         result = registry.call_function("add_numbers", {"a": 5, "b": 3})
         assert result == 8
-    
+
     def test_call_function_with_defaults(self):
         """Test calling a function with default parameters."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function with defaults")
         def test_func(x: int, y: int = 10) -> int:
             """Test function with default parameter."""
             return x + y
-    
+
         # Test with both parameters
         result = registry.call_function("test_func", {"x": 5, "y": 3})
         assert result == 8
-    
+
         # Test with default parameter
         result = registry.call_function("test_func", {"x": 5})
         assert result == 15
-    
+
     def test_call_nonexistent_function(self):
         """Test calling a non-existent function raises error."""
         registry = FunctionRegistry()
-    
+
         with pytest.raises(KeyError):
             registry.call_function("nonexistent", {})
-    
+
     def test_call_function_missing_required_param(self):
         """Test calling a function with missing required parameter raises error."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int) -> int:
             """Test function."""
             return x + y
-    
+
         with pytest.raises(TypeError):
             registry.call_function("test_func", {"x": 5})
-    
+
     def test_get_tool_schemas(self):
         """Test getting tool schemas."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int, y: int = 5) -> int:
             """Test function."""
             return x + y
-    
+
         schemas = registry.get_tool_schemas()
         assert len(schemas) == 1
         assert schemas[0]["name"] == "test_func"
         assert schemas[0]["description"] == "Test function"
         assert "parameters_schema" in schemas[0]
-    
+
     def test_get_callable(self):
         """Test getting callable function."""
         registry = FunctionRegistry()
-    
+
         @registry.register(description="Test function")
         def test_func(x: int) -> int:
             """Test function."""
             return x * 2
-    
+
         callable_func = registry.get_callable("test_func")
         assert callable_func(5) == 10
-    
+
         with pytest.raises(KeyError):
             registry.get_callable("nonexistent")
 
 
 class TestGlobalRegistry:
     """Test the global function registry."""
-    
+
     def test_global_registry(self):
         """Test that the global registry works."""
         # Clear any existing functions
         function_registry._functions.clear()
-    
+
         @function_registry.register(description="Global test function")
         def global_test_func(x: int) -> int:
             """Global test function."""
             return x * 3
-    
+
         assert "global_test_func" in function_registry._functions
         result = function_registry.call_function("global_test_func", {"x": 4})
         assert result == 12
@@ -155,34 +144,34 @@ def global_test_func(x: int) -> int:
 
 
 class TestLLMFunctionCalling:
     """Test LLM function calling functionality."""
-    
+
     @pytest.mark.asyncio
     async def test_llm_function_registration(self):
         """Test that LLM can register functions."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             """Test function."""
             return x * 2
-    
+
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "test_func"
-    
+
     @pytest.mark.asyncio
     async def test_llm_get_available_functions(self):
         """Test getting available functions from LLM."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Function 1")
         def func1(x: int) -> int:
             return x + 1
-    
+
         @llm.register_function(description="Function 2")
         def func2(x: int) -> int:
             return x * 2
-    
+
         functions = llm.get_available_functions()
         assert len(functions) == 2
         function_names = [f["name"] for f in functions]
@@ -192,60 +181,68 @@ def func2(x: int) -> int:
 
 
 class TestOpenAIFunctionCalling:
     """Test OpenAI function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.openai.openai_llm.AsyncOpenAI')
+    @patch("vision_agents.plugins.openai.openai_llm.AsyncOpenAI")
     async def test_openai_function_calling_response(self, mock_openai):
         """Test OpenAI function calling response."""
         # Mock the OpenAI client and response
         mock_client = Mock()
         mock_openai.return_value = mock_client
-    
+
         # Mock the responses.create call
         mock_response = Mock()
         mock_response.output = [
-            Mock(type="function_call", call_id="call_123", arguments='{"location": "New York"}')
+            Mock(
+                type="function_call",
+                call_id="call_123",
+                arguments='{"location": "New York"}',
+            )
         ]
         mock_client.responses.create.return_value = mock_response
-    
+
         llm = OpenAILLM(api_key="test-key", model="gpt-4")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
        def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
-    @patch('vision_agents.plugins.openai.openai_llm.AsyncOpenAI')
+
+    @patch("vision_agents.plugins.openai.openai_llm.AsyncOpenAI")
     async def test_openai_conversational_response(self, mock_openai):
         """Test OpenAI conversational response generation."""
         mock_client = Mock()
         mock_openai.return_value = mock_client
-    
+
         # Mock the responses.create call
         mock_response = Mock()
         mock_response.output = [
-            Mock(type="function_call", call_id="call_123", arguments='{"location": "New York"}')
+            Mock(
+                type="function_call",
+                call_id="call_123",
+                arguments='{"location": "New York"}',
+            )
         ]
         mock_client.responses.create.return_value = mock_response
-    
+
         llm = OpenAILLM(api_key="test-key", model="gpt-4")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -254,60 +251,70 @@ def get_weather(location: str) -> str:
 
 
 class TestClaudeFunctionCalling:
     """Test Claude function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic')
+    @patch("vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic")
     async def test_claude_function_calling_response(self, mock_anthropic):
         """Test Claude function calling response."""
         # Mock the Anthropic client and response
         mock_client = Mock()
         mock_anthropic.return_value = mock_client
-    
+
         # Mock the messages.create call
         mock_response = Mock()
         mock_response.content = [
-            Mock(type="tool_use", id="tool_123", name="get_weather", input={"location": "New York"})
+            Mock(
+                type="tool_use",
+                id="tool_123",
+                name="get_weather",
+                input={"location": "New York"},
+            )
         ]
         mock_client.messages.create.return_value = mock_response
-    
+
         llm = ClaudeLLM(api_key="test-key", model="claude-3-5-sonnet-20241022")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
-    @patch('vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic')
+
+    @patch("vision_agents.plugins.anthropic.anthropic_llm.AsyncAnthropic")
     async def test_claude_conversational_response(self, mock_anthropic):
         """Test Claude conversational response generation."""
         mock_client = Mock()
         mock_anthropic.return_value = mock_client
-    
+
         # Mock the messages.create call
         mock_response = Mock()
         mock_response.content = [
-            Mock(type="tool_use", id="tool_123", name="get_weather", input={"location": "New York"})
+            Mock(
+                type="tool_use",
+                id="tool_123",
+                name="get_weather",
+                input={"location": "New York"},
+            )
         ]
         mock_client.messages.create.return_value = mock_response
-    
+
         llm = ClaudeLLM(api_key="test-key", model="claude-3-5-sonnet-20241022")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -316,67 +323,85 @@ def get_weather(location: str) -> str:
 
 
 class TestGeminiFunctionCalling:
     """Test Gemini function calling functionality."""
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.gemini.gemini_llm.genai')
+    @patch("vision_agents.plugins.gemini.gemini_llm.genai")
     async def test_gemini_function_calling_response(self, mock_genai):
         """Test Gemini function calling response."""
         # Mock the Gemini client and response
         mock_client = Mock()
         mock_genai.configure.return_value = None
         mock_genai.Chat.return_value = mock_client
-    
+
         # Mock the send_message_stream call
         mock_response = Mock()
         mock_response.candidates = [
-            Mock(content=Mock(parts=[
-                Mock(type="function_call", function_call=Mock(name="get_weather", args={"location": "New York"}))
-            ]))
+            Mock(
+                content=Mock(
+                    parts=[
+                        Mock(
+                            type="function_call",
+                            function_call=Mock(
+                                name="get_weather", args={"location": "New York"}
+                            ),
+                        )
+                    ]
+                )
+            )
         ]
         mock_client.send_message_stream.return_value = [mock_response]
-    
+
         llm = GeminiLLM(model="gemini-2.0-flash")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
         assert functions[0]["name"] == "get_weather"
-    
+
         # Test function calling
         result = llm.call_function("get_weather", {"location": "New York"})
         assert result == "Weather in New York: Sunny, 72°F"
-    
+
     @pytest.mark.asyncio
-    @patch('vision_agents.plugins.gemini.gemini_llm.genai')
+    @patch("vision_agents.plugins.gemini.gemini_llm.genai")
     async def test_gemini_conversational_response(self, mock_genai):
         """Test Gemini conversational response generation."""
         mock_client = Mock()
         mock_genai.configure.return_value = None
         mock_genai.Chat.return_value = mock_client
-    
+
         # Mock the send_message_stream call
         mock_response = Mock()
         mock_response.candidates = [
-            Mock(content=Mock(parts=[
-                Mock(type="function_call", function_call=Mock(name="get_weather", args={"location": "New York"}))
-            ]))
+            Mock(
+                content=Mock(
+                    parts=[
+                        Mock(
+                            type="function_call",
+                            function_call=Mock(
+                                name="get_weather", args={"location": "New York"}
+                            ),
+                        )
+                    ]
+                )
+            )
         ]
         mock_client.send_message_stream.return_value = [mock_response]
-    
+
         llm = GeminiLLM(model="gemini-2.0-flash")
-    
+
         # Register a test function
         @llm.register_function(description="Get weather for a location")
         def get_weather(location: str) -> str:
             """Get weather information."""
             return f"Weather in {location}: Sunny, 72°F"
-    
+
         # Test that function is registered
         functions = llm.get_available_functions()
         assert len(functions) == 1
@@ -385,85 +410,82 @@ def get_weather(location: str) -> str:
 
 
 class TestFunctionCallingIntegration:
     """Test function calling integration scenarios."""
-    
+
     @pytest.mark.asyncio
     async def test_tool_call_processing(self):
         """Test processing tool calls with multiple functions."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Get weather")
         def get_weather(location: str) -> str:
             return f"Weather in {location}: Sunny"
-    
+
         @llm.register_function(description="Calculate sum")
         def calculate_sum(a: int, b: int) -> int:
             return a + b
-    
+
         # Test multiple function registrations
         functions = llm.get_available_functions()
         assert len(functions) == 2
-    
+
         # Test calling both functions
         weather_result = llm.call_function("get_weather", {"location": "NYC"})
         sum_result = llm.call_function("calculate_sum", {"a": 5, "b": 3})
-    
+
         assert weather_result == "Weather in NYC: Sunny"
         assert sum_result == 8
-    
+
     @pytest.mark.asyncio
     async def test_error_handling_in_function_calls(self):
         """Test error handling in function calls."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function that raises error")
         def error_function(x: int) -> int:
             if x < 0:
                 raise ValueError("Negative numbers not allowed")
             return x * 2
-    
+
         # Test normal case
         result = llm.call_function("error_function", {"x": 5})
         assert result == 10
-    
+
         # Test error case
         with pytest.raises(ValueError):
             llm.call_function("error_function", {"x": -5})
-    
+
     @pytest.mark.asyncio
     async def test_function_schema_generation(self):
         """Test that function schemas are generated correctly."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Complex function")
         def complex_function(
-            name: str,
-            age: int,
-            is_active: bool = True,
-            tags: list = None
+            name: str, age: int, is_active: bool = True, tags: Optional[list] = None
         ) -> dict:
             """Complex function with various parameter types."""
             return {
                 "name": name,
                 "age": age,
                 "is_active": is_active,
-                "tags": tags or []
+                "tags": tags or [],
             }
-    
+
         schemas = llm.get_available_functions()
         assert len(schemas) == 1
-    
+
         schema = schemas[0]
         assert schema["name"] == "complex_function"
         assert schema["description"] == "Complex function"
         assert "parameters_schema" in schema
-    
+
         # Check parameter types
         params = schema["parameters_schema"]["properties"]
         assert "name" in params
         assert "age" in params
         assert "is_active" in params
         assert "tags" in params
-    
+
         # Check required parameters
         required = schema["parameters_schema"]["required"]
         assert "name" in required
@@ -474,87 +496,93 @@ def complex_function(
 
 
 class TestConcurrentToolExecution:
     """Test concurrent tool execution functionality."""
-    
+
     @pytest.mark.asyncio
     async def test_dedup_and_execute(self):
         """Test the _dedup_and_execute method."""
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             return x * 2
-    
+
         # Test with duplicate tool calls
         tool_calls = [
             {"id": "call1", "name": "test_func", "arguments_json": {"x": 5}},
-            {"id": "call2", "name": "test_func", "arguments_json": {"x": 5}},  # Duplicate
+            {
+                "id": "call2",
+                "name": "test_func",
+                "arguments_json": {"x": 5},
+            },  # Duplicate
             {"id": "call3", "name": "test_func", "arguments_json": {"x": 3}},
         ]
-    
+
         # This should deduplicate and only execute call1 and call3
         triples, seen = await llm._dedup_and_execute(tool_calls)
         # The deduplication should work, but let's check what actually happens
         # The key is based on (id, name, arguments_json), so different IDs = different keys
         assert len(triples) == 3  # All calls have different IDs, so all are executed
         assert len(seen) == 3  # 3 unique keys in seen set
-    
+
         # Check results
         results = [result for _, result, _ in triples]
         assert 10 in results  # 5 * 2 (appears twice)
-        assert 6 in results   # 3 * 2
-    
+        assert 6 in results  # 3 * 2
+
     @pytest.mark.asyncio
     async def test_tool_lifecycle_events(self):
         """Test that tool lifecycle events are emitted."""
         from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent
 
         llm = TestLLM()
-    
+
         @llm.register_function(description="Test function")
         def test_func(x: int) -> int:
             return x * 2
-    
+
         # Track emitted events
         start_events = []
         end_events = []
-    
+
         @llm.events.subscribe
         async def track_start_event(event: ToolStartEvent):
             start_events.append(event)
-    
+
         @llm.events.subscribe
         async def track_end_event(event: ToolEndEvent):
             end_events.append(event)
-    
+
         # Execute a tool call
-        await llm._run_one_tool({"id": "call1", "name": "test_func", "arguments_json": {"x": 5}}, 30.0)
+        await llm._run_one_tool(
+            {"id": "call1", "name": "test_func", "arguments_json": {"x": 5}}, 30.0
+        )
 
         # Wait for events
         await llm.events.wait(timeout=1.0)
-    
+
         # Check that events were emitted
         assert len(start_events) == 1
         assert len(end_events) == 1
         assert start_events[0].tool_name == "test_func"
         assert end_events[0].tool_name == "test_func"
         assert end_events[0].success is True
-    
+
     @pytest.mark.asyncio
     async def test_output_sanitization(self):
         """Test output sanitization for large responses."""
         llm = TestLLM()
-    
+
         # Test normal output
         normal_output = "Hello world"
         sanitized = llm._sanitize_tool_output(normal_output)
         assert sanitized == "Hello world"
-    
+
         # Test large output
         large_output = "x" * 70000  # Larger than default 60k limit
         sanitized = llm._sanitize_tool_output(large_output)
         assert len(sanitized) == 60001  # 60k + "…"
         assert sanitized.endswith("…")
-    
+
         # Test non-string output
         dict_output = {"key": "value"}
         sanitized = llm._sanitize_tool_output(dict_output)
-        assert sanitized == '{"key": "value"}'
\ No newline at end of file
+        assert sanitized == '{"key": "value"}'

From f42466cfa277ca97ff3e55cd408a5ec65d76b396 Mon Sep 17 00:00:00 2001
From: Tommaso Barbugli
Date: Sat, 1 Nov 2025 16:14:06 +0100
Subject: [PATCH 09/11] better shutdown for smart turn

---
 .../smart_turn/smart_turn_detection.py        | 53 +++++++++++--------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
index 06884440..2c32b5af 100644
--- a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
+++ b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
@@ -174,29 +174,38 @@ async def _process_audio_loop(self):
         Background task that continuously processes audio from the queue.
         This is where the actual VAD and turn detection logic runs.
""" - while not self._shutdown_event.is_set(): - try: - # Wait for audio packet with timeout to allow shutdown - audio_data, participant, conversation = await asyncio.wait_for( - self._audio_queue.get(), timeout=1.0 - ) - - # Signal that we're actively processing - self._processing_active.set() - + try: + while not self._shutdown_event.is_set(): try: - # Process the audio packet - await self._process_audio_packet(audio_data, participant) - finally: - # If queue is empty, clear the processing flag - if self._audio_queue.empty(): - self._processing_active.clear() - - except asyncio.TimeoutError: - # Timeout is expected - continue loop to check shutdown - continue - except Exception as e: - logger.error(f"Error processing audio: {e}") + # Wait for audio packet with timeout to allow shutdown + audio_data, participant, conversation = await asyncio.wait_for( + self._audio_queue.get(), timeout=1.0 + ) + + # Signal that we're actively processing + self._processing_active.set() + + try: + # Process the audio packet + await self._process_audio_packet(audio_data, participant) + finally: + # If queue is empty, clear the processing flag + if self._audio_queue.empty(): + self._processing_active.clear() + + except asyncio.TimeoutError: + # Timeout is expected - continue loop to check shutdown + continue + except Exception as e: + logger.error(f"Error processing audio: {e}") + except asyncio.CancelledError: + # Task was cancelled - ensure clean shutdown + logger.debug("Audio processing loop cancelled") + raise + finally: + # Always clear flags on shutdown to allow proper lifecycle transitions + self._processing_active.clear() + self._shutdown_event.clear() async def _process_audio_packet( self, From 06aea3108375395a92d73c2d87c05498f508db6b Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Mon, 3 Nov 2025 11:24:56 +0100 Subject: [PATCH 10/11] better metrics --- .../grafana/dashboards/vision-agents.json | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/observability/grafana/dashboards/vision-agents.json b/observability/grafana/dashboards/vision-agents.json index 05da23ef..22374fcd 100644 --- a/observability/grafana/dashboards/vision-agents.json +++ b/observability/grafana/dashboards/vision-agents.json @@ -106,7 +106,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.50, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p50 - {{llm_class}}", "refId": "A" }, @@ -115,7 +115,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.95, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p95 - {{llm_class}}", "refId": "B" }, @@ -124,7 +124,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_milliseconds_bucket[5m])) by (le, llm_class))", + "expr": "histogram_quantile(0.99, sum(rate(llm_latency_ms_bucket[5m])) by (le, llm_class))", "legendFormat": "p99 - {{llm_class}}", "refId": "C" } @@ -216,7 +216,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.50, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p50 - {{stt_class}}", "refId": "A" 
}, @@ -225,7 +225,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.95, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p95 - {{stt_class}}", "refId": "B" }, @@ -234,7 +234,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_milliseconds_bucket[5m])) by (le, stt_class))", + "expr": "histogram_quantile(0.99, sum(rate(stt_latency_ms_bucket[5m])) by (le, stt_class))", "legendFormat": "p99 - {{stt_class}}", "refId": "C" } @@ -326,7 +326,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.50, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p50 - {{tts_class}}", "refId": "A" }, @@ -335,7 +335,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.95, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p95 - {{tts_class}}", "refId": "B" }, @@ -344,7 +344,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_milliseconds_bucket[5m])) by (le, tts_class))", + "expr": "histogram_quantile(0.99, sum(rate(tts_latency_ms_bucket[5m])) by (le, tts_class))", "legendFormat": "p99 - {{tts_class}}", "refId": "C" } @@ -436,7 +436,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.50, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p50 - {{provider}}", "refId": "A" }, @@ -445,7 +445,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.95, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p95 - {{provider}}", "refId": "B" }, @@ -454,7 +454,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_milliseconds_bucket[5m])) by (le, provider))", + "expr": "histogram_quantile(0.99, sum(rate(turn_detection_latency_ms_bucket[5m])) by (le, provider))", "legendFormat": "p99 - {{provider}}", "refId": "C" } @@ -546,7 +546,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.50, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p50 - {{implementation}}", "refId": "A" }, @@ -555,7 +555,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.95, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p95 - {{implementation}}", "refId": "B" }, @@ -564,7 +564,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": 
"histogram_quantile(0.99, sum(rate(turn_vad_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p99 - {{implementation}}", "refId": "C" } @@ -656,7 +656,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.50, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p50 - {{implementation}}", "refId": "A" }, @@ -665,7 +665,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.95, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p95 - {{implementation}}", "refId": "B" }, @@ -674,7 +674,7 @@ "type": "prometheus", "uid": "prometheus" }, - "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_milliseconds_bucket[5m])) by (le, implementation))", + "expr": "histogram_quantile(0.99, sum(rate(turn_end_detection_latency_ms_bucket[5m])) by (le, implementation))", "legendFormat": "p99 - {{implementation}}", "refId": "C" } From 1193533eae7a791db7109dec50a92e022cdea49a Mon Sep 17 00:00:00 2001 From: Tommaso Barbugli Date: Thu, 6 Nov 2025 13:22:33 +0100 Subject: [PATCH 11/11] fix tests --- tests/test_function_calling.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py index c8418f66..25a022c8 100644 --- a/tests/test_function_calling.py +++ b/tests/test_function_calling.py @@ -1,12 +1,15 @@ """ Tests for function calling functionality. 
""" +from typing import Optional, List, Any import pytest from unittest.mock import Mock, patch +from vision_agents.core.edge.types import Participant from vision_agents.core.llm import FunctionRegistry, function_registry -from vision_agents.core.llm.llm import LLM +from vision_agents.core.llm.llm import LLM, LLMResponseEvent +from vision_agents.core.processors import Processor from vision_agents.plugins.openai import LLM as OpenAILLM from vision_agents.plugins.anthropic import LLM as ClaudeLLM from vision_agents.plugins.gemini import LLM as GeminiLLM @@ -125,12 +128,21 @@ def global_test_func(x: int) -> int: assert result == 12 +class TestLLM(LLM): + async def _simple_response( + self, + text: str, + processors: Optional[List[Processor]] = None, + participant: Optional[Participant] = None, + ) -> LLMResponseEvent[Any]: + return LLMResponseEvent(original=dict(), text="") + class TestLLMFunctionCalling: """Test LLM function calling functionality.""" async def test_llm_function_registration(self): """Test that LLM can register functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -143,7 +155,7 @@ def test_func(x: int) -> int: async def test_llm_get_available_functions(self): """Test getting available functions from LLM.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Function 1") def func1(x: int) -> int: @@ -417,7 +429,7 @@ class TestFunctionCallingIntegration: async def test_tool_call_processing(self): """Test processing tool calls with multiple functions.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Get weather") def get_weather(location: str) -> str: @@ -440,7 +452,7 @@ def calculate_sum(a: int, b: int) -> int: async def test_error_handling_in_function_calls(self): """Test error handling in function calls.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function that raises error") def error_function(x: int) -> int: @@ -458,7 +470,7 @@ def error_function(x: int) -> int: async def test_function_schema_generation(self): """Test that function schemas are generated correctly.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Complex function") def complex_function( @@ -500,7 +512,7 @@ class TestConcurrentToolExecution: async def test_dedup_and_execute(self): """Test the _dedup_and_execute method.""" - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -533,7 +545,7 @@ async def test_tool_lifecycle_events(self): """Test that tool lifecycle events are emitted.""" from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent - llm = LLM() + llm = TestLLM() @llm.register_function(description="Test function") def test_func(x: int) -> int: @@ -567,7 +579,7 @@ async def track_end_event(event: ToolEndEvent): async def test_output_sanitization(self): """Test output sanitization for large responses.""" - llm = LLM() + llm = TestLLM() # Test normal output normal_output = "Hello world"