167 changes: 167 additions & 0 deletions docs/docs/pages/basics/agent-architecture.mdx
@@ -0,0 +1,167 @@
---
title: Agent Architecture Patterns
description: Learn the different patterns for implementing agents in Scenario - from simple LLM calls to complex protocols.
---

import { RefLink } from "../../components/RefLink";

# Agent Architecture Patterns [Choose the right pattern for your agent implementation]

## Overview

Scenario provides flexible agent implementation patterns that cater to different use cases and protocols. Understanding these patterns helps you build agents that integrate seamlessly with the framework.

## Agent Interface

All agents in Scenario implement the `Agent` interface:

```typescript
interface Agent {
  readonly role: AgentRole;
  call(input: AgentInput): Promise<AgentReturnTypes>;
}
```

This interface ensures compatibility with Scenario's testing framework while allowing diverse implementation approaches.
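
For orientation, the sketch below shows roughly what the supporting types look like. This is a simplified assumption, not the canonical definition; check the types exported by `@langwatch/scenario` before relying on any field.

```typescript
import type { CoreMessage } from "@langwatch/scenario";

// Simplified sketches of the supporting types (assumptions only; the
// canonical definitions live in @langwatch/scenario).
enum AgentRole {
  USER = "USER",
  AGENT = "AGENT",
  JUDGE = "JUDGE",
}

interface AgentInput {
  threadId: string;        // conversation identifier (assumed field)
  messages: CoreMessage[]; // full conversation history so far (assumed field)
}

// Agents may answer with plain text or one or more structured messages.
type AgentReturnTypes = string | CoreMessage | CoreMessage[];
```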

## Implementation Patterns

### Pattern 1: Override `invokeLLM` (Recommended for most use cases)

For agents that use HTTP-based LLM calls but need custom behavior (e.g., custom headers, different models, audio support):

```typescript
class CustomAgent extends UserSimulatorAgent {
  protected async invokeLLM(params: InvokeLLMParams): Promise<InvokeLLMResult> {
    // Custom LLM invocation logic
    return await customLLMCall(params);
  }
}
```

**For audio support on built-in agents, use `AudioHelpers.wrapAgentForOpenAiAudio`:**

```typescript
import { AudioHelpers } from "@langwatch/scenario";

const audioUserSimulator = AudioHelpers.wrapAgentForOpenAiAudio(
  scenario.userSimulatorAgent(),
  { voice: "nova" }
);

const audioJudge = AudioHelpers.wrapAgentForOpenAiAudio(
  scenario.judgeAgent({ criteria: [...] }),
  { voice: "alloy" }
);
```

**Use this pattern when:**

- You want to customize how LLM calls are made
- You need custom headers, authentication, or model parameters
- You want to add logging, monitoring, or caching
- Your agent follows standard request/response patterns

**Benefits:**

- Reuse built-in business logic (prompt building, config merging)
- Only customize the LLM interaction
- Automatic handling of message formatting and role reversal
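
To make this concrete, here is a minimal sketch of Pattern 1 that routes every LLM call through an API gateway with a custom header. The gateway URL, the header name, and the exact field names on `InvokeLLMParams`/`InvokeLLMResult` are illustrative assumptions; verify them against the types exported by `@langwatch/scenario`.

```typescript
import {
  UserSimulatorAgent,
  type InvokeLLMParams,
  type InvokeLLMResult,
} from "@langwatch/scenario";

class GatewayUserSimulator extends UserSimulatorAgent {
  protected async invokeLLM(
    params: InvokeLLMParams
  ): Promise<InvokeLLMResult> {
    // Hypothetical OpenAI-compatible gateway endpoint and header (assumptions).
    const response = await fetch(
      "https://llm-gateway.example.com/v1/chat/completions",
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "X-Gateway-Key": process.env.GATEWAY_KEY ?? "",
        },
        body: JSON.stringify({
          model: "gpt-4o-mini",
          // params.messages may need conversion to the provider's wire format.
          messages: params.messages,
          temperature: params.temperature,
        }),
      }
    );

    const data = await response.json();
    const text: string = data.choices[0].message.content;

    // Prompt building and role reversal already happened upstream; we only
    // translate the gateway reply into the shape invokeLLM must return.
    return { text, content: [{ type: "text", text }] } as InvokeLLMResult;
  }
}
```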

### Pattern 2: Override `call` (For fundamentally different protocols)

For agents that use completely different communication protocols or have custom logic flows:

```typescript
class CustomDatabaseAgent implements Agent {
  readonly role = AgentRole.AGENT;

  async call(input: AgentInput): Promise<AgentReturnTypes> {
    // Custom implementation using database queries
    // Fundamentally different from HTTP-based LLM calls
    const result = await queryDatabase(input.messages);
    return formatDatabaseResponse(result);
  }
}
```

**Use this pattern when:**

- Your agent uses WebSocket, database queries, or custom APIs
- You need event-driven communication (not request/response)
- You have complex multi-step logic or state management
- You want to bypass LLM calls entirely

**Benefits:**

- Complete control over implementation
- No assumptions about HTTP-based LLM communication
- Suitable for any protocol or custom logic
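
For a flavor of this pattern beyond database queries, here is a hedged sketch of an agent that speaks simple request/response framing over a WebSocket. The `ws` calls are standard, but the JSON framing and the Scenario imports are assumptions.

```typescript
import WebSocket from "ws";
import {
  type Agent,
  type AgentInput,
  type AgentReturnTypes,
  AgentRole,
} from "@langwatch/scenario";

class WebSocketAgent implements Agent {
  readonly role = AgentRole.AGENT;

  constructor(private readonly url: string) {}

  async call(input: AgentInput): Promise<AgentReturnTypes> {
    const socket = new WebSocket(this.url);

    // Wait for the connection to open before sending anything.
    await new Promise<void>((resolve, reject) => {
      socket.once("open", () => resolve());
      socket.once("error", reject);
    });

    // Send the conversation; the JSON framing here is an assumption.
    socket.send(JSON.stringify({ messages: input.messages }));

    // Treat the first reply frame as the complete answer, then disconnect.
    const reply = await new Promise<string>((resolve, reject) => {
      socket.once("message", (data) => resolve(data.toString()));
      socket.once("error", reject);
    });
    socket.close();

    return JSON.parse(reply).text;
  }
}
```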

## Audio Message Support

Scenario's `CoreMessage` type supports rich content including audio:

```typescript
{
  role: "user",
  content: [
    { type: "text", text: "" },
    { type: "file", mediaType: "audio/wav", data: base64Audio }
  ]
}
```

**How different agents handle audio:**

- **Standard agents**: Transcribe audio → process text → return text
- **Voice agents**: Convert audio format → send to voice API → return audio
- **Custom agents**: Handle according to your specific needs
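
If you need to hand-build such a message in a test, for example from a fixture recording, a minimal sketch looks like this (the fixture path is hypothetical and the `CoreMessage` import location is an assumption):

```typescript
import { readFileSync } from "node:fs";
import type { CoreMessage } from "@langwatch/scenario";

// Base64-encode a local WAV fixture for the message payload.
const base64Audio = readFileSync("./fixtures/question.wav").toString("base64");

const audioMessage: CoreMessage = {
  role: "user",
  content: [
    // Empty text part mirrors the shape above; the audio part carries the content.
    { type: "text", text: "" },
    { type: "file", mediaType: "audio/wav", data: base64Audio },
  ],
};
```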

## Decision Guide

| Use Case | Recommended Pattern | Example |
| ------------------------- | ------------------------------ | ------------------------------- |
| Custom LLM headers/auth | Override `invokeLLM` | Add API gateway headers |
| Different model provider | Override `invokeLLM` | Use Anthropic instead of OpenAI |
| Audio support             | AudioHelpers.wrapAgentForOpenAiAudio | OpenAI audio preview model      |
| Custom logging/monitoring | Override `invokeLLM` | Track token usage |
| WebSocket protocol | Override `call` | Realtime API agents |
| Custom message flow | Override `call` | Multi-step reasoning |
| Stateful connections | Override `call` | Persistent sessions |
| Database queries | Override `call` | Custom data agents |

**Rule of thumb:** If you're changing **how** you call the LLM but keeping the same **business logic**, override `invokeLLM`. If you're changing the **entire flow**, override `call`.

## Built-in Agent Implementations

Scenario provides several built-in agent implementations:

- **`UserSimulatorAgent`**: Uses `invokeLLM` pattern for realistic user simulation
- **`JudgeAgent`**: Uses `invokeLLM` pattern with tool-calling for evaluation
- **`RealtimeAgentAdapter`**: Uses `call` pattern for WebSocket-based voice agents

## Audio Support

For audio-enabled conversations, Scenario provides `AudioHelpers.wrapAgentForOpenAiAudio` to wrap built-in agents with audio capabilities:

```typescript
import { AudioHelpers } from "@langwatch/scenario";

const audioAgent = AudioHelpers.wrapAgentForOpenAiAudio(
  scenario.userSimulatorAgent(),
  { voice: "alloy" }
);
```

This wrapper overrides the agent's `invokeLLM` method to use OpenAI's audio API instead of the text API, while preserving all of the agent's business logic.
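
Putting it together, a wrapped agent drops into a scenario like any other. The sketch below assumes a `scenario.run` call shape with `name`, `description`, and an `agents` array, plus an `agentUnderTest` adapter defined elsewhere:

```typescript
import scenario, { AudioHelpers } from "@langwatch/scenario";

const result = await scenario.run({
  name: "voice pizza order",
  description: "The user orders a pizza entirely by voice.",
  agents: [
    agentUnderTest, // your agent adapter, defined elsewhere (assumption)
    AudioHelpers.wrapAgentForOpenAiAudio(scenario.userSimulatorAgent(), {
      voice: "nova",
    }),
    AudioHelpers.wrapAgentForOpenAiAudio(
      scenario.judgeAgent({ criteria: ["The agent confirms the order"] }),
      { voice: "alloy" }
    ),
  ],
});

console.log(result.success); // assumed result shape
```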

## Next Steps

- <RefLink href="/docs/basics/writing-scenarios">
    Learn about writing scenarios
  </RefLink>
- <RefLink href="/docs/examples/multimodal/overview">
    Explore multimodal examples
  </RefLink>
- <RefLink href="/docs/agent-integration">See agent integration guides</RefLink>
105 changes: 105 additions & 0 deletions javascript/examples/vitest/tests/helpers/audio-helpers.ts
@@ -0,0 +1,105 @@
/**
 * Audio Helpers - Utilities for working with audio in Scenario tests
 *
 * Provides helpers for converting between audio and text formats,
 * and wrapping agents to use audio APIs.
 */
import {
  InvokeLLMParams,
  InvokeLLMResult,
  JudgeAgent,
  UserSimulatorAgent,
} from "@langwatch/scenario";
import OpenAI from "openai";
import { convertModelMessagesToOpenAIMessages } from "./convert-core-messages-to-openai";

const openai = new OpenAI();

/**
 * Options for audio wrapping
 */
export interface AudioWrapOptions {
  /** OpenAI voice to use for audio generation */
  voice?: "alloy" | "nova" | "echo" | "fable" | "onyx" | "shimmer";
  /** OpenAI model to use */
  model?: string;
}

/**
 * Wraps a Scenario agent to use OpenAI's audio API instead of the text API
 *
 * This is specifically for Scenario's built-in agents (UserSimulatorAgent, JudgeAgent)
 * that have an invokeLLM method. The wrapper overrides invokeLLM to use audio APIs.
 *
 * @param agent - The agent to wrap (must have an invokeLLM method)
 * @param options - Audio configuration options
 * @returns The same agent with an audio-enabled invokeLLM
 *
 * @example
 * ```typescript
 * import { scenario, AudioHelpers } from "@langwatch/scenario";
 *
 * const audioUserSim = AudioHelpers.wrapAgentForOpenAiAudio(
 *   scenario.userSimulatorAgent(),
 *   { voice: "nova" }
 * );
 *
 * const audioJudge = AudioHelpers.wrapAgentForOpenAiAudio(
 *   scenario.judgeAgent({ criteria: [...] }),
 *   { voice: "alloy" }
 * );
 * ```
 */
export function wrapAgentForOpenAiAudio<
  T extends JudgeAgent | UserSimulatorAgent
>(agent: T, options: AudioWrapOptions = {}): T {
  agent.invokeLLM = async (
    params: InvokeLLMParams
  ): Promise<InvokeLLMResult> => {
    try {
      // Use the OpenAI audio API instead of the text API
      const response = await openai.chat.completions.create({
        model: options.model || "gpt-4o-audio-preview",
        modalities: ["text", "audio"],
        audio: { voice: options.voice || "alloy", format: "wav" },
        messages: convertModelMessagesToOpenAIMessages(params.messages),
        temperature: params.temperature,
        max_tokens: params.maxOutputTokens,
        // Note: tools are not supported with the audio API yet
      });

      // Return the transcript from the audio response
      const transcript = response.choices[0].message?.audio?.transcript;
      if (!transcript) {
        throw new Error("No transcript received from audio API");
      }

      return {
        // Required field for InvokeLLMResult
        text: transcript,
        content: [
          {
            type: "text",
            text: "",
          },
          {
            type: "file" as const,
            data: response.choices[0].message?.audio?.data,
            mimeType: "audio/wav",
          },
        ],
        // Forced cast: we only consume the audio fields here.
      } as unknown as InvokeLLMResult;
    } catch (error) {
      console.error("Audio API call failed:", error);
      throw error;
    }
  };

  return agent;
}

// Export the namespace
export const AudioHelpers = {
  wrapAgentForOpenAiAudio,
};
2 changes: 1 addition & 1 deletion javascript/examples/vitest/tests/helpers/index.ts
@@ -5,5 +5,5 @@ export {
   concatenateWavFiles,
   getAudioSegments,
 } from "./audio-conversation";
-export { OpenAiVoiceAgent } from "./openai-voice-agent";
+export { AudioHelpers } from "./audio-helpers";
 export { wrapJudgeForAudio } from "./wrap-judge-for-audio";