138 changes: 135 additions & 3 deletions docs/docs/pages/examples/multimodal/multimodal-audio.mdx
@@ -1,5 +1,137 @@
# Multimodal Audio Analysis (Coming Soon)
import { RefLink } from "../../../components/RefLink";

This page will cover Scenario tests where the user provides **audio recordings** and the agent must transcribe, interpret, or otherwise respond.
# Testing AI Voice Agents [Use Case]

Content is under development — check back soon! 🎧🚧
This page shows how to write a Scenario test where the user provides **audio** input: the audio is transcribed, handed to the agent as a regular message, and the agent's response is then evaluated by a judge.

## Overview

Your scenario tests can cover any voice-related situation, for example (each of these can be expressed as judge criteria, as sketched after this list):

1. User speaks a **simple request** (agent should respond clearly).
2. User gives a **multi-sentence prompt** (agent should preserve context and intent).
3. User tone or phrasing implies intent (agent should infer and respond appropriately).
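
Each of these situations can be captured as explicit judge criteria. Below is a minimal sketch; the criteria wording is illustrative (an assumption), not taken from the walkthrough further down.

```typescript
import scenario from "@langwatch/scenario";

// Illustrative criteria, one per situation above (wording is an assumption).
const voiceJudge = scenario.judgeAgent({
  criteria: [
    "Agent answers a simple spoken request clearly",
    "Agent preserves the context and intent of a multi-sentence prompt",
    "Agent infers intent implied by the user's tone or phrasing and responds appropriately",
  ],
});
```

A judge like this is then passed in the `agents` array of `scenario.run`, as the walkthrough below shows.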

In this example, we use OpenAI’s `whisper-1` model to transcribe `.wav` audio files into text before passing the message to the agent. The response is judged automatically by a <RefLink link={{ python: "judge_agent.html#scenario.judge_agent.JudgeAgent", typescript: "functions/judgeAgent.html" }} code={{ python: "JudgeAgent", typescript: "judgeAgent" }} /> with explicit criteria.

## Code Walk-through

:::code-group

```typescript [typescript]
import * as fs from "fs";
import * as path from "path";
import { openai } from "@ai-sdk/openai";
import scenario, { AgentAdapter, AgentRole } from "@langwatch/scenario";
import { describe, it, expect } from "vitest";
import { generateText } from "ai";
import OpenAI from "openai";

// 🔧 Set a test set ID (useful for grouping)
const setId = "realtime-voice-agent-test";

// 1️⃣ Raw OpenAI client for Whisper
const openaiRaw = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! });

// 2️⃣ Helper to load the fixture audio file
function getFixtureAudioPath(): string {
  return path.join(__dirname, "fixtures", "sample.wav");
}

// 3️⃣ Transcribe audio using Whisper
async function transcribeAudio(): Promise<string> {
  const file = fs.createReadStream(getFixtureAudioPath());
  const response = await openaiRaw.audio.transcriptions.create({
    file,
    model: "whisper-1",
    response_format: "text",
  });
  return response;
}

// 4️⃣ Agent that receives transcribed voice input
const voiceAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async (input) => {
    const response = await generateText({
      model: openai("gpt-4o"),
      messages: [
        {
          role: "system",
          content: `You are a helpful assistant. Respond to the user's transcribed voice message.`,
        },
        ...input.messages,
      ],
    });
    return response.text;
  },
};

// 5️⃣ Test scenario
describe("Voice Agent Audio Tests", () => {
  it("should respond to a transcribed voice message", async () => {
    const transcript = await transcribeAudio();

    const result = await scenario.run({
      name: "voice agent: with whisper",
      description: "Transcribe audio, then test agent response",
      agents: [
        voiceAgent,
        scenario.userSimulatorAgent(),
        scenario.judgeAgent({
          criteria: [
            "Agent responds clearly to the user's voice message",
            "Agent tone is conversational and helpful",
          ],
        }),
      ],
      script: [
        scenario.message({
          role: "user",
          content: `🗣️ (Transcribed) ${transcript}`,
        }),
        scenario.agent(),
        scenario.judge(),
      ],
      setId,
    });

    try {
      expect(result.success).toBe(true);
    } catch (error) {
      console.error(result);
      throw error;
    }
  });
});
```

```python [python]
# Coming soon
```

:::

### Why Transcription First?

Scenario currently supports text and image content in messages. To test real voice inputs, we transcribe the audio using OpenAI's Whisper model and then inject the result as a regular text message in the script.

This keeps your tests deterministic, repeatable, and compatible with standard judge agents.
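
If you want repeated runs to reuse exactly the same text (Whisper output can vary slightly between calls), one option is to cache the transcript next to the audio fixture. Here is a minimal sketch, assuming a `transcribe` callback such as the `transcribeAudio` helper above; the `cacheTranscript` helper and the `.transcript.txt` file name are hypothetical.

```typescript
import * as fs from "fs";

// Hypothetical helper: reuse a cached transcript if one exists, otherwise
// transcribe once and write the result next to the fixture.
async function cacheTranscript(
  audioPath: string,
  transcribe: () => Promise<string>
): Promise<string> {
  const cachePath = audioPath.replace(/\.wav$/, ".transcript.txt");
  if (fs.existsSync(cachePath)) {
    return fs.readFileSync(cachePath, "utf8");
  }
  const transcript = await transcribe();
  fs.writeFileSync(cachePath, transcript, "utf8");
  return transcript;
}
```

In the walkthrough above you would then call `cacheTranscript(getFixtureAudioPath(), transcribeAudio)` instead of calling `transcribeAudio()` directly.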

## Best Practices

When testing voice scenarios:

- Use **natural speech patterns** in your audio (e.g. pauses, fillers, informal phrasing).
- Define **clear, objective criteria** for what counts as a good agent response (tone, content, empathy).
- Group related voice tests under a shared `setId` for easier filtering and reporting (see the sketch below).
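
For example, a minimal sketch of sharing one `setId` across test files (the module and constant names are hypothetical):

```typescript
// voice-test-config.ts (hypothetical module): export a single setId and import
// it from every voice test file so all voice scenarios are grouped in reports.
export const voiceSetId = "voice-agent-suite";
```

Each test then passes `setId: voiceSetId` to `scenario.run`, exactly as the walkthrough above passes its local `setId` constant.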

## Next Steps

- Check out [images](/examples/multimodal/multimodal-images) to test your agent with image inputs.

---

### Complete Source

Check out the full test in the repository [here](https://github.com/langwatch/scenario/blob/main/javascript/examples/vitest/tests/voice-agent.test.ts).
8 changes: 4 additions & 4 deletions docs/vocs.config.tsx
@@ -173,12 +173,12 @@ export default defineConfig({
link: "/examples/multimodal/multimodal-images",
},
{
text: "Files (coming soon)",
link: "/examples/multimodal/multimodal-files",
text: "Audio",
link: "/examples/multimodal/multimodal-audio",
},
{
text: "Audio (coming soon)",
link: "/examples/multimodal/multimodal-audio",
text: "Files (coming soon)",
link: "/examples/multimodal/multimodal-files",
},
],
},
1 change: 1 addition & 0 deletions javascript/examples/vitest/package.json
@@ -18,6 +18,7 @@
"dependencies": {
"@langwatch/scenario": "workspace:*",
"ai": ">=4.0.0",
"openai": "5.8.3",
"vitest": "3.2.4"
}
}
Binary file not shown.
84 changes: 84 additions & 0 deletions javascript/examples/vitest/tests/voice-agent.test.ts
@@ -0,0 +1,84 @@
import * as fs from "fs";
import * as path from "path";
import scenario, { AgentAdapter, AgentRole } from "@langwatch/scenario";
import OpenAI from "openai";
import { describe, it, expect } from "vitest";

const setId = "realtime-voice-agent-test-multimodal";
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! });

function getFixtureAudioPath(): string {
  return path.join(__dirname, "fixtures", "sample.wav");
}

function toBase64(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    fs.readFile(filePath, (err, data) => {
      if (err) return reject(err);
      resolve(data.toString("base64"));
    });
  });
}

// Voice agent powered by gpt-4o-audio-preview
const voiceAgent: AgentAdapter = {
  role: AgentRole.AGENT,
  call: async () => {
    const audioPath = getFixtureAudioPath();
    const base64Audio = await toBase64(audioPath);

    const response = await openai.chat.completions.create({
      model: "gpt-4o-audio-preview",
      modalities: ["text", "audio"],
      audio: { format: "wav", voice: "alloy" },
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: "Please summarize what the speaker is saying in this audio recording." },
            { type: "input_audio", input_audio: { data: base64Audio, format: "wav" } },
          ],
        },
      ],
      store: false,
    });

    const reply = response.choices[0].message?.audio?.transcript;
    return reply || "(No response)";
  },
};

describe("Voice Agent Audio Tests (with gpt-4o preview)", () => {
  it("should transcribe and respond to audio", async () => {
    const result = await scenario.run({
      name: "voice agent: gpt-4o-audio-preview",
      description: "Send audio directly to gpt-4o-audio-preview and evaluate agent response",
      agents: [
        voiceAgent,
        scenario.userSimulatorAgent(),
        scenario.judgeAgent({
          criteria: [
            "Agent responds clearly to the audio message",
            "Agent tone is conversational and helpful",
          ],
        }),
      ],
      script: [
        scenario.message({
          role: "user",
          content: `Please summarize what the person is saying in this recording`,
        }),
        scenario.agent(),
        scenario.judge(),
      ],
      setId,
    });

    try {
      expect(result.success).toBe(true);
    } catch (error) {
      console.error(result);
      throw error;
    }
  });
});
19 changes: 19 additions & 0 deletions javascript/pnpm-lock.yaml
