diff --git a/docs/docs/pages/examples/multimodal/multimodal-audio.mdx b/docs/docs/pages/examples/multimodal/multimodal-audio.mdx
index da1661e3..1d3ea997 100644
--- a/docs/docs/pages/examples/multimodal/multimodal-audio.mdx
+++ b/docs/docs/pages/examples/multimodal/multimodal-audio.mdx
@@ -1,5 +1,137 @@
-# Multimodal Audio Analysis (Coming Soon)
+import { RefLink } from "../../../components/RefLink";
 
-This page will cover Scenario tests where the user provides **audio recordings** and the agent must transcribe, interpret, or otherwise respond.
+# Testing AI Voice Agents [Use Case]
 
-Content is under development — check back soon! 🎧🚧
+This page shows how to write a Scenario test where the user provides **audio** input: the recording is transcribed, passed to the agent, and the agent's response is evaluated.
+
+## Overview
+
+Your scenario tests can cover any voice-related situation, for example:
+
+1. User speaks a **simple request** (agent should respond clearly).
+2. User gives a **multi-sentence prompt** (agent should preserve context and intent).
+3. User tone or phrasing implies intent (agent should infer and respond appropriately).
+
+In this example, we use OpenAI’s `whisper-1` model to transcribe `.wav` audio files into text before passing the message to the agent. The response is judged automatically by a judge agent with explicit criteria.
+
+## Code Walk-through
+
+:::code-group
+
+```typescript [typescript]
+import * as fs from "fs";
+import * as path from "path";
+import { openai } from "@ai-sdk/openai";
+import scenario, { AgentAdapter, AgentRole } from "@langwatch/scenario";
+import { describe, it, expect } from "vitest";
+import { generateText } from "ai";
+import OpenAI from "openai";
+
+// 🔧 Set a test set ID (useful for grouping)
+const setId = "realtime-voice-agent-test";
+
+// 1️⃣ Raw OpenAI client for Whisper
+const openaiRaw = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! });
+
+// 2️⃣ Helper to load the fixture audio file
+function getFixtureAudioPath(): string {
+  return path.join(__dirname, "fixtures", "sample.wav");
+}
+
+// 3️⃣ Transcribe audio using Whisper
+async function transcribeAudio(): Promise<string> {
+  const file = fs.createReadStream(getFixtureAudioPath());
+  const response = await openaiRaw.audio.transcriptions.create({
+    file,
+    model: "whisper-1",
+    response_format: "text",
+  });
+  return response;
+}
+
+// 4️⃣ Agent that receives transcribed voice input
+const voiceAgent: AgentAdapter = {
+  role: AgentRole.AGENT,
+  call: async (input) => {
+    const response = await generateText({
+      model: openai("gpt-4o"),
+      messages: [
+        {
+          role: "system",
+          content: `You are a helpful assistant.
+            Respond to the user's transcribed voice message.`,
+        },
+        ...input.messages,
+      ],
+    });
+    return response.text;
+  },
+};
+
+// 5️⃣ Test scenario
+describe("Voice Agent Audio Tests", () => {
+  it("should respond to a transcribed voice message", async () => {
+    const transcript = await transcribeAudio();
+
+    const result = await scenario.run({
+      name: "voice agent: with whisper",
+      description: "Transcribe audio, then test agent response",
+      agents: [
+        voiceAgent,
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          criteria: [
+            "Agent responds clearly to the user's voice message",
+            "Agent tone is conversational and helpful",
+          ],
+        }),
+      ],
+      script: [
+        scenario.message({
+          role: "user",
+          content: `🗣️ (Transcribed) ${transcript}`,
+        }),
+        scenario.agent(),
+        scenario.judge(),
+      ],
+      setId,
+    });
+
+    try {
+      expect(result.success).toBe(true);
+    } catch (error) {
+      console.error(result);
+      throw error;
+    }
+  });
+});
+```
+
+```python [python]
+# Coming soon
+```
+
+:::
+
+### Why Transcription First?
+
+Scenario currently supports text and image content in messages. To test real voice inputs, we transcribe the audio using OpenAI's Whisper model and then inject the result as a regular text message in the script.
+
+This keeps your tests deterministic, repeatable, and compatible with standard judge agents.
+
+## Best Practices
+
+When testing voice scenarios:
+
+- Use **natural speech patterns** in your audio (e.g. pauses, fillers, informal phrasing).
+- Define **clear, objective criteria** for what counts as a good agent response (tone, content, empathy).
+- Group related voice tests under a shared `setId` for easier filtering and reporting.
+
+## Next Steps
+
+- Check out the [images example](/examples/multimodal/multimodal-images) to test your agent with image inputs.
+
+---
+
+### Complete Source
+
+Check out the full test in the repository [here](https://github.com/langwatch/scenario/blob/main/javascript/examples/vitest/tests/voice-agent.test.ts).
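+
+That test sends the recording directly to `gpt-4o-audio-preview` rather than transcribing it first. A minimal sketch of the core call, reusing the `openaiRaw` client from the walkthrough above and assuming a `base64Audio` string produced from the fixture file (see the linked test for the encoding helper):
+
+```typescript
+const response = await openaiRaw.chat.completions.create({
+  model: "gpt-4o-audio-preview",
+  modalities: ["text", "audio"],
+  audio: { format: "wav", voice: "alloy" },
+  messages: [
+    {
+      role: "user",
+      content: [
+        { type: "text", text: "Please summarize what the speaker is saying in this audio recording." },
+        // base64Audio: the .wav fixture encoded as a base64 string (helper shown in the linked test)
+        { type: "input_audio", input_audio: { data: base64Audio, format: "wav" } },
+      ],
+    },
+  ],
+});
+
+// With audio output enabled, the reply text is exposed as the audio transcript
+const reply = response.choices[0].message?.audio?.transcript;
+```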
diff --git a/docs/vocs.config.tsx b/docs/vocs.config.tsx
index 99dbf966..25a8bdf8 100644
--- a/docs/vocs.config.tsx
+++ b/docs/vocs.config.tsx
@@ -173,12 +173,12 @@ export default defineConfig({
               link: "/examples/multimodal/multimodal-images",
             },
             {
-              text: "Files (coming soon)",
-              link: "/examples/multimodal/multimodal-files",
+              text: "Audio",
+              link: "/examples/multimodal/multimodal-audio",
             },
             {
-              text: "Audio (coming soon)",
-              link: "/examples/multimodal/multimodal-audio",
+              text: "Files (coming soon)",
+              link: "/examples/multimodal/multimodal-files",
             },
           ],
         },
diff --git a/javascript/examples/vitest/package.json b/javascript/examples/vitest/package.json
index 52d34e0a..bac450ae 100644
--- a/javascript/examples/vitest/package.json
+++ b/javascript/examples/vitest/package.json
@@ -18,6 +18,7 @@
   "dependencies": {
     "@langwatch/scenario": "workspace:*",
     "ai": ">=4.0.0",
+    "openai": "5.8.3",
     "vitest": "3.2.4"
   }
 }
diff --git a/javascript/examples/vitest/tests/fixtures/sample.wav b/javascript/examples/vitest/tests/fixtures/sample.wav
new file mode 100644
index 00000000..b05ec794
Binary files /dev/null and b/javascript/examples/vitest/tests/fixtures/sample.wav differ
diff --git a/javascript/examples/vitest/tests/voice-agent.test.ts b/javascript/examples/vitest/tests/voice-agent.test.ts
new file mode 100644
index 00000000..ae7e64b7
--- /dev/null
+++ b/javascript/examples/vitest/tests/voice-agent.test.ts
@@ -0,0 +1,84 @@
+import * as fs from "fs";
+import * as path from "path";
+import scenario, { AgentAdapter, AgentRole } from "@langwatch/scenario";
+import OpenAI from "openai";
+import { describe, it, expect } from "vitest";
+
+const setId = "realtime-voice-agent-test-multimodal";
+const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! });
+
+function getFixtureAudioPath(): string {
+  return path.join(__dirname, "fixtures", "sample.wav");
+}
+
+// Read the fixture audio file and return it as a base64-encoded string
+function toBase64(filePath: string): Promise<string> {
+  return new Promise((resolve, reject) => {
+    fs.readFile(filePath, (err, data) => {
+      if (err) return reject(err);
+      resolve(data.toString("base64"));
+    });
+  });
+}
+
+// Voice agent powered by gpt-4o-audio-preview
+const voiceAgent: AgentAdapter = {
+  role: AgentRole.AGENT,
+  call: async () => {
+    const audioPath = getFixtureAudioPath();
+    const base64Audio = await toBase64(audioPath);
+
+    const response = await openai.chat.completions.create({
+      model: "gpt-4o-audio-preview",
+      modalities: ["text", "audio"],
+      audio: { format: "wav", voice: "alloy" },
+      messages: [
+        {
+          role: "user",
+          content: [
+            { type: "text", text: "Please summarize what the speaker is saying in this audio recording." },
+            { type: "input_audio", input_audio: { data: base64Audio, format: "wav" } },
+          ],
+        },
+      ],
+      store: false,
+    });
+
+    // With audio output enabled, the reply text is exposed as the audio transcript
+    const reply = response.choices[0].message?.audio?.transcript;
+    return reply || "(No response)";
+  },
+};
+
+describe("Voice Agent Audio Tests (with gpt-4o preview)", () => {
+  it("should transcribe and respond to audio", async () => {
+    const result = await scenario.run({
+      name: "voice agent: gpt-4o-audio-preview",
+      description: "Send audio directly to gpt-4o-audio-preview and evaluate agent response",
+      agents: [
+        voiceAgent,
+        scenario.userSimulatorAgent(),
+        scenario.judgeAgent({
+          criteria: [
+            "Agent responds clearly to the audio message",
+            "Agent tone is conversational and helpful",
+          ],
+        }),
+      ],
+      script: [
+        scenario.message({
+          role: "user",
+          content: `Please summarize what the person is saying in this recording`,
+        }),
+        scenario.agent(),
+        scenario.judge(),
+      ],
+      setId,
+    });
+
+    try {
+      expect(result.success).toBe(true);
+    } catch (error) {
+      console.error(result);
+      throw error;
+    }
+  });
+});
diff --git a/javascript/pnpm-lock.yaml b/javascript/pnpm-lock.yaml
index a78041a1..ece7bf1f 100644
--- a/javascript/pnpm-lock.yaml
+++ b/javascript/pnpm-lock.yaml
@@ -96,6 +96,9 @@ importers:
       ai:
         specifier: '>=4.0.0'
        version: 4.3.16(react@19.1.0)(zod@3.24.4)
+      openai:
+        specifier: 5.8.3
+        version: 5.8.3(zod@3.24.4)
       vitest:
         specifier: 3.2.4
         version: 3.2.4(@types/node@22.15.15)(tsx@4.19.4)
@@ -2561,6 +2564,18 @@ packages:
     resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==}
     engines: {node: '>=6'}
 
+  openai@5.8.3:
+    resolution: {integrity: sha512-IdotKmquCnpouTRvF9xRXVhMx6K5Sc8zkD6Usf+so+NTQ+qiJ8bLSCd7LBb8b/Rof7uYltlSxJhNp+spphKI4Q==}
+    hasBin: true
+    peerDependencies:
+      ws: ^8.18.0
+      zod: ^3.23.8
+    peerDependenciesMeta:
+      ws:
+        optional: true
+      zod:
+        optional: true
+
   optionator@0.9.4:
     resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==}
     engines: {node: '>= 0.8.0'}
@@ -6280,6 +6295,10 @@ snapshots:
     dependencies:
       mimic-fn: 2.1.0
 
+  openai@5.8.3(zod@3.24.4):
+    optionalDependencies:
+      zod: 3.24.4
+
   optionator@0.9.4:
     dependencies:
       deep-is: 0.1.4