diff --git a/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts b/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts index d0a605d5..ce87688c 100644 --- a/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts +++ b/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts @@ -1,79 +1,59 @@ import { openai } from "@ai-sdk/openai"; -import scenario, { AgentRole } from "@langwatch/scenario"; +import scenario, { AgentRole, audioFromFile } from "@langwatch/scenario"; import { UserModelMessage } from "ai"; import { describe, it, expect } from "vitest"; -import { - encodeAudioToBase64, - getFixturePath, - wrapJudgeForAudioTranscription, -} from "./helpers"; +import { getFixturePath } from "./helpers"; import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent"; class AudioAgent extends OpenAiVoiceAgent { role: AgentRole = AgentRole.AGENT; } -// Use setId to group together for visualizing in the UI const setId = "multimodal-audio-test"; /** * This example shows how to test an agent that can take audio input * from a fixture and respond with audio output. + * + * Uses: + * - audioFromFile() to load audio + * - scenario.message() to inject the audio message + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ describe("Multimodal Audio to Audio Tests", () => { - it("should handle audio input", async () => { + it("should handle audio input from file", async () => { const myAgent = new AudioAgent({ - systemPrompt: ` - You are a helpful assistant that can analyze audio input and respond with audio output. - You must respond with audio output. - `, + systemPrompt: `You are a helpful assistant that analyzes audio input. + Answer questions about the audio content.`, voice: "alloy", forceUserRole: true, }); - const data = encodeAudioToBase64( - getFixturePath("male_or_female_voice.wav"), - ); + // Load audio file using the utility + const audio = audioFromFile(getFixturePath("male_or_female_voice.wav")); - // The AI-SDK will only support file parts, - // so we cannot use the OpenAI shape from above - // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts - const audioMessage = { + // Create audio message with instructions + const audioMessage: UserModelMessage = { role: "user", content: [ - { - type: "text", - text: ` - Answer the question in the a text. - If you're not sure, you're required to take a best guess. - After you've guessed, you must repeat the question and say what format the input was in (audio or text) - `, - }, - { - type: "file", - mediaType: "audio/wav", - data, - }, + { type: "text", text: "Is this a male or female voice? Take a guess." 
}, + { type: "file", mediaType: audio.mediaType, data: audio.data }, ], - } satisfies UserModelMessage; - - const audioJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-4o"), - criteria: [ - "The agent correctly guesses it's a male voice", - "The agent repeats the question", - "The agent says what format the input was in (audio or text)", - ], - }), - ); + }; const result = await scenario.run({ setId, - name: "multimodal audio to audio", - description: - "User sends audio file, agent analyzes and transcribes the content", - agents: [myAgent, scenario.userSimulatorAgent(), audioJudge], + name: "audio to audio - file input", + description: "User sends audio file, agent analyzes and responds", + agents: [ + myAgent, + scenario.userSimulatorAgent(), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["The agent guesses the voice gender"], + audio: true, + }), + ], script: [ scenario.message(audioMessage), scenario.agent(), @@ -94,7 +74,7 @@ describe("Multimodal Audio to Audio Tests", () => { it.todo("should handle multiple audio formats (WAV, MP3, M4A)"); it.todo("should handle long audio files gracefully"); it.todo( - "should provide appropriate responses for unclear or corrupted audio", + "should provide appropriate responses for unclear or corrupted audio" ); it.todo("should handle audio with background noise"); it.todo("should transcribe speech in different languages"); diff --git a/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts b/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts index 027b2a07..e98ea146 100644 --- a/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts +++ b/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts @@ -3,39 +3,31 @@ import scenario, { AgentAdapter, AgentInput, AgentRole, + audioFromFile, } from "@langwatch/scenario"; import { UserModelMessage } from "ai"; import OpenAI from "openai"; import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs"; import { describe, it, expect } from "vitest"; -import { - encodeAudioToBase64, - getFixturePath, - wrapJudgeForAudioTranscription, -} from "./helpers"; +import { getFixturePath } from "./helpers"; import { convertModelMessagesToOpenAIMessages } from "./helpers/convert-core-messages-to-openai"; -class AudioAgent extends AgentAdapter { +/** + * Agent that takes audio input and responds with text + */ +class AudioToTextAgent extends AgentAdapter { role: AgentRole = AgentRole.AGENT; private openai = new OpenAI(); call = async (input: AgentInput) => { - // To use the OpenAI "voice-to-voice" model, we need to use the - // OpenAI api directly, and so we need to convert the messages to the correct - // shape here. 
- // @see https://platform.openai.com/docs/guides/audio?example=audio-in const messages = convertModelMessagesToOpenAIMessages(input.messages); const response = await this.respond(messages); - - // Scenario expects the response to be a string, so we only send the transcript const transcript = response.choices[0].message?.audio?.transcript; - // Handle text response if (typeof transcript === "string") { return transcript; - } else { - throw new Error("Agent failed to generate a response"); } + throw new Error("Agent failed to generate a response"); }; private async respond(messages: ChatCompletionMessageParam[]) { @@ -43,64 +35,48 @@ class AudioAgent extends AgentAdapter { model: "gpt-4o-audio-preview", modalities: ["text", "audio"], audio: { voice: "alloy", format: "wav" }, - // We need to strip the id, or the openai client will throw an error messages, store: false, }); } } -// Use setId to group together for visualizing in the UI const setId = "multimodal-audio-test"; /** - * This example shows how to test an agent that can take audio input - * and respond with text output. + * This example shows how to test an agent that takes audio input + * and responds with text output. + * + * Uses: + * - audioFromFile() to load audio + * - scenario.message() to inject the audio message + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ describe("Multimodal Audio to Text Tests", () => { - it("should handle audio input", async () => { - const data = encodeAudioToBase64( - getFixturePath("male_or_female_voice.wav"), - ); + it("should handle audio input from file", async () => { + // Load audio file + const audio = audioFromFile(getFixturePath("male_or_female_voice.wav")); - // The AI-SDK will only support file parts, - // so we cannot use the OpenAI shape from above - // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts - const audioMessage = { + const audioMessage: UserModelMessage = { role: "user", content: [ - { - type: "text", - text: ` - Answer the question in the audio. - If you're not sure, you're required to take a best guess. - After you've guessed, you must repeat the question and say what format the input was in (audio or text) - `, - }, - { - type: "file", - mediaType: "audio/wav", - data, - }, + { type: "text", text: "Is this a male or female voice?" 
}, + { type: "file", mediaType: audio.mediaType, data: audio.data }, ], - } satisfies UserModelMessage; - - const audioJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-5"), - criteria: [ - "The agent guesses it's a male voice", - "The agent repeats the question", - "The agent says what format the input was in (audio or text)", - ], - }), - ); + }; const result = await scenario.run({ - name: "multimodal audio to text", - description: - "User sends audio file, agent analyzes and transcribes the content", - agents: [new AudioAgent(), scenario.userSimulatorAgent(), audioJudge], + name: "audio to text", + description: "User sends audio, agent responds with text", + agents: [ + new AudioToTextAgent(), + scenario.userSimulatorAgent(), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["The agent identifies the voice gender"], + audio: true, + }), + ], script: [ scenario.message(audioMessage), scenario.agent(), @@ -122,7 +98,7 @@ describe("Multimodal Audio to Text Tests", () => { it.todo("should handle multiple audio formats (WAV, MP3)"); it.todo("should handle long audio files gracefully"); it.todo( - "should provide appropriate responses for unclear or corrupted audio", + "should provide appropriate responses for unclear or corrupted audio" ); it.todo("should handle audio with background noise"); it.todo("should transcribe speech in different languages"); diff --git a/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts b/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts index b31efbb1..0035f4bf 100644 --- a/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts +++ b/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts @@ -1,150 +1,194 @@ /** * Multimodal Voice-to-Voice Conversation Tests * - * This test suite demonstrates a complete audio-to-audio conversation flow where: - * - A user simulator agent generates audio questions - * - A main agent responds with audio answers - * - Both communicate entirely through voice (no text) - * - The conversation is judged for quality - * - The full audio is saved for review + * This test suite demonstrates voice-first-class primitives for audio conversations: * - * This showcases: - * - Custom agent implementations with voice capabilities - * - Multi-turn voice conversations - * - Audio message handling and persistence - * - Judge agent integration with audio transcription - * - Role reversal for user simulation + * Voice Script Primitives: + * - scenario.user.speak("text") — Fixed user message converted to audio via TTS + * - scenario.agent.speak("text") — Fixed agent message converted to audio via TTS + * - scenario.user() with voice sim — Generated audio responses + * + * Voice User Simulator: + * - scenario.userSimulatorAgent({ voice: "nova" }) — Generates audio instead of text + * + * Audio-Aware Judge: + * - scenario.judgeAgent({ audio: true }) — Evaluates audio content directly */ import * as path from "path"; import { openai } from "@ai-sdk/openai"; -import scenario, { AgentInput, AgentRole } from "@langwatch/scenario"; -import { ModelMessage } from "ai"; +import scenario, { AgentRole, StringUtils } from "@langwatch/scenario"; import { describe, it, expect } from "vitest"; -import { - OpenAiVoiceAgent, - saveConversationAudio, - wrapJudgeForAudioTranscription, -} from "./helpers"; -import { messageRoleReversal } from "../../../src/agents/utils"; +import { OpenAiVoiceAgent, 
saveConversationAudio } from "./helpers"; /** - * Main agent that responds with helpful audio answers - * Uses "echo" voice for a distinct sound + * Voice agent that responds with audio */ -class MyAgent extends OpenAiVoiceAgent { +class VoiceAgent extends OpenAiVoiceAgent { role: AgentRole = AgentRole.AGENT; constructor() { super({ - systemPrompt: `You are a helpful and engaging AI assistant. - Respond naturally and conversationally since this is an audio conversation. - Be informative but keep your responses short, concise and engaging. - Adapt your speaking style to be natural for audio.`, + systemPrompt: `You are a helpful AI assistant having a voice conversation. + Keep responses short and conversational.`, voice: "echo", }); } } -/** - * User simulator that generates audio questions - * - * This agent: - * - Plays the role of a curious user asking questions - * - Generates audio responses (not text) - * - Uses role reversal to properly simulate user behavior - * - Automatically ends conversation after 2 exchanges - * - Uses "nova" voice to differentiate from main agent - */ -class AudioUserSimulatorAgent extends OpenAiVoiceAgent { - role: AgentRole = AgentRole.USER; - - constructor() { - super({ - systemPrompt: ` - You are role playing as a curious user looking for information about AI agentic testing, - but you're a total novice and don't know anything about it. - - Be natural and conversational in your speech patterns. - This is an audio conversation, so speak as you would naturally talk. +const setId = "voice-conversation-tests"; +const outputPath = path.join(process.cwd(), "tmp", "audio_conversations"); - After 2 responses from the other speaker, say "I'm done with this conversation" and say goodbye. - - YOUR LANGUAGE IS ENGLISH. - `, - voice: "nova", +describe("Voice-to-Voice Conversation Tests", () => { + /** + * Example 1: Fixed voice messages using .speak() + * + * Use scenario.user.speak() and scenario.agent.speak() when you want + * specific text converted to audio via TTS. + */ + it("should handle fixed voice messages with .speak()", async () => { + const result = await scenario.run({ + name: "fixed voice messages", + description: "Test with predetermined voice messages", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ + voice: "nova", + }), // Voice sim (not invoked in this script; user turns are fixed via .speak()) + scenario.judgeAgent({ + criteria: ["Agent responds appropriately to greeting"], + // Optionally set audio: true or "transcribe" for audio-aware judging + }), + ], + script: [ + // Fixed user voice message via TTS + scenario.user.speak("Hello! 
Can you help me with something?"), + scenario.agent(), // Agent generates audio response + async (ctx) => { + await saveConversationAudio( + ctx, + path.join( + outputPath, + `${StringUtils.kebabCase(ctx.config.name)}.wav` + ) + ); + }, + scenario.judge(), + ], + setId, }); - } - public async call(input: AgentInput): Promise { - /** - * Role reversal is critical here: - * - The agent sees "user" messages as if they're from the assistant - * - This allows the agent to respond AS the user - * - Without this, the conversation flow would be backwards - */ - const messages = messageRoleReversal(input.messages); - return super.call({ - ...input, - messages, - }); - } -} + expect(result.success).toBe(true); + }); -// Group related test runs together in the UI -const setId = "full-audio-conversation-test"; + /** + * Example 2: Voice user simulator generates audio + * + * Use scenario.userSimulatorAgent({ voice }) to have the simulator + * generate contextual responses as audio. + */ + it("should handle voice user simulator", async () => { + const result = await scenario.run({ + name: "voice user simulator", + description: + "User is asking about cooking, keeping the conversation short and natural for voice.", + agents: [ + new VoiceAgent(), + // Voice user sim - generates audio via TTS + scenario.userSimulatorAgent({ + voice: "nova", + systemPrompt: `You are a curious user asking about cooking. + Keep questions short and natural for voice.`, + }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Conversation flows naturally"], + audio: true, + }), + ], + script: [ + scenario.user(), // Voice sim generates audio + scenario.agent(), // Agent responds with audio + scenario.user(), // Voice sim generates follow-up audio + scenario.agent(), + scenario.judge(), + ], + setId, + }); -// Output path for the full conversation audio file -const outputPath = path.join( - process.cwd(), - "tmp", - "audio_conversations", - "full-conversation.wav" -); + expect(result.success).toBe(true); + }); -describe("Multimodal Voice-to-Voice Conversation Tests", () => { - it("should handle complete audio-to-audio conversation", async () => { - // Initialize both agents for the conversation - const audioUserSimulator = new AudioUserSimulatorAgent(); - const audioAgent = new MyAgent(); + /** + * Example 3: Mixed text and voice in same conversation + * + * You can mix text and voice messages. The voice sim only generates + * audio when scenario.user() has no arguments. 
+ */ + it("should handle mixed text and voice messages", async () => { + const result = await scenario.run({ + name: "mixed text and voice", + description: "Combining text and voice in one conversation", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ voice: "nova" }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Agent handles both text and voice input"], + audio: true, + }), + ], + script: [ + // Text message (not converted to audio) + scenario.user("Hi, I need a recipe suggestion"), + scenario.agent(), + // Voice sim generates audio follow-up + scenario.user(), + scenario.agent(), + // Fixed voice message via TTS + scenario.user.speak("Thanks, that sounds delicious!"), + scenario.agent(), + scenario.judge(), + ], + setId, + }); - // Create judge agent to evaluate conversation quality - // Wrap with audio handler to transcribe audio before judging - const conversationJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-4o"), - criteria: ["The conversation flows naturally between user and agent"], - }) - ); + expect(result.success).toBe(true); + }); - // Execute the full audio conversation scenario + /** + * Example 4: Multi-turn voice conversation with audio export + */ + it("should handle multi-turn voice conversation", async () => { const result = await scenario.run({ - name: "full audio-to-audio conversation", - description: - "Complete audio conversation between user simulator and agent over multiple turns", - agents: [audioAgent, audioUserSimulator, conversationJudge], + name: "multi-turn voice conversation", + description: "Extended voice conversation with audio export", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ + voice: "nova", + systemPrompt: `You are learning about AI testing. + Ask 2-3 questions then say goodbye.`, + }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Conversation is informative and helpful"], + audio: true, + }), + ], script: [ - // Step 1: Run 2 conversation turns between user simulator and agent - scenario.proceed(2), - - // Step 2: Save the full conversation as a single audio file + scenario.proceed(3), // 3 turns of voice conversation async (ctx) => { - await saveConversationAudio(ctx, outputPath); + await saveConversationAudio( + ctx, + path.join(outputPath, "multi-turn.wav") + ); }, - - // Step 3: Have judge evaluate the conversation quality scenario.judge(), ], setId, }); - try { - console.log("FULL AUDIO CONVERSATION RESULT", result); - - expect(result.success).toBe(true); - } catch (error) { - console.error("Full audio conversation failed:", result); - throw error; - } + expect(result.success).toBe(true); }); /** diff --git a/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts b/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts index 17202179..df13c0a7 100644 --- a/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts +++ b/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts @@ -8,6 +8,9 @@ * 1. Browser: createScenarioExpertSession() → connect with token → use directly * 2. Test: createScenarioExpertSession() → connect with API key → wrap in adapter * 3. SAME session creation = accurate testing! 
+ * + * Uses voice-first-class primitives: + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ import scenario, { @@ -16,7 +19,6 @@ import scenario, { type AudioResponseEvent, } from "@langwatch/scenario"; import { describe, it, expect, beforeAll, afterAll } from "vitest"; -import { wrapJudgeForAudioTranscription } from "./helpers/wrap-judge-for-audio-transcription"; import { AudioUtils } from "./utils/audio/audio.utils"; import { createUserSimulatorSession } from "../../openai-realtime-demo/agents/realtime-user-simulator.agent"; import { createScenarioExpertSession } from "../../openai-realtime-demo/agents/scenario-expert.agent"; @@ -92,15 +94,14 @@ describe("Scenario Expert Agent (Realtime API)", () => { agents: [ realtimeAdapter, // Realtime agent (handles audio!) audioUserSim, // Audio user simulator (generates voice) - wrapJudgeForAudioTranscription( - // Judge with audio transcription - scenario.judgeAgent({ - criteria: [ - "Agent explains what LangWatch Scenario is", - "Agent is helpful and informative", - ], - }) - ), + // Judge with multimodal audio evaluation + scenario.judgeAgent({ + criteria: [ + "Agent explains what LangWatch Scenario is", + "Agent is helpful and informative", + ], + audio: true, + }), ], script: [ scenario.agent( @@ -110,7 +111,7 @@ describe("Scenario Expert Agent (Realtime API)", () => { scenario.agent(), // Audio response scenario.user(), // Audio follow-up scenario.agent(), // Audio response - scenario.judge(), // Evaluates transcripts + scenario.judge(), // Evaluates audio ], setId: "realtime-examples", }); diff --git a/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts b/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts index d999694d..a319051b 100644 --- a/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts +++ b/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts @@ -8,6 +8,9 @@ * 1. Browser: createVegetarianRecipeSession() → connect with token → use directly * 2. Test: createVegetarianRecipeSession() → connect with API key → wrap in adapter * 3. SAME session creation = accurate testing! + * + * Uses voice-first-class primitives: + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ import scenario, { @@ -16,7 +19,6 @@ import scenario, { type AudioResponseEvent, } from "@langwatch/scenario"; import { describe, it, expect, beforeAll, afterAll } from "vitest"; -import { wrapJudgeForAudioTranscription } from "./helpers/wrap-judge-for-audio-transcription"; import { AudioUtils } from "./utils/audio/audio.utils"; import { createUserSimulatorSession } from "../../openai-realtime-demo/agents/realtime-user-simulator.agent"; import { createVegetarianRecipeSession } from "../../openai-realtime-demo/agents/vegetatrian-recipe.agent"; @@ -111,17 +113,16 @@ describe("Vegetarian Recipe Agent (Realtime API)", () => { agents: [ realtimeAdapter, // Realtime agent (handles audio!) 
audioUserSim, // Audio user simulator (generates voice) - wrapJudgeForAudioTranscription( - // Judge with audio transcription - scenario.judgeAgent({ - criteria: [ - "Agent should provide a vegetarian recipe", - "Recipe should include ingredients", - "Recipe should include cooking steps", - "Agent should be helpful and encouraging", - ], - }) - ), + // Judge with multimodal audio evaluation + scenario.judgeAgent({ + criteria: [ + "Agent should provide a vegetarian recipe", + "Recipe should include ingredients", + "Recipe should include cooking steps", + "Agent should be helpful and encouraging", + ], + audio: true, + }), ], script: [ scenario.user( diff --git a/javascript/src/agents/judge/judge-agent.ts b/javascript/src/agents/judge/judge-agent.ts index 39029cb0..c92bf49d 100644 --- a/javascript/src/agents/judge/judge-agent.ts +++ b/javascript/src/agents/judge/judge-agent.ts @@ -1,6 +1,9 @@ -import { generateText, CoreMessage, ToolSet, Tool, ToolChoice, tool } from "ai"; +import { generateText, CoreMessage, ToolSet, Tool, ToolChoice, tool, ModelMessage } from "ai"; +import OpenAI from "openai"; +import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs"; import { z } from "zod/v4"; import { JudgeResult } from "./interfaces"; +import { transcribeAudioInMessages } from "../../audio/transcribe"; import { getProjectConfig } from "../../config"; import { AgentInput, JudgeAgentAdapter, AgentRole } from "../../domain"; import { modelSchema } from "../../domain/core/schemas/model.schema"; @@ -20,13 +23,35 @@ export interface JudgeAgentConfig extends TestingAgentConfig { * The criteria that the judge will use to evaluate the conversation. */ criteria: string[]; + /** + * Audio handling mode: + * - `"transcribe"` — Transcribe audio to text via Whisper, then evaluate text + * - `true` — Pass audio directly to multimodal model (e.g., gpt-4o-audio-preview) + * - `false`/`undefined` — No special audio handling (default) + */ + audio?: boolean | "transcribe"; } -function buildSystemPrompt(criteria: string[], description: string): string { +function buildSystemPrompt( + criteria: string[], + description: string, + audioEnabled?: boolean +): string { const criteriaList = criteria?.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n") || "No criteria provided"; + const audioInstructions = audioEnabled + ? ` + +This conversation includes audio messages. Listen to the audio content directly. +When evaluating: +- Consider tone, clarity, and delivery when relevant to criteria +- Evaluate both the content (what was said) and presentation (how it was said) + +` + : ""; + return ` You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not. @@ -44,7 +69,7 @@ ${description} ${criteriaList} - +${audioInstructions} - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria. 
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary @@ -52,6 +77,70 @@ ${criteriaList} `.trim(); } +function convertModelMessagesToOpenAIMessages( + coreMessages: (ModelMessage & { id?: string })[] +): ChatCompletionMessageParam[] { + return coreMessages.map(({ id: _id, ...msg }): ChatCompletionMessageParam => { + // Handle array content (multimodal messages) + if (Array.isArray(msg.content)) { + const parts = msg.content.map((part) => { + if (part.type === "text") { + return { type: "text" as const, text: part.text }; + } + if (part.type === "file" && part.mediaType?.startsWith("audio/")) { + return { + type: "input_audio" as const, + input_audio: { + data: part.data as string, + format: "wav" as const, + }, + }; + } + return { type: "text" as const, text: "" }; + }); + return { + role: msg.role as "user" | "assistant" | "system", + content: parts, + } as ChatCompletionMessageParam; + } + return { + role: msg.role as "user" | "assistant" | "system", + content: msg.content, + } as ChatCompletionMessageParam; + }); +} + +/** + * Convert AI SDK tools to OpenAI function format. + */ +function convertToolsToOpenAIFunctions(tools: ToolSet): OpenAI.Chat.Completions.ChatCompletionTool[] { + return Object.entries(tools).map(([name, t]) => ({ + type: "function" as const, + function: { + name, + description: t.description ?? "", + parameters: (t as { inputSchema?: Record<string, unknown> }).inputSchema ?? { type: "object" }, + }, + })); +} + +/** + * Convert OpenAI tool calls to AI SDK format. + */ +function convertOpenAIToolCalls( + toolCalls: Array<{ id: string; function: { name: string; arguments: string } }> | undefined ): InvokeLLMResult["toolCalls"] { + if (!toolCalls) return []; + return toolCalls.map((tc) => ({ + type: "tool-call" as const, + toolCallId: tc.id, + toolName: tc.function.name, + args: JSON.parse(tc.function.arguments), + input: JSON.parse(tc.function.arguments), + })); +} + + function buildContinueTestTool(): Tool { return tool({ description: "Continue the test with the next step", @@ -100,11 +189,51 @@ class JudgeAgent extends JudgeAgentAdapter { role: AgentRole = AgentRole.JUDGE; criteria: string[]; + /** + * Detects if messages contain audio file parts. + */ + private hasAudioContent(messages: CoreMessage[]): boolean { + return messages.some((message) => { + if (!Array.isArray(message.content)) return false; + return message.content.some( + (part) => + part.type === "file" && part.mediaType?.startsWith("audio/") + ); + }); + } + /** * LLM invocation function. Can be overridden to customize LLM behavior. + * Automatically uses OpenAI directly when audio content is detected. */ invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult> = async (params) => { try { + const hasAudio = this.hasAudioContent(params.messages ?? []); + + if (hasAudio) { + const openai = new OpenAI(); + const tools = params.tools ? convertToolsToOpenAIFunctions(params.tools) : undefined; + + const response = await openai.chat.completions.create({ + model: "gpt-4o-audio-preview", + modalities: ["text"], + audio: { voice: "alloy", format: "wav" }, + messages: convertModelMessagesToOpenAIMessages(params.messages ?? []), + tools, + tool_choice: params.toolChoice === "required" ? "required" : undefined, + store: false, + }); + + const message = response.choices[0]?.message; + const text = message?.content ?? 
""; + return { + text, + content: [{ type: "text" as const, text }], + toolCalls: convertOpenAIToolCalls(message?.tool_calls as Array<{ id: string; function: { name: string; arguments: string } }> | undefined), + toolResults: [], + }; + } + return await generateText(params); } catch (error) { this.logger.error("Error generating text", { error }); @@ -121,12 +250,19 @@ class JudgeAgent extends JudgeAgentAdapter { async call(input: AgentInput): Promise { const cfg = this.cfg; + // audio: "transcribe" → transcribe audio to text first + // audio: true → pass audio directly to multimodal model + // audio: false/undefined → no special handling + const processedMessages = cfg.audio === "transcribe" + ? await transcribeAudioInMessages(input.messages) + : input.messages; + const systemPrompt = cfg.systemPrompt ?? - buildSystemPrompt(cfg.criteria, input.scenarioConfig.description); + buildSystemPrompt(cfg.criteria, input.scenarioConfig.description, cfg.audio === true); const messages: CoreMessage[] = [ { role: "system", content: systemPrompt }, - ...input.messages, + ...processedMessages, ]; const isLastMessage = @@ -232,12 +368,17 @@ class JudgeAgent extends JudgeAgentAdapter { * detailed reasoning for its verdicts. It evaluates each criterion independently * and provides comprehensive feedback about what worked and what didn't. * + * Supports both text and audio evaluation: + * - Text (default): Evaluates text content only + * - Audio: When `audio: true`, uses multimodal model to evaluate audio directly + * * @param cfg Configuration for the judge agent. * @param cfg.criteria List of success criteria to evaluate against. * @param cfg.model Optional The language model to use for generating responses. * @param cfg.temperature Optional The temperature to use for the model. * @param cfg.maxTokens Optional The maximum number of tokens to generate. * @param cfg.systemPrompt Optional Custom system prompt to override default judge behavior. + * @param cfg.audio When true, evaluates audio directly using multimodal model. * * @example * ```typescript @@ -251,7 +392,8 @@ class JudgeAgent extends JudgeAgentAdapter { * }; * * async function main() { - * const result = await run({ + * // Text-only evaluation + * const textResult = await run({ * name: "Judge Agent Test", * description: "A simple test to see if the judge agent works.", * agents: [ @@ -265,6 +407,28 @@ class JudgeAgent extends JudgeAgentAdapter { * agent(), * ], * }); + * + * // Audio evaluation (requires multimodal model) + * const audioResult = await run({ + * name: "Voice Judge Test", + * description: "Testing voice agent behavior", + * agents: [ + * myVoiceAgent, + * userSimulatorAgent({ voice: "nova" }), + * judgeAgent({ + * criteria: [ + * "Agent maintains professional tone", + * "Agent addresses the user's concern", + * ], + * audio: true, + * }), + * ], + * script: [ + * user.speak("I need help with billing"), + * agent(), + * judge(), + * ], + * }); * } * main(); * ``` diff --git a/javascript/src/agents/types.ts b/javascript/src/agents/types.ts index f87a2ee7..1c066022 100644 --- a/javascript/src/agents/types.ts +++ b/javascript/src/agents/types.ts @@ -1,5 +1,6 @@ import { generateText } from "ai"; import { ModelConfig } from "../domain/core/schemas/model.schema"; +import type { Voice } from "../audio/types"; /** * Parameters for LLM invocation. @@ -34,6 +35,22 @@ export interface TestingAgentConfig extends Partial { systemPrompt?: string; } +/** + * Configuration for voice-enabled user simulator agent. 
+ */ +export interface VoiceUserSimulatorConfig extends TestingAgentConfig { + /** + * Voice to use for TTS output. + * When set, the agent outputs audio instead of text. + */ + voice?: Voice; + /** + * Output audio format. + * @default "wav" + */ + audioFormat?: "wav" | "mp3" | "opus" | "aac" | "flac" | "pcm"; +} + /** * The arguments for finishing a test, used by the judge agent's tool. */ diff --git a/javascript/src/agents/user-simulator-agent.ts b/javascript/src/agents/user-simulator-agent.ts index c8289700..2c993ff4 100644 --- a/javascript/src/agents/user-simulator-agent.ts +++ b/javascript/src/agents/user-simulator-agent.ts @@ -1,10 +1,11 @@ import { generateText, CoreMessage } from "ai"; -import { TestingAgentConfig, InvokeLLMParams, InvokeLLMResult } from "./types"; +import { VoiceUserSimulatorConfig, InvokeLLMParams, InvokeLLMResult } from "./types"; import { messageRoleReversal } from "./utils"; import { getProjectConfig } from "../config"; import { AgentInput, UserSimulatorAgentAdapter } from "../domain"; import { modelSchema } from "../domain/core/schemas/model.schema"; import { Logger } from "../utils/logger"; +import { textToSpeech } from "../audio/text-to-speech"; function buildSystemPrompt(description: string): string { return ` @@ -42,7 +43,7 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { } }; - constructor(private readonly cfg?: TestingAgentConfig) { + constructor(private readonly cfg?: VoiceUserSimulatorConfig) { super(); } @@ -83,6 +84,21 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { throw new Error("No response content from LLM"); } + // If voice is configured, convert text to speech + if (config?.voice) { + const audio = await textToSpeech(messageContent, { + voice: config.voice, + format: config.audioFormat, + }); + return { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + } satisfies CoreMessage; + } + return { role: "user", content: messageContent } satisfies CoreMessage; }; } @@ -95,6 +111,10 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * It uses an LLM to generate natural, contextually relevant user inputs that help * drive the conversation forward according to the scenario description. * + * Supports both text and voice output: + * - Text output (default): Returns text messages + * - Voice output: When `voice` is set, outputs audio via TTS + * * @param config Optional configuration for the agent. * @param config.model The language model to use for generating responses. * If not provided, a default model will be used. @@ -106,6 +126,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * @param config.name The name of the agent. * @param config.systemPrompt Custom system prompt to override default user simulation behavior. * Use this to create specialized user personas or behaviors. + * @param config.voice Voice to use for TTS output. When set, outputs audio instead of text. + * @param config.audioFormat Output audio format (wav, mp3, etc). Defaults to wav. * * @throws {Error} If no model is configured either in parameters or global config. 
* @@ -121,8 +143,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * }; * * async function main() { - * // Basic user simulator with default behavior - * const basicResult = await run({ + * // Basic user simulator with text output + * const textResult = await run({ * name: "User Simulator Test", * description: "A simple test to see if the user simulator works.", * agents: [myAgent, userSimulatorAgent()], @@ -132,40 +154,32 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * ], * }); * - * // Customized user simulator - * const customResult = await run({ - * name: "Expert User Test", - * description: "User seeks help with TypeScript programming", + * // Voice user simulator - outputs audio + * const voiceResult = await run({ + * name: "Voice User Test", + * description: "User interacts via voice", * agents: [ * myAgent, - * userSimulatorAgent({ - * model: openai("gpt-4"), - * temperature: 0.3, - * systemPrompt: "You are a technical user who asks detailed questions" - * }) + * userSimulatorAgent({ voice: "nova" }) * ], * script: [ - * user(), + * user(), // Outputs audio * agent(), * ], * }); * - * // User simulator with custom persona - * const expertResult = await run({ - * name: "Expert Developer Test", - * description: "Testing with a technical expert user persona.", + * // Mixed: text input, voice output + * const mixedResult = await run({ + * name: "Mixed Modality Test", + * description: "Text input with voice simulation", * agents: [ * myAgent, - * userSimulatorAgent({ - * systemPrompt: ` - * You are an expert software developer testing an AI coding assistant. - * Ask challenging, technical questions and be demanding about code quality. - * Use technical jargon and expect detailed, accurate responses. - * ` - * }) + * userSimulatorAgent({ voice: "echo", audioFormat: "mp3" }) * ], * script: [ - * user(), + * user("Help me with billing"), // Fixed text + * agent(), + * user(), // Voice sim generates audio * agent(), * ], * }); @@ -175,7 +189,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * * **Implementation Notes:** * - Uses role reversal internally to work around LLM biases toward assistant roles + * - Voice output uses OpenAI TTS API */ -export const userSimulatorAgent = (config?: TestingAgentConfig) => { +export const userSimulatorAgent = (config?: VoiceUserSimulatorConfig) => { return new UserSimulatorAgent(config); }; diff --git a/javascript/src/audio/index.ts b/javascript/src/audio/index.ts new file mode 100644 index 00000000..a014f0d0 --- /dev/null +++ b/javascript/src/audio/index.ts @@ -0,0 +1,11 @@ +/** + * Audio module for voice-first-class support in Scenario. + * + * Provides types, utilities, and TTS capabilities for working with + * audio in scenario scripts. + */ + +export * from "./types"; +export * from "./utils"; +export * from "./text-to-speech"; +export * from "./transcribe"; diff --git a/javascript/src/audio/text-to-speech.ts b/javascript/src/audio/text-to-speech.ts new file mode 100644 index 00000000..c346c321 --- /dev/null +++ b/javascript/src/audio/text-to-speech.ts @@ -0,0 +1,60 @@ +/** + * Text-to-speech utility using OpenAI TTS API. + */ +import OpenAI from "openai"; +import type { AudioData, TextToSpeechOptions, Voice } from "./types"; + +/** + * Default voice for TTS. + */ +const DEFAULT_VOICE: Voice = "nova"; + +/** + * Default audio format for TTS output. + */ +const DEFAULT_FORMAT = "wav" as const; + +/** + * Converts text to speech using OpenAI TTS API. + * + * @param text - Text to convert to speech. 
+ * @param options - TTS options including voice and format. + * @returns AudioData with the synthesized audio. + */ +export async function textToSpeech( + text: string, + options?: TextToSpeechOptions +): Promise<AudioData> { + const voice = options?.voice ?? DEFAULT_VOICE; + const format = options?.format ?? DEFAULT_FORMAT; + + const openai = new OpenAI(); + + const response = await openai.audio.speech.create({ + model: "tts-1", + voice, + input: text, + response_format: format, + }); + + const arrayBuffer = await response.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + const data = buffer.toString("base64"); + + // AudioMimeType has no aac/flac entries, so those formats fall back to generic types here + const mediaType = + format === "mp3" + ? "audio/mp3" + : format === "opus" + ? "audio/ogg" + : format === "aac" + ? "audio/mpeg" + : format === "flac" + ? "audio/wav" + : "audio/wav"; + + return { + data, + mediaType, + transcript: text, + }; +} diff --git a/javascript/src/audio/transcribe.ts b/javascript/src/audio/transcribe.ts new file mode 100644 index 00000000..4cd29abe --- /dev/null +++ b/javascript/src/audio/transcribe.ts @@ -0,0 +1,84 @@ +/** + * Audio transcription utilities for converting audio to text. + * + * Uses OpenAI Whisper API for transcription with caching to avoid + * re-transcribing the same audio multiple times. + */ +import { CoreMessage } from "ai"; +import OpenAI from "openai"; + +/** + * Cache mapping base64 audio data to transcribed text. + */ +const transcriptionCache = new Map<string, string>(); + +/** + * Transcribes audio data to text using OpenAI Whisper. + * + * @param audioData - Base64-encoded audio data. + * @param mediaType - MIME type of the audio (e.g., "audio/wav"). + * @returns Transcribed text. + */ +export async function transcribeAudio( + audioData: string, + mediaType: string = "audio/wav" +): Promise<string> { + const cached = transcriptionCache.get(audioData); + if (cached) return cached; + + try { + const openai = new OpenAI(); + const ext = mediaType.split("/")[1] || "wav"; + const response = await openai.audio.transcriptions.create({ + model: "whisper-1", + file: new File([Buffer.from(audioData, "base64")], `audio.${ext}`, { + type: mediaType, + }), + }); + transcriptionCache.set(audioData, response.text); + return response.text; + } catch (error) { + console.error("Error transcribing audio:", error); + return "[Audio: transcription failed]"; + } +} + +/** + * Converts audio parts in messages to text transcriptions. + * + * Scans all message content for audio file parts, transcribes them + * using OpenAI Whisper, and returns messages with audio converted to text. + * + * @param messages - Original messages potentially containing audio. + * @returns Messages with audio converted to text transcriptions. 
*/ +export async function transcribeAudioInMessages( + messages: CoreMessage[] +): Promise<CoreMessage[]> { + return await Promise.all( + messages.map(async (message) => { + if (message.role === "tool") { + return message; + } + + if (Array.isArray(message.content)) { + const textParts = await Promise.all( + message.content.map(async (part) => { + if (part.type === "text") return part.text; + if (part.type === "file" && part.mediaType?.startsWith("audio/")) { + return await transcribeAudio( + part.data as string, + part.mediaType + ); + } + return ""; + }) + ); + + const textContent = textParts.filter(Boolean).join(" "); + return { ...message, content: textContent || "[Audio message]" }; + } + return message; + }) + ); +} diff --git a/javascript/src/audio/types.ts b/javascript/src/audio/types.ts new file mode 100644 index 00000000..dfe8bde0 --- /dev/null +++ b/javascript/src/audio/types.ts @@ -0,0 +1,66 @@ +/** + * Audio types and utilities for voice-first-class support in Scenario. + * + * Provides core types for representing audio data and utilities for + * loading, encoding, and converting audio between formats. + */ + +/** + * Supported audio MIME types. + */ +export type AudioMimeType = + | "audio/wav" + | "audio/mp3" + | "audio/mpeg" + | "audio/ogg" + | "audio/webm" + | "audio/pcm"; + +/** + * Supported TTS voice options. + * Based on OpenAI TTS voices. + */ +export type Voice = "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer"; + +/** + * Represents audio data with metadata. + */ +export interface AudioData { + /** + * Base64-encoded audio data. + */ + data: string; + + /** + * Media type of the audio (MIME type). + * Uses 'mediaType' to match AI SDK conventions. + */ + mediaType: AudioMimeType; + + /** + * Optional transcript of the audio content. + */ + transcript?: string; + + /** + * Optional duration in milliseconds. + */ + durationMs?: number; +} + +/** + * Options for text-to-speech conversion. + */ +export interface TextToSpeechOptions { + /** + * Voice to use for synthesis. + * @default "nova" + */ + voice?: Voice; + + /** + * Output audio format. + * @default "wav" + */ + format?: "wav" | "mp3" | "opus" | "aac" | "flac" | "pcm"; +} diff --git a/javascript/src/audio/utils.ts b/javascript/src/audio/utils.ts new file mode 100644 index 00000000..1c103406 --- /dev/null +++ b/javascript/src/audio/utils.ts @@ -0,0 +1,84 @@ +/** + * Audio utility functions for loading, encoding, and converting audio. + */ +import * as fs from "fs"; +import * as path from "path"; +import type { AudioData, AudioMimeType } from "./types"; + +/** + * Detects the MIME type from a file extension. + * + * @param filePath - Path to the audio file. + * @returns The detected MIME type. + */ +export function detectMimeType(filePath: string): AudioMimeType { + const ext = path.extname(filePath).toLowerCase(); + const mimeTypes: Record<string, AudioMimeType> = { + ".wav": "audio/wav", + ".mp3": "audio/mp3", + ".mpeg": "audio/mpeg", + ".ogg": "audio/ogg", + ".webm": "audio/webm", + ".pcm": "audio/pcm", + }; + return mimeTypes[ext] ?? "audio/wav"; +} + +/** + * Loads audio from a file and returns AudioData. + * + * @param filePath - Path to the audio file. + * @returns AudioData with base64-encoded content. + */ +export function audioFromFile(filePath: string): AudioData { + const absolutePath = path.isAbsolute(filePath) + ? 
filePath + : path.resolve(process.cwd(), filePath); + + const buffer = fs.readFileSync(absolutePath); + const data = buffer.toString("base64"); + const mediaType = detectMimeType(filePath); + + return { data, mediaType }; +} + +/** + * Creates AudioData from a base64 string. + * + * @param data - Base64-encoded audio data. + * @param mediaType - Media type (MIME type) of the audio. + * @returns AudioData object. + */ +export function audioFromBase64( + data: string, + mediaType: AudioMimeType = "audio/wav" +): AudioData { + return { data, mediaType }; +} + +/** + * Creates AudioData from a Buffer. + * + * @param buffer - Audio data buffer. + * @param mediaType - Media type (MIME type) of the audio. + * @returns AudioData object. + */ +export function audioFromBuffer( + buffer: Buffer, + mediaType: AudioMimeType = "audio/wav" +): AudioData { + return { + data: buffer.toString("base64"), + mediaType, + }; +} + +/** + * Converts AudioData to a base64 data URI. + * + * @param audio - AudioData to convert. + * @returns Data URI string. + */ +export function audioToDataUri(audio: AudioData): string { + return `data:${audio.mediaType};base64,${audio.data}`; +} diff --git a/javascript/src/index.ts b/javascript/src/index.ts index 49230692..93551c69 100644 --- a/javascript/src/index.ts +++ b/javascript/src/index.ts @@ -1,4 +1,5 @@ import * as agents from "./agents"; +import * as audio from "./audio"; import * as domain from "./domain"; import * as execution from "./execution"; import * as runner from "./runner"; @@ -6,12 +7,15 @@ import * as script from "./script"; // Re-export all types and other named exports export * from "./agents"; +export * from "./audio"; export * from "./domain"; export * from "./execution"; export * from "./runner"; export * from "./script"; +export { StringUtils } from "./utils/string-utils"; type ScenarioApi = typeof agents & + typeof audio & typeof domain & typeof execution & typeof runner & @@ -19,6 +23,7 @@ type ScenarioApi = typeof agents & export const scenario: ScenarioApi = { ...agents, + ...audio, ...domain, ...execution, ...runner, diff --git a/javascript/src/script/index.ts b/javascript/src/script/index.ts index 86f4714f..8457ddf2 100644 --- a/javascript/src/script/index.ts +++ b/javascript/src/script/index.ts @@ -7,6 +7,8 @@ * and when scenarios should succeed or fail. */ import { CoreMessage } from "ai"; +import { textToSpeech } from "../audio/text-to-speech"; +import type { Voice } from "../audio/types"; import { ScenarioExecutionStateLike, ScriptStep } from "../domain"; /** @@ -23,6 +25,19 @@ export const message = (message: CoreMessage): ScriptStep => { return (_state, executor) => executor.message(message); }; +/** + * Script step type with optional speak method for TTS. + */ +interface SpeakableAgentStep extends ScriptStep { + /** + * Convert text to speech and send as audio message. + * + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + */ + speak: (options?: { voice?: Voice }) => ScriptStep; +} + /** * Generate or specify an agent response in the conversation. * @@ -33,11 +48,82 @@ export const message = (message: CoreMessage): ScriptStep => { * @param content Optional agent response content. Can be a string or full message object. * If undefined, the agent under test will generate content automatically. * @returns A ScriptStep function that can be used in scenario scripts. 
+ * + * @example + * ```typescript + * // Text message + * scenario.agent("Here's a recipe for you") + * + * // Audio message via TTS + * scenario.agent.speak("Here's a recipe for you") + * + * // Let agent generate + * scenario.agent() + * ``` */ -export const agent = (content?: string | CoreMessage): ScriptStep => { - return (_state, executor) => executor.agent(content); +const agentBase = ( + content?: string | CoreMessage +): ScriptStep | SpeakableAgentStep => { + const step: ScriptStep = (_state, executor) => executor.agent(content); + + // Only add .speak() when text content is provided + if (typeof content === "string") { + const speakableStep = step as SpeakableAgentStep; + speakableStep.speak = (options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(content, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "assistant", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; + }; + return speakableStep; + } + + return step; }; +/** + * Speak text as an agent audio message via TTS. + * + * @param text - Text to convert to speech. + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + * + * @example + * ```typescript + * scenario.agent.speak("Here's a recipe for you") + * ``` + */ +const agentSpeak = (text: string, options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(text, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "assistant", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; +}; + +/** + * Generate or specify an agent response in the conversation. + * + * Supports both text and audio output: + * - `scenario.agent("text")` - Send text message + * - `scenario.agent.speak("text")` - Send audio message via TTS + * - `scenario.agent()` - Let agent generate + */ +export const agent = Object.assign(agentBase, { speak: agentSpeak }); + /** * Invoke the judge agent to evaluate the current conversation state. * @@ -55,6 +141,19 @@ export const judge = (content?: string | CoreMessage): ScriptStep => { }; }; +/** + * Script step type with optional speak method for TTS. + */ +interface SpeakableUserStep extends ScriptStep { + /** + * Convert text to speech and send as audio message. + * + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + */ + speak: (options?: { voice?: Voice }) => ScriptStep; +} + /** * Generate or specify a user message in the conversation. * @@ -65,11 +164,82 @@ export const judge = (content?: string | CoreMessage): ScriptStep => { * @param content Optional user message content. Can be a string or full message object. * If undefined, the user simulator will generate content automatically. * @returns A ScriptStep function that can be used in scenario scripts. 
+ * + * @example + * ```typescript + * // Text message + * scenario.user("Hello") + * + * // Audio message via TTS + * scenario.user.speak("Hello") + * + * // Let user simulator generate + * scenario.user() + * ``` */ -export const user = (content?: string | CoreMessage): ScriptStep => { - return (_state, executor) => executor.user(content); +const userBase = ( + content?: string | CoreMessage +): ScriptStep | SpeakableUserStep => { + const step: ScriptStep = (_state, executor) => executor.user(content); + + // Only add .speak() when text content is provided + if (typeof content === "string") { + const speakableStep = step as SpeakableUserStep; + speakableStep.speak = (options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(content, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; + }; + return speakableStep; + } + + return step; }; +/** + * Speak text as a user audio message via TTS. + * + * @param text - Text to convert to speech. + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + * + * @example + * ```typescript + * scenario.user.speak("I need help with billing") + * ``` + */ +const userSpeak = (text: string, options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(text, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; +}; + +/** + * Generate or specify a user message in the conversation. + * + * Supports both text and audio output: + * - `scenario.user("text")` - Send text message + * - `scenario.user.speak("text")` - Send audio message via TTS + * - `scenario.user()` - Let user simulator generate + */ +export const user = Object.assign(userBase, { speak: userSpeak }); + /** * Let the scenario proceed automatically for a specified number of turns. * diff --git a/javascript/src/utils/string-utils.ts b/javascript/src/utils/string-utils.ts new file mode 100644 index 00000000..d3d115ee --- /dev/null +++ b/javascript/src/utils/string-utils.ts @@ -0,0 +1,26 @@ +/** + * String utility functions. + */ + +/** + * Converts a string to kebab-case. + * + * @param str - The string to convert. + * @returns The kebab-case string. + * + * @example + * ```typescript + * StringUtils.kebabCase("Hello World") // "hello-world" + * StringUtils.kebabCase("camelCase") // "camel-case" + * ``` + */ +export function kebabCase(str: string): string { + return str + .replace(/([a-z])([A-Z])/g, "$1-$2") + .replace(/[\s_]+/g, "-") + .toLowerCase(); +} + +export const StringUtils = { + kebabCase, +};
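
For reference, the pieces above compose into a single voice scenario. The following is a minimal sketch (not part of the diff), assuming an OpenAI API key is configured and reusing the `OpenAiVoiceAgent` helper from the example tests; the `VoiceSmokeAgent` class, test names, and criteria are illustrative only:

```typescript
import { openai } from "@ai-sdk/openai";
import scenario, { AgentRole } from "@langwatch/scenario";
import { describe, it, expect } from "vitest";
import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent"; // helper from the vitest examples above

// Illustrative agent, modeled on the VoiceAgent used in the example tests.
class VoiceSmokeAgent extends OpenAiVoiceAgent {
  role: AgentRole = AgentRole.AGENT;
  constructor() {
    super({ systemPrompt: "Answer briefly and conversationally.", voice: "echo" });
  }
}

describe("voice primitives smoke test", () => {
  it("runs a short voice conversation end to end", async () => {
    const result = await scenario.run({
      name: "voice smoke test",
      description: "User asks a cooking question by voice, agent answers by voice",
      agents: [
        new VoiceSmokeAgent(),
        scenario.userSimulatorAgent({ voice: "nova" }), // voice sim: generated turns come back as TTS audio
        scenario.judgeAgent({
          model: openai("gpt-4o"),
          criteria: ["The agent answers the user's question"],
          audio: true, // judge evaluates the audio directly via a multimodal model
        }),
      ],
      script: [
        scenario.user.speak("What's a quick vegetarian dinner idea?"), // fixed user turn via TTS
        scenario.agent(), // agent replies with audio
        scenario.user(),  // voice sim generates a follow-up as audio
        scenario.agent(),
        scenario.judge(),
      ],
    });

    expect(result.success).toBe(true);
  });
});
```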