diff --git a/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts b/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts index d0a605d5..ce87688c 100644 --- a/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts +++ b/javascript/examples/vitest/tests/multimodal-audio-to-audio.test.ts @@ -1,79 +1,59 @@ import { openai } from "@ai-sdk/openai"; -import scenario, { AgentRole } from "@langwatch/scenario"; +import scenario, { AgentRole, audioFromFile } from "@langwatch/scenario"; import { UserModelMessage } from "ai"; import { describe, it, expect } from "vitest"; -import { - encodeAudioToBase64, - getFixturePath, - wrapJudgeForAudioTranscription, -} from "./helpers"; +import { getFixturePath } from "./helpers"; import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent"; class AudioAgent extends OpenAiVoiceAgent { role: AgentRole = AgentRole.AGENT; } -// Use setId to group together for visualizing in the UI const setId = "multimodal-audio-test"; /** * This example shows how to test an agent that can take audio input * from a fixture and respond with audio output. + * + * Uses: + * - audioFromFile() to load audio + * - scenario.message() to inject the audio message + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ describe("Multimodal Audio to Audio Tests", () => { - it("should handle audio input", async () => { + it("should handle audio input from file", async () => { const myAgent = new AudioAgent({ - systemPrompt: ` - You are a helpful assistant that can analyze audio input and respond with audio output. - You must respond with audio output. - `, + systemPrompt: `You are a helpful assistant that analyzes audio input. + Answer questions about the audio content.`, voice: "alloy", forceUserRole: true, }); - const data = encodeAudioToBase64( - getFixturePath("male_or_female_voice.wav"), - ); + // Load audio file using the utility + const audio = audioFromFile(getFixturePath("male_or_female_voice.wav")); - // The AI-SDK will only support file parts, - // so we cannot use the OpenAI shape from above - // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts - const audioMessage = { + // Create audio message with instructions + const audioMessage: UserModelMessage = { role: "user", content: [ - { - type: "text", - text: ` - Answer the question in the a text. - If you're not sure, you're required to take a best guess. - After you've guessed, you must repeat the question and say what format the input was in (audio or text) - `, - }, - { - type: "file", - mediaType: "audio/wav", - data, - }, + { type: "text", text: "Is this a male or female voice? Take a guess." 
}, + { type: "file", mediaType: audio.mediaType, data: audio.data }, ], - } satisfies UserModelMessage; - - const audioJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-4o"), - criteria: [ - "The agent correctly guesses it's a male voice", - "The agent repeats the question", - "The agent says what format the input was in (audio or text)", - ], - }), - ); + }; const result = await scenario.run({ setId, - name: "multimodal audio to audio", - description: - "User sends audio file, agent analyzes and transcribes the content", - agents: [myAgent, scenario.userSimulatorAgent(), audioJudge], + name: "audio to audio - file input", + description: "User sends audio file, agent analyzes and responds", + agents: [ + myAgent, + scenario.userSimulatorAgent(), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["The agent guesses the voice gender"], + audio: true, + }), + ], script: [ scenario.message(audioMessage), scenario.agent(), @@ -94,7 +74,7 @@ describe("Multimodal Audio to Audio Tests", () => { it.todo("should handle multiple audio formats (WAV, MP3, M4A)"); it.todo("should handle long audio files gracefully"); it.todo( - "should provide appropriate responses for unclear or corrupted audio", + "should provide appropriate responses for unclear or corrupted audio" ); it.todo("should handle audio with background noise"); it.todo("should transcribe speech in different languages"); diff --git a/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts b/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts index 027b2a07..e98ea146 100644 --- a/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts +++ b/javascript/examples/vitest/tests/multimodal-audio-to-text.test.ts @@ -3,39 +3,31 @@ import scenario, { AgentAdapter, AgentInput, AgentRole, + audioFromFile, } from "@langwatch/scenario"; import { UserModelMessage } from "ai"; import OpenAI from "openai"; import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs"; import { describe, it, expect } from "vitest"; -import { - encodeAudioToBase64, - getFixturePath, - wrapJudgeForAudioTranscription, -} from "./helpers"; +import { getFixturePath } from "./helpers"; import { convertModelMessagesToOpenAIMessages } from "./helpers/convert-core-messages-to-openai"; -class AudioAgent extends AgentAdapter { +/** + * Agent that takes audio input and responds with text + */ +class AudioToTextAgent extends AgentAdapter { role: AgentRole = AgentRole.AGENT; private openai = new OpenAI(); call = async (input: AgentInput) => { - // To use the OpenAI "voice-to-voice" model, we need to use the - // OpenAI api directly, and so we need to convert the messages to the correct - // shape here. 
- // @see https://platform.openai.com/docs/guides/audio?example=audio-in const messages = convertModelMessagesToOpenAIMessages(input.messages); const response = await this.respond(messages); - - // Scenario expects the response to be a string, so we only send the transcript const transcript = response.choices[0].message?.audio?.transcript; - // Handle text response if (typeof transcript === "string") { return transcript; - } else { - throw new Error("Agent failed to generate a response"); } + throw new Error("Agent failed to generate a response"); }; private async respond(messages: ChatCompletionMessageParam[]) { @@ -43,64 +35,48 @@ class AudioAgent extends AgentAdapter { model: "gpt-4o-audio-preview", modalities: ["text", "audio"], audio: { voice: "alloy", format: "wav" }, - // We need to strip the id, or the openai client will throw an error messages, store: false, }); } } -// Use setId to group together for visualizing in the UI const setId = "multimodal-audio-test"; /** - * This example shows how to test an agent that can take audio input - * and respond with text output. + * This example shows how to test an agent that takes audio input + * and responds with text output. + * + * Uses: + * - audioFromFile() to load audio + * - scenario.message() to inject the audio message + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ describe("Multimodal Audio to Text Tests", () => { - it("should handle audio input", async () => { - const data = encodeAudioToBase64( - getFixturePath("male_or_female_voice.wav"), - ); + it("should handle audio input from file", async () => { + // Load audio file + const audio = audioFromFile(getFixturePath("male_or_female_voice.wav")); - // The AI-SDK will only support file parts, - // so we cannot use the OpenAI shape from above - // @see https://ai-sdk.dev/docs/foundations/prompts#file-parts - const audioMessage = { + const audioMessage: UserModelMessage = { role: "user", content: [ - { - type: "text", - text: ` - Answer the question in the audio. - If you're not sure, you're required to take a best guess. - After you've guessed, you must repeat the question and say what format the input was in (audio or text) - `, - }, - { - type: "file", - mediaType: "audio/wav", - data, - }, + { type: "text", text: "Is this a male or female voice?" 
}, + { type: "file", mediaType: audio.mediaType, data: audio.data }, ], - } satisfies UserModelMessage; - - const audioJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-5"), - criteria: [ - "The agent guesses it's a male voice", - "The agent repeats the question", - "The agent says what format the input was in (audio or text)", - ], - }), - ); + }; const result = await scenario.run({ - name: "multimodal audio to text", - description: - "User sends audio file, agent analyzes and transcribes the content", - agents: [new AudioAgent(), scenario.userSimulatorAgent(), audioJudge], + name: "audio to text", + description: "User sends audio, agent responds with text", + agents: [ + new AudioToTextAgent(), + scenario.userSimulatorAgent(), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["The agent identifies the voice gender"], + audio: true, + }), + ], script: [ scenario.message(audioMessage), scenario.agent(), @@ -122,7 +98,7 @@ describe("Multimodal Audio to Text Tests", () => { it.todo("should handle multiple audio formats (WAV, MP3)"); it.todo("should handle long audio files gracefully"); it.todo( - "should provide appropriate responses for unclear or corrupted audio", + "should provide appropriate responses for unclear or corrupted audio" ); it.todo("should handle audio with background noise"); it.todo("should transcribe speech in different languages"); diff --git a/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts b/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts index b31efbb1..0035f4bf 100644 --- a/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts +++ b/javascript/examples/vitest/tests/multimodal-voice-to-voice-conversation.test.ts @@ -1,150 +1,194 @@ /** * Multimodal Voice-to-Voice Conversation Tests * - * This test suite demonstrates a complete audio-to-audio conversation flow where: - * - A user simulator agent generates audio questions - * - A main agent responds with audio answers - * - Both communicate entirely through voice (no text) - * - The conversation is judged for quality - * - The full audio is saved for review + * This test suite demonstrates voice-first-class primitives for audio conversations: * - * This showcases: - * - Custom agent implementations with voice capabilities - * - Multi-turn voice conversations - * - Audio message handling and persistence - * - Judge agent integration with audio transcription - * - Role reversal for user simulation + * Voice Script Primitives: + * - scenario.user.speak("text") — Fixed user message converted to audio via TTS + * - scenario.agent.speak("text") — Fixed agent message converted to audio via TTS + * - scenario.user() with voice sim — Generated audio responses + * + * Voice User Simulator: + * - scenario.userSimulatorAgent({ voice: "nova" }) — Generates audio instead of text + * + * Audio-Aware Judge: + * - scenario.judgeAgent({ audio: true }) — Evaluates audio content directly */ import * as path from "path"; import { openai } from "@ai-sdk/openai"; -import scenario, { AgentInput, AgentRole } from "@langwatch/scenario"; -import { ModelMessage } from "ai"; +import scenario, { AgentRole, StringUtils } from "@langwatch/scenario"; import { describe, it, expect } from "vitest"; -import { - OpenAiVoiceAgent, - saveConversationAudio, - wrapJudgeForAudioTranscription, -} from "./helpers"; -import { messageRoleReversal } from "../../../src/agents/utils"; +import { OpenAiVoiceAgent, 
saveConversationAudio } from "./helpers"; /** - * Main agent that responds with helpful audio answers - * Uses "echo" voice for a distinct sound + * Voice agent that responds with audio */ -class MyAgent extends OpenAiVoiceAgent { +class VoiceAgent extends OpenAiVoiceAgent { role: AgentRole = AgentRole.AGENT; constructor() { super({ - systemPrompt: `You are a helpful and engaging AI assistant. - Respond naturally and conversationally since this is an audio conversation. - Be informative but keep your responses short, concise and engaging. - Adapt your speaking style to be natural for audio.`, + systemPrompt: `You are a helpful AI assistant having a voice conversation. + Keep responses short and conversational.`, voice: "echo", }); } } -/** - * User simulator that generates audio questions - * - * This agent: - * - Plays the role of a curious user asking questions - * - Generates audio responses (not text) - * - Uses role reversal to properly simulate user behavior - * - Automatically ends conversation after 2 exchanges - * - Uses "nova" voice to differentiate from main agent - */ -class AudioUserSimulatorAgent extends OpenAiVoiceAgent { - role: AgentRole = AgentRole.USER; - - constructor() { - super({ - systemPrompt: ` - You are role playing as a curious user looking for information about AI agentic testing, - but you're a total novice and don't know anything about it. - - Be natural and conversational in your speech patterns. - This is an audio conversation, so speak as you would naturally talk. +const setId = "voice-conversation-tests"; +const outputPath = path.join(process.cwd(), "tmp", "audio_conversations"); - After 2 responses from the other speaker, say "I'm done with this conversation" and say goodbye. - - YOUR LANGUAGE IS ENGLISH. - `, - voice: "nova", +describe("Voice-to-Voice Conversation Tests", () => { + /** + * Example 1: Fixed voice messages using .speak() + * + * Use scenario.user.speak() and scenario.agent.speak() when you want + * specific text converted to audio via TTS. + */ + it("should handle fixed voice messages with .speak()", async () => { + const result = await scenario.run({ + name: "fixed voice messages", + description: "Test with predetermined voice messages", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ + voice: "nova", + }), // Voice sim (not invoked in this script; user turns are fixed via .speak()) + scenario.judgeAgent({ + criteria: ["Agent responds appropriately to greeting"], + // Optionally set audio: true or "transcribe" for audio-aware judging + }), + ], + script: [ + // Fixed user voice message via TTS + scenario.user.speak("Hello! 
Can you help me with something?"), + scenario.agent(), // Agent generates audio response + async (ctx) => { + await saveConversationAudio( + ctx, + path.join( + outputPath, + `${StringUtils.kebabCase(ctx.config.name)}.wav` + ) + ); + }, + scenario.judge(), + ], + setId, }); - } - public async call(input: AgentInput): Promise { - /** - * Role reversal is critical here: - * - The agent sees "user" messages as if they're from the assistant - * - This allows the agent to respond AS the user - * - Without this, the conversation flow would be backwards - */ - const messages = messageRoleReversal(input.messages); - return super.call({ - ...input, - messages, - }); - } -} + expect(result.success).toBe(true); + }); -// Group related test runs together in the UI -const setId = "full-audio-conversation-test"; + /** + * Example 2: Voice user simulator generates audio + * + * Use scenario.userSimulatorAgent({ voice }) to have the simulator + * generate contextual responses as audio. + */ + it("should handle voice user simulator", async () => { + const result = await scenario.run({ + name: "voice user simulator", + description: + "User is asking about cooking, keeping the conversation short and natural for voice.", + agents: [ + new VoiceAgent(), + // Voice user sim - generates audio via TTS + scenario.userSimulatorAgent({ + voice: "nova", + systemPrompt: `You are a curious user asking about cooking. + Keep questions short and natural for voice.`, + }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Conversation flows naturally"], + audio: true, + }), + ], + script: [ + scenario.user(), // Voice sim generates audio + scenario.agent(), // Agent responds with audio + scenario.user(), // Voice sim generates follow-up audio + scenario.agent(), + scenario.judge(), + ], + setId, + }); -// Output path for the full conversation audio file -const outputPath = path.join( - process.cwd(), - "tmp", - "audio_conversations", - "full-conversation.wav" -); + expect(result.success).toBe(true); + }); -describe("Multimodal Voice-to-Voice Conversation Tests", () => { - it("should handle complete audio-to-audio conversation", async () => { - // Initialize both agents for the conversation - const audioUserSimulator = new AudioUserSimulatorAgent(); - const audioAgent = new MyAgent(); + /** + * Example 3: Mixed text and voice in same conversation + * + * You can mix text and voice messages. The voice sim only generates + * audio when scenario.user() has no arguments. 
+ */ + it("should handle mixed text and voice messages", async () => { + const result = await scenario.run({ + name: "mixed text and voice", + description: "Combining text and voice in one conversation", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ voice: "nova" }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Agent handles both text and voice input"], + audio: true, + }), + ], + script: [ + // Text message (not converted to audio) + scenario.user("Hi, I need a recipe suggestion"), + scenario.agent(), + // Voice sim generates audio follow-up + scenario.user(), + scenario.agent(), + // Fixed voice message via TTS + scenario.user.speak("Thanks, that sounds delicious!"), + scenario.agent(), + scenario.judge(), + ], + setId, + }); - // Create judge agent to evaluate conversation quality - // Wrap with audio handler to transcribe audio before judging - const conversationJudge = wrapJudgeForAudioTranscription( - scenario.judgeAgent({ - model: openai("gpt-4o"), - criteria: ["The conversation flows naturally between user and agent"], - }) - ); + expect(result.success).toBe(true); + }); - // Execute the full audio conversation scenario + /** + * Example 4: Multi-turn voice conversation with audio export + */ + it("should handle multi-turn voice conversation", async () => { const result = await scenario.run({ - name: "full audio-to-audio conversation", - description: - "Complete audio conversation between user simulator and agent over multiple turns", - agents: [audioAgent, audioUserSimulator, conversationJudge], + name: "multi-turn voice conversation", + description: "Extended voice conversation with audio export", + agents: [ + new VoiceAgent(), + scenario.userSimulatorAgent({ + voice: "nova", + systemPrompt: `You are learning about AI testing. + Ask 2-3 questions then say goodbye.`, + }), + scenario.judgeAgent({ + model: openai("gpt-4o"), + criteria: ["Conversation is informative and helpful"], + audio: true, + }), + ], script: [ - // Step 1: Run 2 conversation turns between user simulator and agent - scenario.proceed(2), - - // Step 2: Save the full conversation as a single audio file + scenario.proceed(3), // 3 turns of voice conversation async (ctx) => { - await saveConversationAudio(ctx, outputPath); + await saveConversationAudio( + ctx, + path.join(outputPath, "multi-turn.wav") + ); }, - - // Step 3: Have judge evaluate the conversation quality scenario.judge(), ], setId, }); - try { - console.log("FULL AUDIO CONVERSATION RESULT", result); - - expect(result.success).toBe(true); - } catch (error) { - console.error("Full audio conversation failed:", result); - throw error; - } + expect(result.success).toBe(true); }); /** diff --git a/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts b/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts index 17202179..df13c0a7 100644 --- a/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts +++ b/javascript/examples/vitest/tests/scenario-expert-realtime.test.ts @@ -8,6 +8,9 @@ * 1. Browser: createScenarioExpertSession() → connect with token → use directly * 2. Test: createScenarioExpertSession() → connect with API key → wrap in adapter * 3. SAME session creation = accurate testing! 
+ * + * Uses voice-first-class primitives: + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ import scenario, { @@ -16,7 +19,6 @@ import scenario, { type AudioResponseEvent, } from "@langwatch/scenario"; import { describe, it, expect, beforeAll, afterAll } from "vitest"; -import { wrapJudgeForAudioTranscription } from "./helpers/wrap-judge-for-audio-transcription"; import { AudioUtils } from "./utils/audio/audio.utils"; import { createUserSimulatorSession } from "../../openai-realtime-demo/agents/realtime-user-simulator.agent"; import { createScenarioExpertSession } from "../../openai-realtime-demo/agents/scenario-expert.agent"; @@ -92,15 +94,14 @@ describe("Scenario Expert Agent (Realtime API)", () => { agents: [ realtimeAdapter, // Realtime agent (handles audio!) audioUserSim, // Audio user simulator (generates voice) - wrapJudgeForAudioTranscription( - // Judge with audio transcription - scenario.judgeAgent({ - criteria: [ - "Agent explains what LangWatch Scenario is", - "Agent is helpful and informative", - ], - }) - ), + // Judge with multimodal audio evaluation + scenario.judgeAgent({ + criteria: [ + "Agent explains what LangWatch Scenario is", + "Agent is helpful and informative", + ], + audio: true, + }), ], script: [ scenario.agent( @@ -110,7 +111,7 @@ describe("Scenario Expert Agent (Realtime API)", () => { scenario.agent(), // Audio response scenario.user(), // Audio follow-up scenario.agent(), // Audio response - scenario.judge(), // Evaluates transcripts + scenario.judge(), // Evaluates audio ], setId: "realtime-examples", }); diff --git a/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts b/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts index d999694d..a319051b 100644 --- a/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts +++ b/javascript/examples/vitest/tests/vegetarian-recipe-realtime.test.ts @@ -8,6 +8,9 @@ * 1. Browser: createVegetarianRecipeSession() → connect with token → use directly * 2. Test: createVegetarianRecipeSession() → connect with API key → wrap in adapter * 3. SAME session creation = accurate testing! + * + * Uses voice-first-class primitives: + * - scenario.judgeAgent({ audio: true }) for multimodal evaluation */ import scenario, { @@ -16,7 +19,6 @@ import scenario, { type AudioResponseEvent, } from "@langwatch/scenario"; import { describe, it, expect, beforeAll, afterAll } from "vitest"; -import { wrapJudgeForAudioTranscription } from "./helpers/wrap-judge-for-audio-transcription"; import { AudioUtils } from "./utils/audio/audio.utils"; import { createUserSimulatorSession } from "../../openai-realtime-demo/agents/realtime-user-simulator.agent"; import { createVegetarianRecipeSession } from "../../openai-realtime-demo/agents/vegetatrian-recipe.agent"; @@ -111,17 +113,16 @@ describe("Vegetarian Recipe Agent (Realtime API)", () => { agents: [ realtimeAdapter, // Realtime agent (handles audio!) 
audioUserSim, // Audio user simulator (generates voice) - wrapJudgeForAudioTranscription( - // Judge with audio transcription - scenario.judgeAgent({ - criteria: [ - "Agent should provide a vegetarian recipe", - "Recipe should include ingredients", - "Recipe should include cooking steps", - "Agent should be helpful and encouraging", - ], - }) - ), + // Judge with multimodal audio evaluation + scenario.judgeAgent({ + criteria: [ + "Agent should provide a vegetarian recipe", + "Recipe should include ingredients", + "Recipe should include cooking steps", + "Agent should be helpful and encouraging", + ], + audio: true, + }), ], script: [ scenario.user( diff --git a/javascript/src/agents/judge/judge-agent.ts b/javascript/src/agents/judge/judge-agent.ts index 39029cb0..c92bf49d 100644 --- a/javascript/src/agents/judge/judge-agent.ts +++ b/javascript/src/agents/judge/judge-agent.ts @@ -1,6 +1,9 @@ -import { generateText, CoreMessage, ToolSet, Tool, ToolChoice, tool } from "ai"; +import { generateText, CoreMessage, ToolSet, Tool, ToolChoice, tool, ModelMessage } from "ai"; +import OpenAI from "openai"; +import { ChatCompletionMessageParam } from "openai/resources/chat/completions.mjs"; import { z } from "zod/v4"; import { JudgeResult } from "./interfaces"; +import { transcribeAudioInMessages } from "../../audio/transcribe"; import { getProjectConfig } from "../../config"; import { AgentInput, JudgeAgentAdapter, AgentRole } from "../../domain"; import { modelSchema } from "../../domain/core/schemas/model.schema"; @@ -20,13 +23,35 @@ export interface JudgeAgentConfig extends TestingAgentConfig { * The criteria that the judge will use to evaluate the conversation. */ criteria: string[]; + /** + * Audio handling mode: + * - `"transcribe"` — Transcribe audio to text via Whisper, then evaluate text + * - `true` — Pass audio directly to multimodal model (e.g., gpt-4o-audio-preview) + * - `false`/`undefined` — No special audio handling (default) + */ + audio?: boolean | "transcribe"; } -function buildSystemPrompt(criteria: string[], description: string): string { +function buildSystemPrompt( + criteria: string[], + description: string, + audioEnabled?: boolean +): string { const criteriaList = criteria?.map((criterion, idx) => `${idx + 1}. ${criterion}`).join("\n") || "No criteria provided"; + const audioInstructions = audioEnabled + ? ` + +This conversation includes audio messages. Listen to the audio content directly. +When evaluating: +- Consider tone, clarity, and delivery when relevant to criteria +- Evaluate both the content (what was said) and presentation (how it was said) + +` + : ""; + return ` You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not. @@ -44,7 +69,7 @@ ${description} ${criteriaList} - +${audioInstructions} - Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria. 
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary @@ -52,6 +77,70 @@ ${criteriaList} `.trim(); } +function convertModelMessagesToOpenAIMessages( + coreMessages: (ModelMessage & { id?: string })[] +): ChatCompletionMessageParam[] { + return coreMessages.map(({ id: _id, ...msg }): ChatCompletionMessageParam => { + // Handle array content (multimodal messages) + if (Array.isArray(msg.content)) { + const parts = msg.content.map((part) => { + if (part.type === "text") { + return { type: "text" as const, text: part.text }; + } + if (part.type === "file" && part.mediaType?.startsWith("audio/")) { + return { + type: "input_audio" as const, + input_audio: { + data: part.data as string, + format: "wav" as const, + }, + }; + } + return { type: "text" as const, text: "" }; + }); + return { + role: msg.role as "user" | "assistant" | "system", + content: parts, + } as ChatCompletionMessageParam; + } + return { + role: msg.role as "user" | "assistant" | "system", + content: msg.content, + } as ChatCompletionMessageParam; + }); +} + +/** + * Convert AI SDK tools to OpenAI function format. + */ +function convertToolsToOpenAIFunctions(tools: ToolSet): OpenAI.Chat.Completions.ChatCompletionTool[] { + return Object.entries(tools).map(([name, t]) => ({ + type: "function" as const, + function: { + name, + description: t.description ?? "", + parameters: (t as { inputSchema?: Record<string, unknown> }).inputSchema ?? { type: "object" }, + }, + })); +} + +/** + * Convert OpenAI tool calls to AI SDK format. + */ +function convertOpenAIToolCalls( + toolCalls: Array<{ id: string; function: { name: string; arguments: string } }> | undefined ): InvokeLLMResult["toolCalls"] { + if (!toolCalls) return []; + return toolCalls.map((tc) => ({ + type: "tool-call" as const, + toolCallId: tc.id, + toolName: tc.function.name, + args: JSON.parse(tc.function.arguments), + input: JSON.parse(tc.function.arguments), + })); +} + + function buildContinueTestTool(): Tool { return tool({ description: "Continue the test with the next step", @@ -100,11 +189,51 @@ class JudgeAgent extends JudgeAgentAdapter { role: AgentRole = AgentRole.JUDGE; criteria: string[]; + /** + * Detects if messages contain audio file parts. + */ + private hasAudioContent(messages: CoreMessage[]): boolean { + return messages.some((message) => { + if (!Array.isArray(message.content)) return false; + return message.content.some( + (part) => + part.type === "file" && part.mediaType?.startsWith("audio/") + ); + }); + } + /** * LLM invocation function. Can be overridden to customize LLM behavior. + * Automatically uses OpenAI directly when audio content is detected. */ invokeLLM: (params: InvokeLLMParams) => Promise<InvokeLLMResult> = async (params) => { try { + const hasAudio = this.hasAudioContent(params.messages ?? []); + + if (hasAudio) { + const openai = new OpenAI(); + const tools = params.tools ? convertToolsToOpenAIFunctions(params.tools) : undefined; + + const response = await openai.chat.completions.create({ + model: "gpt-4o-audio-preview", + modalities: ["text"], + audio: { voice: "alloy", format: "wav" }, + messages: convertModelMessagesToOpenAIMessages(params.messages ?? []), + tools, + tool_choice: params.toolChoice === "required" ? "required" : undefined, + store: false, + }); + + const message = response.choices[0]?.message; + const text = message?.content ?? 
""; + return { + text, + content: [{ type: "text" as const, text }], + toolCalls: convertOpenAIToolCalls(message?.tool_calls as Array<{ id: string; function: { name: string; arguments: string } }> | undefined), + toolResults: [], + }; + } + return await generateText(params); } catch (error) { this.logger.error("Error generating text", { error }); @@ -121,12 +250,19 @@ class JudgeAgent extends JudgeAgentAdapter { async call(input: AgentInput): Promise { const cfg = this.cfg; + // audio: "transcribe" → transcribe audio to text first + // audio: true → pass audio directly to multimodal model + // audio: false/undefined → no special handling + const processedMessages = cfg.audio === "transcribe" + ? await transcribeAudioInMessages(input.messages) + : input.messages; + const systemPrompt = cfg.systemPrompt ?? - buildSystemPrompt(cfg.criteria, input.scenarioConfig.description); + buildSystemPrompt(cfg.criteria, input.scenarioConfig.description, cfg.audio === true); const messages: CoreMessage[] = [ { role: "system", content: systemPrompt }, - ...input.messages, + ...processedMessages, ]; const isLastMessage = @@ -232,12 +368,17 @@ class JudgeAgent extends JudgeAgentAdapter { * detailed reasoning for its verdicts. It evaluates each criterion independently * and provides comprehensive feedback about what worked and what didn't. * + * Supports both text and audio evaluation: + * - Text (default): Evaluates text content only + * - Audio: When `audio: true`, uses multimodal model to evaluate audio directly + * * @param cfg Configuration for the judge agent. * @param cfg.criteria List of success criteria to evaluate against. * @param cfg.model Optional The language model to use for generating responses. * @param cfg.temperature Optional The temperature to use for the model. * @param cfg.maxTokens Optional The maximum number of tokens to generate. * @param cfg.systemPrompt Optional Custom system prompt to override default judge behavior. + * @param cfg.audio When true, evaluates audio directly using multimodal model. * * @example * ```typescript @@ -251,7 +392,8 @@ class JudgeAgent extends JudgeAgentAdapter { * }; * * async function main() { - * const result = await run({ + * // Text-only evaluation + * const textResult = await run({ * name: "Judge Agent Test", * description: "A simple test to see if the judge agent works.", * agents: [ @@ -265,6 +407,28 @@ class JudgeAgent extends JudgeAgentAdapter { * agent(), * ], * }); + * + * // Audio evaluation (requires multimodal model) + * const audioResult = await run({ + * name: "Voice Judge Test", + * description: "Testing voice agent behavior", + * agents: [ + * myVoiceAgent, + * userSimulatorAgent({ voice: "nova" }), + * judgeAgent({ + * criteria: [ + * "Agent maintains professional tone", + * "Agent addresses the user's concern", + * ], + * audio: true, + * }), + * ], + * script: [ + * user.speak("I need help with billing"), + * agent(), + * judge(), + * ], + * }); * } * main(); * ``` diff --git a/javascript/src/agents/types.ts b/javascript/src/agents/types.ts index f87a2ee7..1c066022 100644 --- a/javascript/src/agents/types.ts +++ b/javascript/src/agents/types.ts @@ -1,5 +1,6 @@ import { generateText } from "ai"; import { ModelConfig } from "../domain/core/schemas/model.schema"; +import type { Voice } from "../audio/types"; /** * Parameters for LLM invocation. @@ -34,6 +35,22 @@ export interface TestingAgentConfig extends Partial { systemPrompt?: string; } +/** + * Configuration for voice-enabled user simulator agent. 
+ */ +export interface VoiceUserSimulatorConfig extends TestingAgentConfig { + /** + * Voice to use for TTS output. + * When set, the agent outputs audio instead of text. + */ + voice?: Voice; + /** + * Output audio format. + * @default "wav" + */ + audioFormat?: "wav" | "mp3" | "opus" | "aac" | "flac" | "pcm"; +} + /** * The arguments for finishing a test, used by the judge agent's tool. */ diff --git a/javascript/src/agents/user-simulator-agent.ts b/javascript/src/agents/user-simulator-agent.ts index c8289700..2c993ff4 100644 --- a/javascript/src/agents/user-simulator-agent.ts +++ b/javascript/src/agents/user-simulator-agent.ts @@ -1,10 +1,11 @@ import { generateText, CoreMessage } from "ai"; -import { TestingAgentConfig, InvokeLLMParams, InvokeLLMResult } from "./types"; +import { VoiceUserSimulatorConfig, InvokeLLMParams, InvokeLLMResult } from "./types"; import { messageRoleReversal } from "./utils"; import { getProjectConfig } from "../config"; import { AgentInput, UserSimulatorAgentAdapter } from "../domain"; import { modelSchema } from "../domain/core/schemas/model.schema"; import { Logger } from "../utils/logger"; +import { textToSpeech } from "../audio/text-to-speech"; function buildSystemPrompt(description: string): string { return ` @@ -42,7 +43,7 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { } }; - constructor(private readonly cfg?: TestingAgentConfig) { + constructor(private readonly cfg?: VoiceUserSimulatorConfig) { super(); } @@ -83,6 +84,21 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { throw new Error("No response content from LLM"); } + // If voice is configured, convert text to speech + if (config?.voice) { + const audio = await textToSpeech(messageContent, { + voice: config.voice, + format: config.audioFormat, + }); + return { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + } satisfies CoreMessage; + } + return { role: "user", content: messageContent } satisfies CoreMessage; }; } @@ -95,6 +111,10 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * It uses an LLM to generate natural, contextually relevant user inputs that help * drive the conversation forward according to the scenario description. * + * Supports both text and voice output: + * - Text output (default): Returns text messages + * - Voice output: When `voice` is set, outputs audio via TTS + * * @param config Optional configuration for the agent. * @param config.model The language model to use for generating responses. * If not provided, a default model will be used. @@ -106,6 +126,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * @param config.name The name of the agent. * @param config.systemPrompt Custom system prompt to override default user simulation behavior. * Use this to create specialized user personas or behaviors. + * @param config.voice Voice to use for TTS output. When set, outputs audio instead of text. + * @param config.audioFormat Output audio format (wav, mp3, etc). Defaults to wav. * * @throws {Error} If no model is configured either in parameters or global config. 
* @@ -121,8 +143,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * }; * * async function main() { - * // Basic user simulator with default behavior - * const basicResult = await run({ + * // Basic user simulator with text output + * const textResult = await run({ * name: "User Simulator Test", * description: "A simple test to see if the user simulator works.", * agents: [myAgent, userSimulatorAgent()], @@ -132,40 +154,32 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * ], * }); * - * // Customized user simulator - * const customResult = await run({ - * name: "Expert User Test", - * description: "User seeks help with TypeScript programming", + * // Voice user simulator - outputs audio + * const voiceResult = await run({ + * name: "Voice User Test", + * description: "User interacts via voice", * agents: [ * myAgent, - * userSimulatorAgent({ - * model: openai("gpt-4"), - * temperature: 0.3, - * systemPrompt: "You are a technical user who asks detailed questions" - * }) + * userSimulatorAgent({ voice: "nova" }) * ], * script: [ - * user(), + * user(), // Outputs audio * agent(), * ], * }); * - * // User simulator with custom persona - * const expertResult = await run({ - * name: "Expert Developer Test", - * description: "Testing with a technical expert user persona.", + * // Mixed: text input, voice output + * const mixedResult = await run({ + * name: "Mixed Modality Test", + * description: "Text input with voice simulation", * agents: [ * myAgent, - * userSimulatorAgent({ - * systemPrompt: ` - * You are an expert software developer testing an AI coding assistant. - * Ask challenging, technical questions and be demanding about code quality. - * Use technical jargon and expect detailed, accurate responses. - * ` - * }) + * userSimulatorAgent({ voice: "echo", audioFormat: "mp3" }) * ], * script: [ - * user(), + * user("Help me with billing"), // Fixed text + * agent(), + * user(), // Voice sim generates audio * agent(), * ], * }); @@ -175,7 +189,8 @@ class UserSimulatorAgent extends UserSimulatorAgentAdapter { * * **Implementation Notes:** * - Uses role reversal internally to work around LLM biases toward assistant roles + * - Voice output uses OpenAI TTS API */ -export const userSimulatorAgent = (config?: TestingAgentConfig) => { +export const userSimulatorAgent = (config?: VoiceUserSimulatorConfig) => { return new UserSimulatorAgent(config); }; diff --git a/javascript/src/audio/index.ts b/javascript/src/audio/index.ts new file mode 100644 index 00000000..a014f0d0 --- /dev/null +++ b/javascript/src/audio/index.ts @@ -0,0 +1,11 @@ +/** + * Audio module for voice-first-class support in Scenario. + * + * Provides types, utilities, and TTS capabilities for working with + * audio in scenario scripts. + */ + +export * from "./types"; +export * from "./utils"; +export * from "./text-to-speech"; +export * from "./transcribe"; diff --git a/javascript/src/audio/text-to-speech.ts b/javascript/src/audio/text-to-speech.ts new file mode 100644 index 00000000..c346c321 --- /dev/null +++ b/javascript/src/audio/text-to-speech.ts @@ -0,0 +1,60 @@ +/** + * Text-to-speech utility using OpenAI TTS API. + */ +import OpenAI from "openai"; +import type { AudioData, TextToSpeechOptions, Voice } from "./types"; + +/** + * Default voice for TTS. + */ +const DEFAULT_VOICE: Voice = "nova"; + +/** + * Default audio format for TTS output. + */ +const DEFAULT_FORMAT = "wav" as const; + +/** + * Converts text to speech using OpenAI TTS API. + * + * @param text - Text to convert to speech. 
+ * @param options - TTS options including voice and format. + * @returns AudioData with the synthesized audio. + */ +export async function textToSpeech( + text: string, + options?: TextToSpeechOptions +): Promise<AudioData> { + const voice = options?.voice ?? DEFAULT_VOICE; + const format = options?.format ?? DEFAULT_FORMAT; + + const openai = new OpenAI(); + + const response = await openai.audio.speech.create({ + model: "tts-1", + voice, + input: text, + response_format: format, + }); + + const arrayBuffer = await response.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + const data = buffer.toString("base64"); + + // AudioMimeType has no aac/flac entries, so those formats fall back to generic types here + const mediaType = + format === "mp3" + ? "audio/mp3" + : format === "opus" + ? "audio/ogg" + : format === "aac" + ? "audio/mpeg" + : format === "flac" + ? "audio/wav" + : "audio/wav"; + + return { + data, + mediaType, + transcript: text, + }; +} diff --git a/javascript/src/audio/transcribe.ts b/javascript/src/audio/transcribe.ts new file mode 100644 index 00000000..4cd29abe --- /dev/null +++ b/javascript/src/audio/transcribe.ts @@ -0,0 +1,84 @@ +/** + * Audio transcription utilities for converting audio to text. + * + * Uses OpenAI Whisper API for transcription with caching to avoid + * re-transcribing the same audio multiple times. + */ +import { CoreMessage } from "ai"; +import OpenAI from "openai"; + +/** + * Cache mapping base64 audio data to transcribed text. + */ +const transcriptionCache = new Map<string, string>(); + +/** + * Transcribes audio data to text using OpenAI Whisper. + * + * @param audioData - Base64-encoded audio data. + * @param mediaType - MIME type of the audio (e.g., "audio/wav"). + * @returns Transcribed text. + */ +export async function transcribeAudio( + audioData: string, + mediaType: string = "audio/wav" +): Promise<string> { + const cached = transcriptionCache.get(audioData); + if (cached) return cached; + + try { + const openai = new OpenAI(); + const ext = mediaType.split("/")[1] || "wav"; + const response = await openai.audio.transcriptions.create({ + model: "whisper-1", + file: new File([Buffer.from(audioData, "base64")], `audio.${ext}`, { + type: mediaType, + }), + }); + transcriptionCache.set(audioData, response.text); + return response.text; + } catch (error) { + console.error("Error transcribing audio:", error); + return "[Audio: transcription failed]"; + } +} + +/** + * Converts audio parts in messages to text transcriptions. + * + * Scans all message content for audio file parts, transcribes them + * using OpenAI Whisper, and returns messages with audio converted to text. + * + * @param messages - Original messages potentially containing audio. + * @returns Messages with audio converted to text transcriptions. 
*/ +export async function transcribeAudioInMessages( + messages: CoreMessage[] +): Promise<CoreMessage[]> { + return await Promise.all( + messages.map(async (message) => { + if (message.role === "tool") { + return message; + } + + if (Array.isArray(message.content)) { + const textParts = await Promise.all( + message.content.map(async (part) => { + if (part.type === "text") return part.text; + if (part.type === "file" && part.mediaType?.startsWith("audio/")) { + return await transcribeAudio( + part.data as string, + part.mediaType + ); + } + return ""; + }) + ); + + const textContent = textParts.filter(Boolean).join(" "); + return { ...message, content: textContent || "[Audio message]" }; + } + return message; + }) + ); +} diff --git a/javascript/src/audio/types.ts b/javascript/src/audio/types.ts new file mode 100644 index 00000000..dfe8bde0 --- /dev/null +++ b/javascript/src/audio/types.ts @@ -0,0 +1,66 @@ +/** + * Audio types and utilities for voice-first-class support in Scenario. + * + * Provides core types for representing audio data and utilities for + * loading, encoding, and converting audio between formats. + */ + +/** + * Supported audio MIME types. + */ +export type AudioMimeType = + | "audio/wav" + | "audio/mp3" + | "audio/mpeg" + | "audio/ogg" + | "audio/webm" + | "audio/pcm"; + +/** + * Supported TTS voice options. + * Based on OpenAI TTS voices. + */ +export type Voice = "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer"; + +/** + * Represents audio data with metadata. + */ +export interface AudioData { + /** + * Base64-encoded audio data. + */ + data: string; + + /** + * Media type of the audio (MIME type). + * Uses 'mediaType' to match AI SDK conventions. + */ + mediaType: AudioMimeType; + + /** + * Optional transcript of the audio content. + */ + transcript?: string; + + /** + * Optional duration in milliseconds. + */ + durationMs?: number; +} + +/** + * Options for text-to-speech conversion. + */ +export interface TextToSpeechOptions { + /** + * Voice to use for synthesis. + * @default "nova" + */ + voice?: Voice; + + /** + * Output audio format. + * @default "wav" + */ + format?: "wav" | "mp3" | "opus" | "aac" | "flac" | "pcm"; +} diff --git a/javascript/src/audio/utils.ts b/javascript/src/audio/utils.ts new file mode 100644 index 00000000..1c103406 --- /dev/null +++ b/javascript/src/audio/utils.ts @@ -0,0 +1,84 @@ +/** + * Audio utility functions for loading, encoding, and converting audio. + */ +import * as fs from "fs"; +import * as path from "path"; +import type { AudioData, AudioMimeType } from "./types"; + +/** + * Detects the MIME type from a file extension. + * + * @param filePath - Path to the audio file. + * @returns The detected MIME type. + */ +export function detectMimeType(filePath: string): AudioMimeType { + const ext = path.extname(filePath).toLowerCase(); + const mimeTypes: Record<string, AudioMimeType> = { + ".wav": "audio/wav", + ".mp3": "audio/mp3", + ".mpeg": "audio/mpeg", + ".ogg": "audio/ogg", + ".webm": "audio/webm", + ".pcm": "audio/pcm", + }; + return mimeTypes[ext] ?? "audio/wav"; +} + +/** + * Loads audio from a file and returns AudioData. + * + * @param filePath - Path to the audio file. + * @returns AudioData with base64-encoded content. + */ +export function audioFromFile(filePath: string): AudioData { + const absolutePath = path.isAbsolute(filePath) + ? 
filePath + : path.resolve(process.cwd(), filePath); + + const buffer = fs.readFileSync(absolutePath); + const data = buffer.toString("base64"); + const mediaType = detectMimeType(filePath); + + return { data, mediaType }; +} + +/** + * Creates AudioData from a base64 string. + * + * @param data - Base64-encoded audio data. + * @param mediaType - Media type (MIME type) of the audio. + * @returns AudioData object. + */ +export function audioFromBase64( + data: string, + mediaType: AudioMimeType = "audio/wav" +): AudioData { + return { data, mediaType }; +} + +/** + * Creates AudioData from a Buffer. + * + * @param buffer - Audio data buffer. + * @param mediaType - Media type (MIME type) of the audio. + * @returns AudioData object. + */ +export function audioFromBuffer( + buffer: Buffer, + mediaType: AudioMimeType = "audio/wav" +): AudioData { + return { + data: buffer.toString("base64"), + mediaType, + }; +} + +/** + * Converts AudioData to a base64 data URI. + * + * @param audio - AudioData to convert. + * @returns Data URI string. + */ +export function audioToDataUri(audio: AudioData): string { + return `data:${audio.mediaType};base64,${audio.data}`; +} diff --git a/javascript/src/index.ts b/javascript/src/index.ts index 49230692..93551c69 100644 --- a/javascript/src/index.ts +++ b/javascript/src/index.ts @@ -1,4 +1,5 @@ import * as agents from "./agents"; +import * as audio from "./audio"; import * as domain from "./domain"; import * as execution from "./execution"; import * as runner from "./runner"; @@ -6,12 +7,15 @@ import * as script from "./script"; // Re-export all types and other named exports export * from "./agents"; +export * from "./audio"; export * from "./domain"; export * from "./execution"; export * from "./runner"; export * from "./script"; +export { StringUtils } from "./utils/string-utils"; type ScenarioApi = typeof agents & + typeof audio & typeof domain & typeof execution & typeof runner & @@ -19,6 +23,7 @@ type ScenarioApi = typeof agents & export const scenario: ScenarioApi = { ...agents, + ...audio, ...domain, ...execution, ...runner, diff --git a/javascript/src/script/index.ts b/javascript/src/script/index.ts index 86f4714f..8457ddf2 100644 --- a/javascript/src/script/index.ts +++ b/javascript/src/script/index.ts @@ -7,6 +7,8 @@ * and when scenarios should succeed or fail. */ import { CoreMessage } from "ai"; +import { textToSpeech } from "../audio/text-to-speech"; +import type { Voice } from "../audio/types"; import { ScenarioExecutionStateLike, ScriptStep } from "../domain"; /** @@ -23,6 +25,19 @@ export const message = (message: CoreMessage): ScriptStep => { return (_state, executor) => executor.message(message); }; +/** + * Script step type with optional speak method for TTS. + */ +interface SpeakableAgentStep extends ScriptStep { + /** + * Convert text to speech and send as audio message. + * + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + */ + speak: (options?: { voice?: Voice }) => ScriptStep; +} + /** * Generate or specify an agent response in the conversation. * @@ -33,11 +48,82 @@ export const message = (message: CoreMessage): ScriptStep => { * @param content Optional agent response content. Can be a string or full message object. * If undefined, the agent under test will generate content automatically. * @returns A ScriptStep function that can be used in scenario scripts. 
+ * + * @example + * ```typescript + * // Text message + * scenario.agent("Here's a recipe for you") + * + * // Audio message via TTS + * scenario.agent.speak("Here's a recipe for you") + * + * // Let agent generate + * scenario.agent() + * ``` */ -export const agent = (content?: string | CoreMessage): ScriptStep => { - return (_state, executor) => executor.agent(content); +const agentBase = ( + content?: string | CoreMessage +): ScriptStep | SpeakableAgentStep => { + const step: ScriptStep = (_state, executor) => executor.agent(content); + + // Only add .speak() when text content is provided + if (typeof content === "string") { + const speakableStep = step as SpeakableAgentStep; + speakableStep.speak = (options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(content, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "assistant", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; + }; + return speakableStep; + } + + return step; }; +/** + * Speak text as an agent audio message via TTS. + * + * @param text - Text to convert to speech. + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + * + * @example + * ```typescript + * scenario.agent.speak("Here's a recipe for you") + * ``` + */ +const agentSpeak = (text: string, options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(text, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "assistant", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; +}; + +/** + * Generate or specify an agent response in the conversation. + * + * Supports both text and audio output: + * - `scenario.agent("text")` - Send text message + * - `scenario.agent.speak("text")` - Send audio message via TTS + * - `scenario.agent()` - Let agent generate + */ +export const agent = Object.assign(agentBase, { speak: agentSpeak }); + /** * Invoke the judge agent to evaluate the current conversation state. * @@ -55,6 +141,19 @@ export const judge = (content?: string | CoreMessage): ScriptStep => { }; }; +/** + * Script step type with optional speak method for TTS. + */ +interface SpeakableUserStep extends ScriptStep { + /** + * Convert text to speech and send as audio message. + * + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + */ + speak: (options?: { voice?: Voice }) => ScriptStep; +} + /** * Generate or specify a user message in the conversation. * @@ -65,11 +164,82 @@ export const judge = (content?: string | CoreMessage): ScriptStep => { * @param content Optional user message content. Can be a string or full message object. * If undefined, the user simulator will generate content automatically. * @returns A ScriptStep function that can be used in scenario scripts. 
+ * + * @example + * ```typescript + * // Text message + * scenario.user("Hello") + * + * // Audio message via TTS + * scenario.user.speak("Hello") + * + * // Let user simulator generate + * scenario.user() + * ``` */ -export const user = (content?: string | CoreMessage): ScriptStep => { - return (_state, executor) => executor.user(content); +const userBase = ( + content?: string | CoreMessage +): ScriptStep | SpeakableUserStep => { + const step: ScriptStep = (_state, executor) => executor.user(content); + + // Only add .speak() when text content is provided + if (typeof content === "string") { + const speakableStep = step as SpeakableUserStep; + speakableStep.speak = (options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(content, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; + }; + return speakableStep; + } + + return step; }; +/** + * Speak text as a user audio message via TTS. + * + * @param text - Text to convert to speech. + * @param options - TTS options including voice. + * @returns A ScriptStep that sends audio. + * + * @example + * ```typescript + * scenario.user.speak("I need help with billing") + * ``` + */ +const userSpeak = (text: string, options?: { voice?: Voice }): ScriptStep => { + return async (_state, executor) => { + const audio = await textToSpeech(text, { voice: options?.voice }); + const audioMessage: CoreMessage = { + role: "user", + content: [ + { type: "text", text: "" }, + { type: "file", mediaType: audio.mediaType, data: audio.data }, + ], + }; + await executor.message(audioMessage); + }; +}; + +/** + * Generate or specify a user message in the conversation. + * + * Supports both text and audio output: + * - `scenario.user("text")` - Send text message + * - `scenario.user.speak("text")` - Send audio message via TTS + * - `scenario.user()` - Let user simulator generate + */ +export const user = Object.assign(userBase, { speak: userSpeak }); + /** * Let the scenario proceed automatically for a specified number of turns. * diff --git a/javascript/src/utils/string-utils.ts b/javascript/src/utils/string-utils.ts new file mode 100644 index 00000000..d3d115ee --- /dev/null +++ b/javascript/src/utils/string-utils.ts @@ -0,0 +1,26 @@ +/** + * String utility functions. + */ + +/** + * Converts a string to kebab-case. + * + * @param str - The string to convert. + * @returns The kebab-case string. + * + * @example + * ```typescript + * StringUtils.kebabCase("Hello World") // "hello-world" + * StringUtils.kebabCase("camelCase") // "camel-case" + * ``` + */ +export function kebabCase(str: string): string { + return str + .replace(/([a-z])([A-Z])/g, "$1-$2") + .replace(/[\s_]+/g, "-") + .toLowerCase(); +} + +export const StringUtils = { + kebabCase, +};
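
For reference, the pieces above compose into a single voice scenario. The following is a minimal sketch (not part of the diff), assuming an OpenAI API key is configured and reusing the `OpenAiVoiceAgent` helper from the example tests; the `VoiceSmokeAgent` class, test names, and criteria are illustrative only:

```typescript
import { openai } from "@ai-sdk/openai";
import scenario, { AgentRole } from "@langwatch/scenario";
import { describe, it, expect } from "vitest";
import { OpenAiVoiceAgent } from "./helpers/openai-voice-agent"; // helper from the vitest examples above

// Illustrative agent, modeled on the VoiceAgent used in the example tests.
class VoiceSmokeAgent extends OpenAiVoiceAgent {
  role: AgentRole = AgentRole.AGENT;
  constructor() {
    super({ systemPrompt: "Answer briefly and conversationally.", voice: "echo" });
  }
}

describe("voice primitives smoke test", () => {
  it("runs a short voice conversation end to end", async () => {
    const result = await scenario.run({
      name: "voice smoke test",
      description: "User asks a cooking question by voice, agent answers by voice",
      agents: [
        new VoiceSmokeAgent(),
        scenario.userSimulatorAgent({ voice: "nova" }), // voice sim: generated turns come back as TTS audio
        scenario.judgeAgent({
          model: openai("gpt-4o"),
          criteria: ["The agent answers the user's question"],
          audio: true, // judge evaluates the audio directly via a multimodal model
        }),
      ],
      script: [
        scenario.user.speak("What's a quick vegetarian dinner idea?"), // fixed user turn via TTS
        scenario.agent(), // agent replies with audio
        scenario.user(),  // voice sim generates a follow-up as audio
        scenario.agent(),
        scenario.judge(),
      ],
    });

    expect(result.success).toBe(true);
  });
});
```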