diff --git a/.cursor/rules/file-structure.mdc b/.cursor/rules/file-structure.mdc index f526318e..cb0924d5 100644 --- a/.cursor/rules/file-structure.mdc +++ b/.cursor/rules/file-structure.mdc @@ -6,7 +6,14 @@ alwaysApply: false We want thin files that aim to have a single responsibility and a single export. We should have a single export per file. -Specific directories: -- hooks: for hooks -- components: for components -- pages: for pages \ No newline at end of file +Special files: + +## Utils +File name convention: *.utils.ts +Single export: XXXUtils +example: + +export const RunnerUtils = { + runnerFunction1, + runnerFunction2, +} \ No newline at end of file diff --git a/javascript/src/domain/scenarios/index.ts b/javascript/src/domain/scenarios/index.ts index f781c637..3fe3cc92 100644 --- a/javascript/src/domain/scenarios/index.ts +++ b/javascript/src/domain/scenarios/index.ts @@ -1,6 +1,7 @@ -import { CoreMessage } from "ai"; +import { ScenarioExecution, ScenarioExecutionState } from "../.."; import { AgentAdapter } from "../agents/index"; -import { ScenarioExecutionStateLike, ScenarioResult } from "../core/execution"; +import { ScenarioExecutionStateLike } from "../core/execution"; +export * from "./script-commands"; export const DEFAULT_MAX_TURNS = 10; export const DEFAULT_VERBOSE = false; @@ -86,75 +87,11 @@ export interface ScenarioConfigFinal setId?: string; } -/** - * The execution context for a scenario script. - * This provides the functions to control the flow of the scenario. - */ -export interface ScenarioExecutionLike { - /** - * The history of messages in the conversation. - */ - readonly messages: CoreMessage[]; - - /** - * The ID of the conversation thread. - */ - readonly threadId: string; - - /** - * Adds a message to the conversation. - * @param message The message to add. - */ - message(message: CoreMessage): Promise; - /** - * Adds a user message to the conversation. - * If no content is provided, the user simulator will generate a message. 
- * @param content The content of the user message. - */ - user(content?: string | CoreMessage): Promise; - /** - * Adds an agent message to the conversation. - * If no content is provided, the agent under test will generate a message. - * @param content The content of the agent message. - */ - agent(content?: string | CoreMessage): Promise; - /** - * Invokes the judge agent to evaluate the current state. - * @param content Optional message to the judge. - * @returns The result of the scenario if the judge makes a final decision. - */ - judge(content?: string | CoreMessage): Promise; - /** - * Proceeds with the scenario automatically for a number of turns. - * @param turns The number of turns to proceed. Defaults to running until the scenario ends. - * @param onTurn Optional callback executed at the end of each turn. - * @param onStep Optional callback executed after each agent interaction. - * @returns The result of the scenario if it ends. - */ - proceed( - turns?: number, - onTurn?: (state: ScenarioExecutionStateLike) => void | Promise, - onStep?: (state: ScenarioExecutionStateLike) => void | Promise - ): Promise; - /** - * Ends the scenario with a success. - * @param reasoning Optional reasoning for the success. - * @returns The final result of the scenario. - */ - succeed(reasoning?: string): Promise; - /** - * Ends the scenario with a failure. - * @param reasoning Optional reasoning for the failure. - * @returns The final result of the scenario. - */ - fail(reasoning?: string): Promise; -} - /** * A step in a scenario script. * This is a function that takes the current state and an executor, and performs an action. 
*/ export type ScriptStep = ( - state: ScenarioExecutionStateLike, - executor: ScenarioExecutionLike + state: ScenarioExecutionState, + executor: ScenarioExecution ) => Promise | void; diff --git a/javascript/src/domain/scenarios/script-commands.ts b/javascript/src/domain/scenarios/script-commands.ts new file mode 100644 index 00000000..d0e638b2 --- /dev/null +++ b/javascript/src/domain/scenarios/script-commands.ts @@ -0,0 +1,80 @@ +import { CoreMessage } from "ai"; +import { ScenarioExecutionStateLike } from "../index"; + +/** + * Represents a typed script command that describes a specific action to take + * during scenario execution. Commands are data structures that can be serialized + * and provide type safety for scenario scripts. + */ +export type ScriptCommand = + | MessageCommand + | UserCommand + | AgentCommand + | JudgeCommand + | ProceedCommand + | SucceedCommand + | FailCommand; + +/** + * Command to add a specific message directly to the conversation. + * Useful for simulating tool responses, system messages, or specific conversational states. + */ +export interface MessageCommand { + readonly type: "message"; + readonly message: CoreMessage; +} + +/** + * Command to generate or specify a user message in the conversation. + * If content is not provided, the user simulator agent will generate content automatically. + */ +export interface UserCommand { + readonly type: "user"; + readonly content?: string | CoreMessage; +} + +/** + * Command to generate or specify an agent response in the conversation. + * If content is not provided, the agent under test will generate content automatically. + */ +export interface AgentCommand { + readonly type: "agent"; + readonly content?: string | CoreMessage; +} + +/** + * Command to invoke the judge agent to evaluate the current conversation state. + * The judge will evaluate based on its configured criteria and may end the scenario. 
+ */ +export interface JudgeCommand { + readonly type: "judge"; + readonly content?: string | CoreMessage; +} + +/** + * Command to let the scenario proceed automatically for a specified number of turns. + * Agents will interact naturally according to their roles until the turn limit is reached + * or the judge decides to end the scenario. + */ +export interface ProceedCommand { + readonly type: "proceed"; + readonly turns?: number; + readonly onTurn?: (state: ScenarioExecutionStateLike) => void | Promise; + readonly onStep?: (state: ScenarioExecutionStateLike) => void | Promise; +} + +/** + * Command to immediately end the scenario with a success verdict. + */ +export interface SucceedCommand { + readonly type: "succeed"; + readonly reasoning?: string; +} + +/** + * Command to immediately end the scenario with a failure verdict. + */ +export interface FailCommand { + readonly type: "fail"; + readonly reasoning?: string; +} diff --git a/javascript/src/execution/scenario-execution-state.ts b/javascript/src/execution/scenario-execution-state.ts index e2bae533..e7fa84de 100644 --- a/javascript/src/execution/scenario-execution-state.ts +++ b/javascript/src/execution/scenario-execution-state.ts @@ -5,7 +5,7 @@ import { CoreUserMessage, } from "ai"; import { Observable, Subject } from "rxjs"; -import { ScenarioExecutionStateLike, ScenarioConfig } from "../domain"; +import { ScenarioConfig, ScenarioResult } from "../domain"; import { generateMessageId } from "../utils/ids"; // Generic enum - ready for extension @@ -26,7 +26,7 @@ export type StateChangeEvent = { * the internal logic for tracking conversation history, turns, results, and * other related information. 
*/ -export class ScenarioExecutionState implements ScenarioExecutionStateLike { +export class ScenarioExecutionState { private _messages: (CoreMessage & { id: string })[] = []; private _currentTurn: number = 0; private _threadId: string = ""; @@ -38,6 +38,7 @@ export class ScenarioExecutionState implements ScenarioExecutionStateLike { description: string; config: ScenarioConfig; + result?: ScenarioResult; constructor(config: ScenarioConfig) { this.config = config; @@ -52,10 +53,6 @@ export class ScenarioExecutionState implements ScenarioExecutionStateLike { return this._currentTurn; } - set currentTurn(turn: number) { - this._currentTurn = turn; - } - get threadId(): string { return this._threadId; } @@ -64,6 +61,10 @@ export class ScenarioExecutionState implements ScenarioExecutionStateLike { this._threadId = value; } + incrementTurn(): void { + this._currentTurn++; + } + /** * Adds a message to the conversation history. * diff --git a/javascript/src/execution/scenario-execution.ts b/javascript/src/execution/scenario-execution.ts index 2b2f26a3..b0b0202a 100644 --- a/javascript/src/execution/scenario-execution.ts +++ b/javascript/src/execution/scenario-execution.ts @@ -1,24 +1,23 @@ -import { ModelMessage } from "ai"; +import { CoreMessage, ModelMessage } from "ai"; import { filter, Observable, Subject } from "rxjs"; import { ScenarioExecutionState, StateChangeEventType, } from "./scenario-execution-state"; +import { JudgeResult } from "../agents"; import { type ScenarioResult, type ScenarioConfig, AgentRole, - type AgentInput, - type ScriptStep, type AgentReturnTypes, - type ScenarioExecutionLike, + type ScriptStep, type AgentAdapter, - JudgeAgentAdapter, - ScenarioExecutionStateLike, ScenarioConfigFinal, DEFAULT_MAX_TURNS, DEFAULT_VERBOSE, + JudgeAgentAdapter, } from "../domain"; +import { ScenarioExecutionUtils } from "./scenario-execution.utils"; import { ScenarioEvent, ScenarioEventType, @@ -35,123 +34,41 @@ import { generateThreadId, getBatchRunId, } from 
"../utils/ids"; -import { Logger } from "../utils/logger"; /** * Manages the execution of a single scenario test. * - * This class orchestrates the interaction between agents (user simulator, agent under test, - * and judge), executes the test script step-by-step, and manages the scenario's state - * throughout execution. It also emits events that can be subscribed to for real-time - * monitoring of the scenario's progress. - * - * ## Execution Flow Overview + * Orchestrates interaction between agents (user simulator, agent under test, and judge), + * executes test scripts step-by-step, and manages scenario state. Emits events for + * real-time monitoring of scenario progress. * - * The execution follows a turn-based system where agents take turns responding. The key - * concepts are: - * - **Script Steps**: Functions in the scenario script like `user()`, `agent()`, `proceed()`, etc. - * - **Agent Interactions**: Individual agent responses that occur when an agent takes their turn + * Key concepts: + * - **Script Steps**: Functions like `user()`, `agent()`, `proceed()`, etc. + * - **Agent Interactions**: Individual agent responses within turns * - **Turns**: Groups of agent interactions that happen in sequence * - * ## Message Broadcasting System - * - * The class implements a sophisticated message broadcasting system that ensures all agents - * can "hear" each other's messages: - * - * 1. **Message Creation**: When an agent sends a message, it's added to the conversation history - * 2. **Broadcasting**: The message is immediately broadcast to all other agents via `broadcastMessage()` - * 3. **Queue Management**: Each agent has a pending message queue (`pendingMessages`) that stores - * messages from other agents - * 4. **Agent Input**: When an agent is called, it receives both the full conversation history - * and any new pending messages that have been broadcast to it - * 5. 
**Queue Clearing**: After an agent processes its pending messages, its queue is cleared - * - * This creates a realistic conversation environment where agents can respond contextually - * to the full conversation history and any new messages from other agents. - * - * ## Example Message Flow - * - * ``` - * Turn 1: - * 1. User Agent sends: "Hello" - * - Added to conversation history - * - Broadcast to Agent and Judge (pendingMessages[1] = ["Hello"], pendingMessages[2] = ["Hello"]) - * - * 2. Agent is called: - * - Receives: full conversation + pendingMessages[1] = ["Hello"] - * - Sends: "Hi there! How can I help you?" - * - Added to conversation history - * - Broadcast to User and Judge (pendingMessages[0] = ["Hi there!..."], pendingMessages[2] = ["Hello", "Hi there!..."]) - * - pendingMessages[1] is cleared - * - * 3. Judge is called: - * - Receives: full conversation + pendingMessages[2] = ["Hello", "Hi there!..."] - * - Evaluates and decides to continue - * - pendingMessages[2] is cleared - * ``` - * - * Each script step can trigger one or more agent interactions depending on the step type. - * For example, a `proceed(5)` step might trigger 10 agent interactions across 5 turns. - * - * Note: This is an internal class. Most users will interact with the higher-level - * `scenario.run()` function instead of instantiating this class directly. + * Note: This is an internal class. Most users should use the higher-level + * `scenario.run()` function instead. * * @example * ```typescript - * import scenario from "@langwatch/scenario"; - * - * // This is a simplified example of what `scenario.run` does internally. 
* const result = await scenario.run({ - * name: "My First Scenario", - * description: "A simple test of the agent's greeting.", - * agents: [ - * scenario.userSimulatorAgent(), - * scenario.judgeAgent({ - * criteria: ["Agent should respond with a greeting"], - * }), - * ], - * script: [ - * scenario.user("Hello"), // Script step 1: triggers 1 agent interaction - * scenario.agent(), // Script step 2: triggers 1 agent interaction - * scenario.proceed(3), // Script step 3: triggers multiple agent interactions - * scenario.judge(), // Script step 4: triggers 1 agent interaction - * ] + * name: "Greeting Test", + * description: "Test agent's greeting response", + * agents: [userSimulatorAgent(), judgeAgent({ criteria: ["Agent greets user"] })], + * script: [scenario.user("Hello"), scenario.agent(), scenario.judge()] * }); - * - * console.log("Scenario result:", result.success); * ``` */ -export class ScenarioExecution implements ScenarioExecutionLike { +export class ScenarioExecution { /** The current state of the scenario execution */ private state: ScenarioExecutionState; - /** The final result of the scenario execution, set when a conclusion is reached */ - private _result?: ScenarioResult; - /** Logger for debugging and monitoring */ - private logger = new Logger("scenario.execution.ScenarioExecution"); /** Finalized configuration with all defaults applied */ private config: ScenarioConfigFinal; - /** Array of all agents participating in the scenario */ - private agents: AgentAdapter[] = []; - - /** Roles that still need to act in the current turn (USER, AGENT, JUDGE) */ - private pendingRolesOnTurn: AgentRole[] = []; - - /** Agents that still need to act in the current turn */ - private pendingAgentsOnTurn: Set = new Set(); - - /** - * Message queues for each agent. When an agent sends a message, it gets - * broadcast to all other agents' pending message queues. When an agent - * is called, it receives these pending messages as part of its input. 
- * - * Key: agent index, Value: array of pending messages for that agent - */ - private pendingMessages: Map = new Map(); - /** Accumulated execution time for each agent (for performance tracking) */ private agentTimes: Map = new Map(); @@ -198,44 +115,14 @@ export class ScenarioExecution implements ScenarioExecutionLike { } /** - * Gets the complete conversation history as an array of messages. - * - * @returns Array of ModelMessage objects representing the full conversation - */ - get messages(): ModelMessage[] { - return this.state.messages; - } - - /** - * Gets the unique identifier for the conversation thread. - * This ID is used to maintain conversation context across multiple runs. - * - * @returns The thread identifier string - */ - get threadId(): string { - return this.state.threadId; - } - - /** - * Gets the result of the scenario execution if it has been set. + * Sets the final scenario result. * - * @returns The scenario result or undefined if not yet set - */ - get result(): ScenarioResult | undefined { - return this._result; - } - - /** - * Sets the result of the scenario execution. - * This is called when the scenario reaches a conclusion (success or failure). - * Automatically includes messages, totalTime, and agentTime from the current execution context. 
- * - * @param result - The final scenario result (without messages/timing, which will be added automatically) + * @param result - Result data (messages/timing added automatically) */ private setResult( result: Omit ): void { - const agentRoleAgentsIdx = this.agents + const agentRoleAgentsIdx = this.config.agents .map((agent, i) => ({ agent, idx: i })) .filter(({ agent }) => agent.role === AgentRole.AGENT) .map(({ idx }) => idx); @@ -246,7 +133,7 @@ export class ScenarioExecution implements ScenarioExecutionLike { const totalAgentTime = agentTimes.reduce((sum, time) => sum + time, 0); - this._result = { + this.state.result = { ...result, messages: this.state.messages, totalTime: this.totalTime, @@ -264,28 +151,11 @@ export class ScenarioExecution implements ScenarioExecutionLike { /** * Executes the entire scenario from start to finish. * - * This method runs through all script steps sequentially until a final result - * (success, failure, or error) is determined. Each script step can trigger one or - * more agent interactions depending on the step type: - * - `user()` and `agent()` steps typically trigger one agent interaction each - * - `proceed()` steps can trigger multiple agent interactions across multiple turns - * - `judge()` steps trigger the judge agent to evaluate the conversation - * - `succeed()` and `fail()` steps immediately end the scenario + * Runs through all script steps until completion. Each step can trigger different + * agent interactions. Stops early on final result, max turns reached, or errors. 
* - * The execution will stop early if: - * - A script step returns a ScenarioResult - * - The maximum number of turns is reached - * - An error occurs during execution - * - * @returns A promise that resolves with the final result of the scenario - * @throws Error if an unhandled exception occurs during execution - * - * @example - * ```typescript - * const execution = new ScenarioExecution(config, script); - * const result = await execution.execute(); - * console.log(`Scenario ${result.success ? 'passed' : 'failed'}`); - * ``` + * @returns Promise resolving to the final scenario result + * @throws Error on unhandled exceptions during execution */ async execute(): Promise { this.reset(); @@ -303,40 +173,34 @@ export class ScenarioExecution implements ScenarioExecutionLike { }); try { - // Execute script steps - pass the execution context (this), not just state - for (let i = 0; i < this.config.script.length; i++) { - const scriptStep = this.config.script[i]; - - await this.executeScriptStep(scriptStep, i); - - if (this.result) { - this.emitRunFinished({ - scenarioRunId, - status: this.result.success - ? 
ScenarioRunStatus.SUCCESS - : ScenarioRunStatus.FAILED, - result: this.result, - }); - - return this.result; + // If no script is provided, proceed until max turns + if (this.config.script.length === 0) { + await this.proceed(DEFAULT_MAX_TURNS); + } else { + // Execute provided script + for (let i = 0; i < this.config.script.length; i++) { + const scriptStep = this.config.script[i]; + await scriptStep(this.state, this); + + if (this.state.result) break; } } - // If no conclusion reached, set max turns error - this.reachedMaxTurns( - [ - "Reached end of script without conclusion, add one of the following to the end of the script:", - "- `Scenario.proceed()` to let the simulation continue to play out", - "- `Scenario.judge()` to force criteria judgement", - "- `Scenario.succeed()` or `Scenario.fail()` to end the test with an explicit result", - ].join("\n") - ); + if (!this.state.result) { + throw new Error("No result found after executing script"); + } - this.emitRunFinished({ scenarioRunId, status: ScenarioRunStatus.FAILED }); + this.emitRunFinished({ + scenarioRunId, + status: this.state.result.success + ? ScenarioRunStatus.SUCCESS + : ScenarioRunStatus.FAILED, + result: this.state.result, + }); - return this.result!; + return this.state.result!; } catch (error) { - const errorInfo = extractErrorInfo(error); + const errorInfo = ScenarioExecutionUtils.extractErrorInfo(error); this.setResult({ success: false, @@ -346,12 +210,6 @@ export class ScenarioExecution implements ScenarioExecutionLike { error: JSON.stringify(errorInfo), }); - this.emitRunFinished({ - scenarioRunId, - status: ScenarioRunStatus.ERROR, - result: this.result!, - }); - // Re-throw the error in case it was a vitest assertion error throw error; } finally { @@ -361,366 +219,165 @@ export class ScenarioExecution implements ScenarioExecutionLike { } /** - * Executes a single agent interaction in the scenario. 
- * - * This method is for manual step-by-step execution of the scenario, where each call - * represents one agent taking their turn. This is different from script steps (like - * `user()`, `agent()`, `proceed()`, etc.) which are functions in the scenario script. - * - * Each call to this method will: - * - Progress to the next turn if needed - * - Find the next agent that should act - * - Execute that agent's response - * - Set the result if the scenario concludes - * - * Note: This method is primarily for debugging or custom execution flows. Most users - * will use `execute()` to run the entire scenario automatically. - * - * After calling this method, check `this.result` to see if the scenario has concluded. + * Adds a message to the conversation history. * - * @example - * ```typescript - * const execution = new ScenarioExecution(config, script); + * Appends the message directly to the scenario state; no agent is invoked. * - * // Execute one agent interaction at a time - * await execution.step(); - * if (execution.result) { - * console.log('Scenario finished:', execution.result.success); - * } - * ``` + * @param message - The message to add */ - async step(): Promise { - await this._step(); - } - - private async _step( - goToNextTurn: boolean = true, - onTurn?: (state: ScenarioExecutionStateLike) => void | Promise - ): Promise { - if (this.pendingRolesOnTurn.length === 0) { - if (!goToNextTurn) return; - - this.newTurn(); - - if (onTurn) await onTurn(this.state); - - if (this.state.currentTurn >= this.config.maxTurns) { - this.reachedMaxTurns(); - return; - } - } - - const currentRole = this.pendingRolesOnTurn[0]; - const { idx, agent: nextAgent } = this.nextAgentForRole(currentRole); - if (!nextAgent) { - this.removePendingRole(currentRole); - return this._step(goToNextTurn, onTurn); - } - - this.removePendingAgent(nextAgent); - - await this.callAgent(idx, currentRole); + async message(message: ModelMessage): Promise { + this.state.addMessage(message); } /** - * Calls a specific 
agent to generate a response or make a decision. - * - * This method is the core of agent interaction. It prepares the agent's input - * by combining the conversation history with any pending messages that have been - * broadcast to this agent, then calls the agent and processes its response. - * - * The agent input includes: - * - Full conversation history (this.state.messages) - * - New messages that have been broadcast to this agent (this.pendingMessages.get(idx)) - * - The role the agent is being asked to play - * - Whether this is a judgment request (for judge agents) - * - Current scenario state and configuration + * Executes a user turn in the conversation. * - * After the agent responds: - * - Performance timing is recorded - * - Pending messages for this agent are cleared (they've been processed) - * - If the agent returns a ScenarioResult, it's set on this.result - * - Otherwise, the agent's messages are added to the conversation and broadcast + * Uses provided content or calls user simulator agent to generate a response. * - * @param idx - The index of the agent in the agents array - * @param role - The role the agent is being asked to play (USER, AGENT, or JUDGE) - * @param judgmentRequest - Whether this is a judgment request (for judge agents) - * @throws Error if the agent call fails + * @param content - Optional user message content (string or ModelMessage) */ - private async callAgent( - idx: number, - role: AgentRole, - judgmentRequest: boolean = false - ): Promise { - const agent = this.agents[idx]; - const startTime = Date.now(); - const agentInput: AgentInput = { - threadId: this.state.threadId, - messages: this.state.messages, - newMessages: this.pendingMessages.get(idx) ?? 
[], - requestedRole: role, - judgmentRequest: judgmentRequest, - scenarioState: this.state, - scenarioConfig: this.config, - }; - - try { - const agentResponse = await agent.call(agentInput); - const endTime = Date.now(); - - this.addAgentTime(idx, endTime - startTime); - this.pendingMessages.delete(idx); - - if ( - agentResponse && - typeof agentResponse === "object" && - "success" in agentResponse - ) { - // JudgeResult is automatically augmented with messages by setResult - this.setResult(agentResponse); - return; - } - - const currentAgentTime = this.agentTimes.get(idx) ?? 0; - this.agentTimes.set(idx, currentAgentTime + (Date.now() - startTime)); - - const messages = convertAgentReturnTypesToMessages( - agentResponse, - role === AgentRole.USER ? "user" : "assistant" - ); - - for (const message of messages) { - this.state.addMessage(message); - this.broadcastMessage(message, idx); - } - } catch (error) { - this.logger.error( - `[${this.config.id}] Error calling agent ${agent.constructor.name}`, - { - error: error instanceof Error ? error.message : String(error), - agent: agent.constructor.name, - agentInput, - } + async user(content?: string | ModelMessage): Promise { + const userAgent = this.findAgent(AgentRole.USER); + if (!userAgent) { + throw new Error( + "No user simulator agent configured. Add scenario.userSimulatorAgent() to agents." ); - - throw error; } - } - /** - * Adds a message to the conversation history. - * - * This method is part of the ScenarioExecutionLike interface used by script steps. - * It automatically routes the message to the appropriate agent based on the message role: - * - "user" messages are routed to USER role agents - * - "assistant" messages are routed to AGENT role agents - * - Other message types are added directly to the conversation - * - * @param message - The ModelMessage to add to the conversation - * - * @example - * ```typescript - * await execution.message({ - * role: "user", - * content: "Hello, how are you?" 
- * }); - * ``` - */ - async message(message: ModelMessage): Promise { - if (message.role === "user") { - await this.scriptCallAgent(AgentRole.USER, message); - } else if (message.role === "assistant") { - await this.scriptCallAgent(AgentRole.AGENT, message); + // If content is a string, add it to the conversation as ModelMessage (awaited so failures propagate) + if (typeof content === "string") { + await this.message({ role: "user" as const, content }); + // If content is a ModelMessage but not a user message, throw an error + } else if (content && content.role !== "user") { + throw new Error("Content must be a user message"); + // If content is a ModelMessage and is a user message, add it to the conversation + } else if (content) { + await this.message(content); + // Let the user simulator agent generate the content } else { - this.state.addMessage(message); - this.broadcastMessage(message); + const response = await userAgent.call({ + threadId: this.state.threadId, + messages: this.state.messages, + scenarioState: this.state, + scenarioConfig: this.config, + }); + this.handleResponse(response, "user"); } - } - /** - * Executes a user turn in the conversation. - * - * If content is provided, it's used directly as the user's message. If not provided, - * the user simulator agent is called to generate an appropriate response based on - * the current conversation context. - * - * This method is part of the ScenarioExecutionLike interface used by script steps. - * - * @param content - Optional content for the user's message. Can be a string or ModelMessage. - * If not provided, the user simulator agent will generate the content. 
- * - * @example - * ```typescript - * // Use provided content - * await execution.user("What's the weather like?"); - * - * // Let user simulator generate content - * await execution.user(); - * - * // Use a ModelMessage object - * await execution.user({ - * role: "user", - * content: "Tell me a joke" - * }); - * ``` - */ - async user(content?: string | ModelMessage): Promise { - await this.scriptCallAgent(AgentRole.USER, content); + this.state.incrementTurn(); } /** * Executes an agent turn in the conversation. * - * If content is provided, it's used directly as the agent's response. If not provided, - * the agent under test is called to generate a response based on the current conversation - * context and any pending messages. - * - * This method is part of the ScenarioExecutionLike interface used by script steps. - * - * @param content - Optional content for the agent's response. Can be a string or ModelMessage. - * If not provided, the agent under test will generate the response. - * - * @example - * ```typescript - * // Let agent generate response - * await execution.agent(); - * - * // Use provided content - * await execution.agent("The weather is sunny today!"); + * Uses provided content or calls agent under test to generate a response. * - * // Use a ModelMessage object - * await execution.agent({ - * role: "assistant", - * content: "I'm here to help you with weather information." - * }); - * ``` + * @param content - Optional agent response content (string or ModelMessage) */ async agent(content?: string | ModelMessage): Promise { - await this.scriptCallAgent(AgentRole.AGENT, content); + const agent = this.findAgent(AgentRole.AGENT); + if (!agent) { + throw new Error( + "No agent under test configured. Add your agent to agents." 
+ ); + } + + // If content is a string, add it to the conversation as ModelMessage (awaited so failures propagate) + if (typeof content === "string") { + await this.message({ role: "assistant" as const, content }); + // If content is a ModelMessage but not an assistant message, throw an error + } else if (content && content.role !== "assistant") { + throw new Error("Content must be a assistant message"); + // If content is a ModelMessage and is an assistant message, add it to the conversation + } else if (content) { + await this.message(content); + } else { + const response = await agent.call({ + threadId: this.state.threadId, + messages: this.state.messages, + scenarioState: this.state, + scenarioConfig: this.config, + }); + this.handleResponse(response, "assistant"); + } + + this.state.incrementTurn(); } /** - * Invokes the judge agent to evaluate the current state of the conversation. - * - * The judge agent analyzes the conversation history and determines whether the - * scenario criteria have been met. This can result in either: - * - A final scenario result (success/failure) if the judge makes a decision - * - Null if the judge needs more information or conversation to continue + * Invokes the judge agent to evaluate the current conversation. * - * This method is part of the ScenarioExecutionLike interface used by script steps. + * Judge analyzes conversation history against criteria and returns final result + * if decision can be made, or null to continue conversation. * - * @param content - Optional message to pass to the judge agent for additional context - * @returns A promise that resolves with: - * - ScenarioResult if the judge makes a final decision, or - * - Null if the conversation should continue - * - * @example - * ```typescript - * // Let judge evaluate current state - * const result = await execution.judge(); - * if (result) { - * console.log(`Judge decided: ${result.success ? 
'pass' : 'fail'}`); - * } - * - * // Provide additional context to judge - * const result = await execution.judge("Please consider the user's satisfaction level"); - * ``` + * @param _content - Optional additional context for judge (accepted but not forwarded to the judge call) + * @returns Promise resolving to ScenarioResult or null */ - async judge(content?: string | ModelMessage): Promise { - return await this.scriptCallAgent(AgentRole.JUDGE, content, true); + async judge(_content?: string | CoreMessage): Promise { + const judge = this.findAgent(AgentRole.JUDGE) as JudgeAgentAdapter; + + if (!judge) { + throw new Error( + "No judge agent configured. Add scenario.judgeAgent() to agents." + ); + } + + const response = await judge.call({ + threadId: this.state.threadId, + messages: this.state.messages, + scenarioState: this.state, + scenarioConfig: this.config, + }); + + if (response && typeof response === "object" && "success" in response) { + this.setResult(response as JudgeResult); + return this.state.result!; + } + + return null; } /** - * Lets the scenario proceed automatically for a specified number of turns. - * - * This method is a script step that simulates natural conversation flow by allowing - * agents to interact automatically without explicit script steps. It can trigger - * multiple agent interactions across multiple turns, making it useful for testing - * scenarios where you want to see how agents behave in extended conversations. + * Lets the scenario proceed automatically for specified turns. * - * Unlike other script steps that typically trigger one agent interaction each, - * this step can trigger many agent interactions depending on the number of turns - * and the agents' behavior. + * Simulates natural conversation flow with user→agent→judge sequence. + * Continues until turns reached, final result determined, or max turns hit. 
* - * The method will continue until: - * - The specified number of turns is reached - * - A final scenario result is determined - * - The maximum turns limit is reached - * - * @param turns - The number of turns to proceed. If undefined, runs until a conclusion - * or max turns is reached - * @param onTurn - Optional callback executed at the end of each turn. Receives the - * current execution state - * @param onStep - Optional callback executed after each agent interaction. Receives - * the current execution state - * @returns A promise that resolves with: - * - ScenarioResult if a conclusion is reached during the proceeding, or - * - Null if the specified turns complete without conclusion - * - * @example - * ```typescript - * // Proceed for 5 turns - * const result = await execution.proceed(5); - * - * // Proceed until conclusion with callbacks - * const result = await execution.proceed( - * undefined, - * (state) => console.log(`Turn ${state.currentTurn} completed`), - * (state) => console.log(`Agent interaction completed, ${state.messages.length} messages`) - * ); - * ``` + * @param turns - Number of turns to proceed (optional) + * @returns Promise resolving to ScenarioResult or null */ async proceed( turns?: number, - onTurn?: (state: ScenarioExecutionStateLike) => void | Promise, - onStep?: (state: ScenarioExecutionStateLike) => void | Promise + _onTurn?: (state: ScenarioExecutionState) => void | Promise, + _onStep?: (state: ScenarioExecutionState) => void | Promise ): Promise { - let initialTurn = this.state.currentTurn; - - while (true) { - const goToNextTurn = - turns === void 0 || - initialTurn === null || - (this.state.currentTurn != null && - this.state.currentTurn + 1 < initialTurn + turns); - await this._step(goToNextTurn, onTurn); - - if (initialTurn === null) initialTurn = this.state.currentTurn; - - if (this.result) { - return this.result; - } + const maxTurns = turns ?? 
DEFAULT_MAX_TURNS; + /** + * Normally, a turn is a user, agent, judge sequence. + * However, if the last message is a user message, we can skip the user turn. + */ + const shouldSkipUserTurn = Boolean( + this.state.lastMessage()?.role === "user" + ); - if (onStep) await onStep(this.state); + while (this.state.currentTurn < maxTurns) { + if (!shouldSkipUserTurn) await this.user(); + await this.agent(); + const result = await this.judge(); - if (!goToNextTurn) { - return null; - } + if (result) return result; // Judge stopped execution } + + return null; // Reached max turns } /** * Immediately ends the scenario with a success verdict. * - * This method forces the scenario to end successfully, regardless of the current - * conversation state. It's useful for scenarios where you want to explicitly - * mark success based on specific conditions or external factors. - * - * This method is part of the ScenarioExecutionLike interface used by script steps. - * - * @param reasoning - Optional explanation for why the scenario is being marked as successful - * @returns A promise that resolves with the final successful scenario result - * - * @example - * ```typescript - * // Mark success with default reasoning - * const result = await execution.succeed(); + * Forces scenario to end successfully regardless of current state. * - * // Mark success with custom reasoning - * const result = await execution.succeed( - * "User successfully completed the onboarding flow" - * ); - * ``` + * @param reasoning - Optional explanation for success + * @returns Promise resolving to successful scenario result */ async succeed(reasoning?: string): Promise { this.setResult({ @@ -730,31 +387,16 @@ export class ScenarioExecution implements ScenarioExecutionLike { metCriteria: [], unmetCriteria: [], }); - return this.result!; + return this.state.result!; } /** * Immediately ends the scenario with a failure verdict. 
* - * This method forces the scenario to end with failure, regardless of the current - * conversation state. It's useful for scenarios where you want to explicitly - * mark failure based on specific conditions or external factors. + * Forces scenario to end with failure regardless of current state. * - * This method is part of the ScenarioExecutionLike interface used by script steps. - * - * @param reasoning - Optional explanation for why the scenario is being marked as failed - * @returns A promise that resolves with the final failed scenario result - * - * @example - * ```typescript - * // Mark failure with default reasoning - * const result = await execution.fail(); - * - * // Mark failure with custom reasoning - * const result = await execution.fail( - * "Agent failed to provide accurate weather information" - * ); - * ``` + * @param reasoning - Optional explanation for failure + * @returns Promise resolving to failed scenario result */ async fail(reasoning?: string): Promise { this.setResult({ @@ -763,25 +405,14 @@ export class ScenarioExecution implements ScenarioExecutionLike { metCriteria: [], unmetCriteria: [], }); - return this.result!; + return this.state.result!; } /** - * Adds execution time for a specific agent to the performance tracking. - * - * This method is used internally to track how long each agent takes to respond, - * which is included in the final scenario result for performance analysis. - * The accumulated time for each agent is used to calculate total agent response - * times in the scenario result. - * - * @param agentIdx - The index of the agent in the agents array - * @param time - The execution time in milliseconds to add to the agent's total + * Adds execution time for a specific agent to performance tracking. 
* - * @example - * ```typescript - * // This is typically called internally by the execution engine - * execution.addAgentTime(0, 1500); // Agent at index 0 took 1.5 seconds - * ``` + * @param agentIdx - Index of the agent in agents array + * @param time - Execution time in milliseconds */ addAgentTime(agentIdx: number, time: number): void { const currentTime = this.agentTimes.get(agentIdx) || 0; @@ -810,214 +441,15 @@ export class ScenarioExecution implements ScenarioExecutionLike { * decision, or null if the conversation should continue * @throws Error if no agent is found for the specified role */ - private async scriptCallAgent( - role: AgentRole, - content?: string | ModelMessage, - judgmentRequest: boolean = false - ): Promise { - this.consumeUntilRole(role); - - let index = -1; - let agent: AgentAdapter | null = null; - - let nextAgent = this.getNextAgentForRole(role); - if (!nextAgent) { - this.newTurn(); - this.consumeUntilRole(role); - - nextAgent = this.getNextAgentForRole(role); - } - - if (!nextAgent) { - let roleClass = ""; - switch (role) { - case AgentRole.USER: - roleClass = "a scenario.userSimulatorAgent()"; - break; - case AgentRole.AGENT: - roleClass = "a scenario.agent()"; - break; - case AgentRole.JUDGE: - roleClass = "a scenario.judgeAgent()"; - break; - - default: - roleClass = "your agent"; - } - - if (content) - throw new Error( - `Cannot generate a message for role \`${role}\` with content \`${content}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list` - ); - - throw new Error( - `Cannot generate a message for role \`${role}\` because no agent with this role was found, please add ${roleClass} to the scenario \`agents\` list` - ); - } - - index = nextAgent.index; - agent = nextAgent.agent; - - this.removePendingAgent(agent); - - if (content) { - const message = - typeof content === "string" - ? ({ - role: role === AgentRole.USER ? 
"user" : "assistant", - content, - } as ModelMessage) - : content; - this.state.addMessage(message); - this.broadcastMessage(message, index); - - return null; - } - - await this.callAgent(index, role, judgmentRequest); - - // The result may have been set by callAgent if the agent made a decision - return this.result ?? null; - } /** * Resets the scenario execution to its initial state. * - * This method is called at the beginning of each execution to ensure a clean - * state. It creates a new execution state, initializes agents, sets up the - * first turn, and clears any pending messages or partial results. - * - * The reset process: - * - Creates a new ScenarioExecutionState with the current config - * - Sets up the thread ID (generates new one if not provided) - * - Initializes all agents - * - Starts the first turn - * - Records the start time for performance tracking - * - Clears any pending messages - * - Clears the result from any previous execution + * Called at execution start to ensure clean state. */ private reset(): void { this.state = new ScenarioExecutionState(this.config); - this.state.threadId = this.config.threadId || generateThreadId(); - this.setAgents(this.config.agents); - this.newTurn(); - this.state.currentTurn = 0; this.totalStartTime = Date.now(); - this.pendingMessages.clear(); - this._result = undefined; - } - - private nextAgentForRole(role: AgentRole): { - idx: number; - agent: AgentAdapter | null; - } { - for (const agent of this.agents) { - if ( - agent.role === role && - this.pendingAgentsOnTurn.has(agent) && - this.pendingRolesOnTurn.includes(role) - ) { - return { idx: this.agents.indexOf(agent), agent }; - } - } - - return { idx: -1, agent: null }; - } - - /** - * Starts a new turn in the scenario execution. - * - * This method is called when transitioning to a new turn. It resets the pending - * agents and roles for the turn, allowing all agents to participate again in - * the new turn. 
The turn counter is incremented to track the current turn number. - * - * A turn represents a cycle where agents can take actions. Each turn can involve - * multiple agent interactions as agents respond to each other's messages. - */ - private newTurn(): void { - this.pendingAgentsOnTurn = new Set(this.agents); - this.pendingRolesOnTurn = [ - AgentRole.USER, - AgentRole.AGENT, - AgentRole.JUDGE, - ]; - - if (this.state.currentTurn === null) { - this.state.currentTurn = 1; - } else { - this.state.currentTurn++; - } - } - - private removePendingRole(role: AgentRole): void { - const index = this.pendingRolesOnTurn.indexOf(role); - if (index > -1) { - this.pendingRolesOnTurn.splice(index, 1); - } - } - - private removePendingAgent(agent: AgentAdapter): void { - this.pendingAgentsOnTurn.delete(agent); - } - - private getNextAgentForRole( - role: AgentRole - ): { index: number; agent: AgentAdapter } | null { - for (let i = 0; i < this.agents.length; i++) { - const agent = this.agents[i]; - if (agent.role === role && this.pendingAgentsOnTurn.has(agent)) { - return { index: i, agent }; - } - } - return null; - } - - private setAgents(agents: AgentAdapter[]): void { - this.agents = agents; - this.agentTimes.clear(); - } - - private consumeUntilRole(role: AgentRole): void { - while (this.pendingRolesOnTurn.length > 0) { - const nextRole = this.pendingRolesOnTurn[0]; - if (nextRole === role) break; - this.pendingRolesOnTurn.pop(); - } - } - - /** - * Creates a failure result when the maximum number of turns is reached. - * - * This method is called when the scenario execution reaches the maximum number - * of turns without reaching a conclusion. It creates a failure result with - * appropriate reasoning and includes performance metrics, then sets it on this.result. 
- * - * The result includes: - * - All messages from the conversation - * - Failure reasoning explaining the turn limit was reached - * - Empty met criteria (since no conclusion was reached) - * - All judge criteria as unmet (since no evaluation was completed) - * - Total execution time and agent response times - * - * @param errorMessage - Optional custom error message to use instead of the default - */ - private reachedMaxTurns(errorMessage?: string): void { - this.setResult({ - success: false, - reasoning: - errorMessage || - `Reached maximum turns (${ - this.config.maxTurns || 10 - }) without conclusion`, - metCriteria: [], - unmetCriteria: this.getJudgeAgent()?.criteria ?? [], - }); - } - - private getJudgeAgent(): JudgeAgentAdapter | null { - return ( - this.agents.find((agent) => agent instanceof JudgeAgentAdapter) ?? null - ); } /** @@ -1097,168 +529,20 @@ export class ScenarioExecution implements ScenarioExecutionLike { this.eventSubject.complete(); } - /** - * Distributes a message to all other agents in the scenario. - * - * This method implements the message broadcasting system that allows agents to - * "hear" messages from other agents. When an agent sends a message, it needs to - * be distributed to all other agents so they can respond appropriately. - * - * The broadcasting process: - * 1. Iterates through all agents in the scenario - * 2. Skips the agent that sent the message (to avoid echo) - * 3. Adds the message to each agent's pending message queue - * 4. Agents will receive these messages when they're called next - * - * This creates a realistic conversation environment where agents can see - * the full conversation history and respond contextually. 
- * - * @param message - The message to broadcast to all other agents - * @param fromAgentIdx - The index of the agent that sent the message (to avoid echoing back to sender) - * - * @example - * ```typescript - * // When agent 0 sends a message, it gets broadcast to agents 1 and 2 - * execution.broadcastMessage( - * { role: "user", content: "Hello" }, - * 0 // fromAgentIdx - * ); - * // Now agents 1 and 2 have this message in their pendingMessages queue - * ``` - */ - private broadcastMessage(message: ModelMessage, fromAgentIdx?: number): void { - for (let idx = 0; idx < this.agents.length; idx++) { - if (idx === fromAgentIdx) continue; - - if (!this.pendingMessages.has(idx)) { - this.pendingMessages.set(idx, []); - } - this.pendingMessages.get(idx)!.push(message); - } - } - - /** - * Executes a single script step with proper error handling and logging. - * - * This method is responsible for executing each script step function with - * comprehensive error handling and logging. It provides the execution context - * to the script step and handles any errors that occur during execution. - * - * The method: - * - Logs the start of script step execution - * - Calls the script step function with the current state and execution context - * - Logs the completion of the script step - * - Handles and logs any errors that occur - * - Re-throws errors to maintain the original error context - * - * @param scriptStep - The script step function to execute (user, agent, judge, etc.) 
- * @param stepIndex - The index of the script step for logging and debugging context - * @returns The result of the script step execution (void, ScenarioResult, or null) - * @throws Error if the script step throws an error (preserves original error) - */ - private async executeScriptStep( - scriptStep: ScriptStep, - stepIndex: number - ): Promise { - const functionString = scriptStep.toString(); - - try { - this.logger.debug( - `[${this.config.id}] Executing script step ${stepIndex + 1}`, - { - stepIndex, - function: functionString, - } - ); - - const result = await scriptStep(this.state, this); - - this.logger.debug( - `[${this.config.id}] Script step ${stepIndex + 1} completed`, - { - stepIndex, - hasResult: result !== null && result !== undefined, - resultType: typeof result, - } - ); - - return result; - } catch (error) { - const errorMessage = - error instanceof Error ? error.message : String(error); - - this.logger.error( - `[${this.config.id}] Script step ${stepIndex + 1} failed`, - { - stepIndex, - error: errorMessage, - function: functionString, - } - ); - - // Re-throw the error in case it was a vitest assertion error - throw error; + private handleResponse( + response: AgentReturnTypes, + role: "user" | "assistant" + ): void { + const messages = ScenarioExecutionUtils.convertAgentReturnTypesToMessages( + response, + role + ); + for (const message of messages) { + this.state.addMessage(message); } } -} - -/** - * Converts agent return types to ModelMessage format. - * - * This utility function handles the various return types that agents can return - * and converts them to a standardized ModelMessage format. 
Agents can return: - * - A string (converted to a message with the specified role) - * - An array of ModelMessage objects (returned as-is) - * - A single ModelMessage object (wrapped in an array) - * - Any other type (returns empty array) - * - * @param response - The response from an agent (string, ModelMessage, or array of ModelMessage) - * @param role - The role to assign if the response is a string ("user" or "assistant") - * @returns An array of ModelMessage objects - */ -function convertAgentReturnTypesToMessages( - response: AgentReturnTypes, - role: "user" | "assistant" -): ModelMessage[] { - if (typeof response === "string") - return [{ role, content: response } as ModelMessage]; - - if (Array.isArray(response)) return response; - - if (response && typeof response === "object" && "role" in response) - return [response]; - return []; -} - -/** - * Extracts structured error information for logging and reporting. - * - * This function takes any thrown error (unknown type) and returns an object - * containing the error's name, message, and stack trace if available. - * If the input is not an instance of Error, it provides a generic name and - * stringified value for message. - * - * @param error - The error object or value to extract information from. - * @returns An object with 'name', optional 'message', and optional 'stack' properties. - */ -function extractErrorInfo(error: unknown): { - name: string; - message?: string; - stack?: string; -} { - // Extracts error information in a structured way for logging and reporting. - // Returns an object with name, message, and stack if available. - if (error instanceof Error) { - return { - name: error.name, - message: error.message, - stack: error.stack, - }; + private findAgent(role: AgentRole): AgentAdapter | undefined { + return this.config.agents.find((a) => a.role === role); } - // If not an Error instance, provide a generic name and stringified value. 
- return { - name: typeof error, - message: String(error), - }; } diff --git a/javascript/src/execution/scenario-execution.utils.ts b/javascript/src/execution/scenario-execution.utils.ts new file mode 100644 index 00000000..b3ce4f02 --- /dev/null +++ b/javascript/src/execution/scenario-execution.utils.ts @@ -0,0 +1,68 @@ +import { ModelMessage } from "ai"; +import { AgentReturnTypes } from "../domain"; + +/** + * Utility functions for scenario execution. + */ +export const ScenarioExecutionUtils = { + /** + * Converts agent return types to ModelMessage format. + * + * This utility function handles the various return types that agents can return + * and converts them to a standardized ModelMessage format. Agents can return: + * - A string (converted to a message with the specified role) + * - An array of ModelMessage objects (returned as-is) + * - A single ModelMessage object (wrapped in an array) + * - Any other type (returns empty array) + * + * @param response - The response from an agent (string, ModelMessage, or array of ModelMessage) + * @param role - The role to assign if the response is a string ("user" or "assistant") + * @returns An array of ModelMessage objects + */ + convertAgentReturnTypesToMessages( + response: AgentReturnTypes, + role: "user" | "assistant" + ): ModelMessage[] { + if (typeof response === "string") + return [{ role, content: response } as ModelMessage]; + + if (Array.isArray(response)) return response; + + if (response && typeof response === "object" && "role" in response) + return [response]; + + return []; + }, + + /** + * Extracts structured error information for logging and reporting. + * + * This function takes any thrown error (unknown type) and returns an object + * containing the error's name, message, and stack trace if available. + * If the input is not an instance of Error, it provides a generic name and + * stringified value for message. + * + * @param error - The error object or value to extract information from. 
+ * @returns An object with 'name', optional 'message', and optional 'stack' properties. + */ + extractErrorInfo(error: unknown): { + name: string; + message?: string; + stack?: string; + } { + // Extracts error information in a structured way for logging and reporting. + // Returns an object with name, message, and stack if available. + if (error instanceof Error) { + return { + name: error.name, + message: error.message, + stack: error.stack, + }; + } + // If not an Error instance, provide a generic name and stringified value. + return { + name: typeof error, + message: String(error), + }; + }, +}; diff --git a/javascript/src/runner/run.ts b/javascript/src/runner/run.ts index 542a7285..8712658c 100644 --- a/javascript/src/runner/run.ts +++ b/javascript/src/runner/run.ts @@ -95,7 +95,7 @@ export async function run(cfg: ScenarioConfig): Promise { cfg.threadId = generateThreadId(); } - const steps = cfg.script || [proceed()]; + const steps = cfg.script || []; const execution = new ScenarioExecution(cfg, steps); let eventBus: EventBus | null = null;