diff --git a/CLAUDE.md b/CLAUDE.md index 0113e0d..fd8cecb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,14 +6,14 @@ This file provides guidance to Claude Code when working with this repository. This project has Serena configured. **You MUST follow these rules:** -| Instead of... | USE THIS | Cost | -| -------------------------- | ------------------------------------ | -------- | -| Built-in `Grep`, `grep` | `rg "pattern"` | **FREE** | -| Built-in `Edit` tool | Morph `edit_file` | **FREE** | -| Reading entire files | Serena `get_symbols_overview` | **FREE** | -| Searching for symbols | Serena `find_symbol` | **FREE** | -| Finding usages | Serena `find_referencing_symbols` | **FREE** | -| Semantic/fuzzy search | `warpgrep_codebase_search` | **$$$** | +| Instead of... | USE THIS | Cost | +| ----------------------- | --------------------------------- | -------- | +| Built-in `Grep`, `grep` | `rg "pattern"` | **FREE** | +| Built-in `Edit` tool | Morph `edit_file` | **FREE** | +| Reading entire files | Serena `get_symbols_overview` | **FREE** | +| Searching for symbols | Serena `find_symbol` | **FREE** | +| Finding usages | Serena `find_referencing_symbols` | **FREE** | +| Semantic/fuzzy search | `warpgrep_codebase_search` | **$$$** | > **FREE tools first. `warpgrep` costs real money - only use when `rg` and Serena cannot answer the question.** diff --git a/src/cli/commands/execute.ts b/src/cli/commands/execute.ts index 10b473a..95a7746 100644 --- a/src/cli/commands/execute.ts +++ b/src/cli/commands/execute.ts @@ -76,12 +76,12 @@ export function registerExecuteCommand(program: Command): void { writeJson(`${resultsDir}/scenarios.json`, generation.scenarios); // Stage 3: Execution - const execution = await runExecution( + const execution = await runExecution({ analysis, - generation.scenarios, + scenarios: generation.scenarios, config, - consoleProgress, - ); + progress: consoleProgress, + }); state = updateStateAfterExecution(state, execution.results); await saveState(state); diff --git a/src/cli/commands/resume.ts b/src/cli/commands/resume.ts index 72402cb..c11e1f2 100644 --- a/src/cli/commands/resume.ts +++ b/src/cli/commands/resume.ts @@ -58,20 +58,20 @@ async function resumeFromAnalysis( const generation = await runGeneration(analysis, config); writeJson(`${resultsDir}/scenarios.json`, generation.scenarios); - const execution = await runExecution( + const execution = await runExecution({ analysis, - generation.scenarios, + scenarios: generation.scenarios, config, - consoleProgress, - ); + progress: consoleProgress, + }); - const evaluation = await runEvaluation( - analysis.plugin_name, - generation.scenarios, - execution.results, + const evaluation = await runEvaluation({ + pluginName: analysis.plugin_name, + scenarios: generation.scenarios, + executions: execution.results, config, - consoleProgress, - ); + progress: consoleProgress, + }); // Chain state updates let currentState = updateStateAfterAnalysis(initialState, analysis); @@ -107,20 +107,20 @@ async function resumeFromGeneration( const generation = await runGeneration(analysisData, config); writeJson(`${resultsDir}/scenarios.json`, generation.scenarios); - const execution = await runExecution( - analysisData, - generation.scenarios, + const execution = await runExecution({ + analysis: analysisData, + scenarios: generation.scenarios, config, - consoleProgress, - ); + progress: consoleProgress, + }); - const evaluation = await runEvaluation( - analysisData.plugin_name, - generation.scenarios, - execution.results, + const evaluation = await runEvaluation({ + pluginName: analysisData.plugin_name, + scenarios: generation.scenarios, + executions: execution.results, config, - consoleProgress, - ); + progress: consoleProgress, + }); // Chain state updates let currentState = updateStateAfterGeneration( @@ -158,20 +158,20 @@ async function resumeFromExecution( ); } - const execution = await runExecution( - analysisData, - scenarioData, + const execution = await runExecution({ + analysis: analysisData, + scenarios: scenarioData, config, - consoleProgress, - ); + progress: consoleProgress, + }); - const evaluation = await runEvaluation( - analysisData.plugin_name, - scenarioData, - execution.results, + const evaluation = await runEvaluation({ + pluginName: analysisData.plugin_name, + scenarios: scenarioData, + executions: execution.results, config, - consoleProgress, - ); + progress: consoleProgress, + }); // Chain state updates let currentState = updateStateAfterExecution(initialState, execution.results); @@ -206,13 +206,13 @@ async function resumeFromEvaluation( ); } - const evaluation = await runEvaluation( - analysisData.plugin_name, - scenarioData, - executionData, + const evaluation = await runEvaluation({ + pluginName: analysisData.plugin_name, + scenarios: scenarioData, + executions: executionData, config, - consoleProgress, - ); + progress: consoleProgress, + }); // Chain state updates let currentState = updateStateAfterEvaluation( diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts index 7e1a3bf..2e59bae 100644 --- a/src/cli/commands/run.ts +++ b/src/cli/commands/run.ts @@ -145,25 +145,25 @@ export function registerRunCommand(program: Command): void { } // Stage 3: Execution - const execution = await runExecution( + const execution = await runExecution({ analysis, - scenariosToRun, + scenarios: scenariosToRun, config, - consoleProgress, - ); + progress: consoleProgress, + }); state = updateStateAfterExecution(state, execution.results); await saveState(state); writeExecutionMetadata(resultsDir, execution); // Stage 4: Evaluation - const evaluation = await runEvaluation( - analysis.plugin_name, - scenariosToRun, - execution.results, + const evaluation = await runEvaluation({ + pluginName: analysis.plugin_name, + scenarios: scenariosToRun, + executions: execution.results, config, - consoleProgress, - ); + progress: consoleProgress, + }); state = updateStateAfterEvaluation(state, evaluation.results); await saveState(state); diff --git a/src/stages/2-generation/agent-scenario-generator.ts b/src/stages/2-generation/agent-scenario-generator.ts index 94b1f30..67cab31 100644 --- a/src/stages/2-generation/agent-scenario-generator.ts +++ b/src/stages/2-generation/agent-scenario-generator.ts @@ -306,25 +306,35 @@ export async function generateAgentScenarios( return parseAgentScenarioResponse(response, agent); } +/** + * Options for generateAllAgentScenarios. + */ +export interface GenerateAllAgentScenariosOptions { + /** Anthropic client */ + client: Anthropic; + /** Array of agent components */ + agents: AgentComponent[]; + /** Generation config */ + config: GenerationConfig; + /** Optional progress callback */ + onProgress?: (completed: number, total: number, agent: string) => void; + /** Maximum concurrent LLM calls (defaults to 10) */ + maxConcurrent?: number; +} + /** * Generate scenarios for all agents. * * Uses parallel execution with optional rate limiting. * - * @param client - Anthropic client - * @param agents - Array of agent components - * @param config - Generation config - * @param onProgress - Optional progress callback - * @param maxConcurrent - Maximum concurrent LLM calls (defaults to 10) + * @param options - Generate all agent scenarios options * @returns Array of all test scenarios */ export async function generateAllAgentScenarios( - client: Anthropic, - agents: AgentComponent[], - config: GenerationConfig, - onProgress?: (completed: number, total: number, agent: string) => void, - maxConcurrent = 10, + options: GenerateAllAgentScenariosOptions, ): Promise { + const { client, agents, config, onProgress, maxConcurrent = 10 } = options; + // Create rate limiter if configured const rateLimiter = setupRateLimiter(config); diff --git a/src/stages/2-generation/cost-estimator.ts b/src/stages/2-generation/cost-estimator.ts index e9c6dcc..810bf4f 100644 --- a/src/stages/2-generation/cost-estimator.ts +++ b/src/stages/2-generation/cost-estimator.ts @@ -27,6 +27,40 @@ import type { TextBlockParam } from "@anthropic-ai/sdk/resources/messages/messag */ export type SystemPrompt = string | TextBlockParam[]; +/** + * Options for countPromptTokens. + */ +export interface CountPromptTokensOptions { + /** Anthropic client */ + client: Anthropic; + /** Model to use */ + model: string; + /** Prompt text */ + prompt: string; + /** Optional per-request timeout in milliseconds (default: 30000 = 30s) */ + timeout?: number; + /** Optional system prompt (string or array of text blocks for caching) */ + system?: SystemPrompt; +} + +/** + * Options for estimateGenerationCost. + */ +export interface EstimateGenerationCostOptions { + /** Anthropic client */ + client: Anthropic; + /** Prompts to estimate */ + prompts: string[]; + /** Model to use */ + model: string; + /** Maximum concurrent token counting operations (default: 10) */ + concurrency?: number; + /** Timeout for each token counting request in ms (default: 30000) */ + tokenCountingTimeout?: number; + /** Optional system prompt to include in token count */ + system?: SystemPrompt; +} + /** * Default SDK timeout in milliseconds (2 minutes). * This is a conservative default for client-level timeout. @@ -74,12 +108,9 @@ export function createAnthropicClient(timeout?: number): Anthropic { * @returns Token count */ export async function countPromptTokens( - client: Anthropic, - model: string, - prompt: string, - timeout?: number, - system?: SystemPrompt, + options: CountPromptTokensOptions, ): Promise { + const { client, model, prompt, timeout, system } = options; const result = await client.messages.countTokens( { model: resolveModelId(model), @@ -111,20 +142,32 @@ const DEFAULT_TOKEN_COUNTING_CONCURRENCY = 10; * @returns Token estimate */ export async function estimateGenerationCost( - client: Anthropic, - prompts: string[], - model: string, - concurrency: number = DEFAULT_TOKEN_COUNTING_CONCURRENCY, - tokenCountingTimeout?: number, - system?: SystemPrompt, + options: EstimateGenerationCostOptions, ): Promise { + const { + client, + prompts, + model, + concurrency = DEFAULT_TOKEN_COUNTING_CONCURRENCY, + tokenCountingTimeout, + system, + } = options; + // Count tokens in parallel for improved performance // Use continueOnError: false to fail fast - partial results would underestimate costs const result = await parallel({ items: prompts, concurrency, fn: async (prompt) => - countPromptTokens(client, model, prompt, tokenCountingTimeout, system), + countPromptTokens({ + client, + model, + prompt, + ...(tokenCountingTimeout !== undefined && { + timeout: tokenCountingTimeout, + }), + ...(system !== undefined && { system }), + }), continueOnError: false, }); const totalInputTokens = result.results.reduce((a, b) => a + b, 0); diff --git a/src/stages/2-generation/diversity-manager.ts b/src/stages/2-generation/diversity-manager.ts index 44bb8de..e02934a 100644 --- a/src/stages/2-generation/diversity-manager.ts +++ b/src/stages/2-generation/diversity-manager.ts @@ -58,23 +58,33 @@ export function calculateScenarioDistribution( }; } +/** + * Options for createBaseScenario. + */ +export interface CreateBaseScenarioOptions { + /** Component reference (name) */ + componentRef: string; + /** Type of component */ + componentType: ComponentType; + /** The triggering mechanism to preserve */ + coreIntent: string; + /** Original prompt */ + basePrompt: string; + /** Index for ID generation */ + index: number; +} + /** * Create a base scenario from a component. * - * @param componentRef - Component reference (name) - * @param componentType - Type of component - * @param coreIntent - The triggering mechanism to preserve - * @param basePrompt - Original prompt - * @param index - Index for ID generation + * @param options - Create base scenario options * @returns Base scenario */ export function createBaseScenario( - componentRef: string, - componentType: ComponentType, - coreIntent: string, - basePrompt: string, - index: number, + options: CreateBaseScenarioOptions, ): BaseScenario { + const { componentRef, componentType, coreIntent, basePrompt, index } = + options; return { id: `${componentRef}-base-${String(index)}`, component_ref: componentRef, diff --git a/src/stages/2-generation/index.ts b/src/stages/2-generation/index.ts index 3dbaa3a..8ea5515 100644 --- a/src/stages/2-generation/index.ts +++ b/src/stages/2-generation/index.ts @@ -201,13 +201,13 @@ export async function runGeneration( componentType: "skills", components: analysis.components.skills, generator: async (onProgress) => - generateAllSkillScenarios( + generateAllSkillScenarios({ client, - analysis.components.skills, - config.generation, + skills: analysis.components.skills, + config: config.generation, onProgress, - config.max_concurrent, - ), + maxConcurrent: config.max_concurrent, + }), createFallback: createFallbackSkillScenarios, onProgress, }); @@ -220,13 +220,13 @@ export async function runGeneration( componentType: "agents", components: analysis.components.agents, generator: async (onProgress) => - generateAllAgentScenarios( + generateAllAgentScenarios({ client, - analysis.components.agents, - config.generation, + agents: analysis.components.agents, + config: config.generation, onProgress, - config.max_concurrent, - ), + maxConcurrent: config.max_concurrent, + }), createFallback: createFallbackAgentScenarios, onProgress, }); diff --git a/src/stages/2-generation/skill-scenario-generator.ts b/src/stages/2-generation/skill-scenario-generator.ts index bb1985c..765549a 100644 --- a/src/stages/2-generation/skill-scenario-generator.ts +++ b/src/stages/2-generation/skill-scenario-generator.ts @@ -192,25 +192,35 @@ export async function generateSkillScenarios( return parseSkillScenarioResponse(response, skill); } +/** + * Options for generateAllSkillScenarios. + */ +export interface GenerateAllSkillScenariosOptions { + /** Anthropic client */ + client: Anthropic; + /** Array of skill components */ + skills: SkillComponent[]; + /** Generation config */ + config: GenerationConfig; + /** Optional progress callback */ + onProgress?: (completed: number, total: number, skill: string) => void; + /** Maximum concurrent LLM calls (defaults to 10) */ + maxConcurrent?: number; +} + /** * Generate scenarios for all skills. * * Uses parallel execution with optional rate limiting. * - * @param client - Anthropic client - * @param skills - Array of skill components - * @param config - Generation config - * @param onProgress - Optional progress callback - * @param maxConcurrent - Maximum concurrent LLM calls (defaults to 10) + * @param options - Generate all skill scenarios options * @returns Array of all test scenarios */ export async function generateAllSkillScenarios( - client: Anthropic, - skills: SkillComponent[], - config: GenerationConfig, - onProgress?: (completed: number, total: number, skill: string) => void, - maxConcurrent = 10, + options: GenerateAllSkillScenariosOptions, ): Promise { + const { client, skills, config, onProgress, maxConcurrent = 10 } = options; + // Create rate limiter if configured const rateLimiter = setupRateLimiter(config); diff --git a/src/stages/3-execution/index.ts b/src/stages/3-execution/index.ts index 3f2ec5b..66d0b08 100644 --- a/src/stages/3-execution/index.ts +++ b/src/stages/3-execution/index.ts @@ -71,6 +71,22 @@ export interface ExecutionOutput { total_tools_captured: number; } +/** + * Options for runExecution. + */ +export interface RunExecutionOptions { + /** Output from Stage 1 (plugin analysis) */ + analysis: AnalysisOutput; + /** Output from Stage 2 (test scenarios) */ + scenarios: TestScenario[]; + /** Evaluation configuration */ + config: EvalConfig; + /** Optional progress callbacks */ + progress?: ProgressCallbacks; + /** Optional query function (for testing) */ + queryFn?: QueryFunction; +} + /** * Run Stage 3: Execution. * @@ -99,12 +115,16 @@ export interface ExecutionOutput { * ``` */ export async function runExecution( - analysis: AnalysisOutput, - scenarios: TestScenario[], - config: EvalConfig, - progress: ProgressCallbacks = consoleProgress, - queryFn?: QueryFunction, + options: RunExecutionOptions, ): Promise { + const { + analysis, + scenarios, + config, + progress = consoleProgress, + queryFn, + } = options; + logger.stageHeader("Stage 3: Execution", scenarios.length); const pluginPath = config.plugin.path; diff --git a/src/stages/3-execution/plugin-loader.ts b/src/stages/3-execution/plugin-loader.ts index 3092631..118b2ca 100644 --- a/src/stages/3-execution/plugin-loader.ts +++ b/src/stages/3-execution/plugin-loader.ts @@ -94,14 +94,22 @@ export interface PluginLoaderOptions { enableMcpDiscovery?: boolean | undefined; } +/** + * Options for buildPluginQueryInput. + */ +interface BuildPluginQueryInputOptions { + pluginPath: string; + config: ExecutionConfig; + settingSources: SettingSource[]; + controller: AbortController; + startTime: number; +} + /** Build query input for plugin verification */ function buildPluginQueryInput( - pluginPath: string, - config: ExecutionConfig, - settingSources: SettingSource[], - controller: AbortController, - startTime: number, + options: BuildPluginQueryInputOptions, ): QueryInput { + const { pluginPath, config, settingSources, controller, startTime } = options; return { prompt: "Plugin initialization check - respond with OK", options: { @@ -223,13 +231,13 @@ export async function verifyPluginLoad( const settingSources: SettingSource[] = enableMcpDiscovery ? ["project"] : []; try { - const queryInput = buildPluginQueryInput( + const queryInput = buildPluginQueryInput({ pluginPath, config, settingSources, controller, startTime, - ); + }); const q = queryFn ? queryFn(queryInput) : executeQuery(queryInput); return await processQueryMessages(q, pluginPath, timings); diff --git a/src/stages/3-execution/progress-reporters.ts b/src/stages/3-execution/progress-reporters.ts index 9faa420..0c72443 100644 --- a/src/stages/3-execution/progress-reporters.ts +++ b/src/stages/3-execution/progress-reporters.ts @@ -29,12 +29,12 @@ import type { OutputConfig, ProgressCallbacks } from "../../types/index.js"; * * @example * ```typescript - * const output = await runExecution( + * const output = await runExecution({ * analysis, * scenarios, * config, - * consoleProgress - * ); + * progress: consoleProgress, + * }); * ``` */ export const consoleProgress: ProgressCallbacks = { @@ -64,12 +64,12 @@ export const consoleProgress: ProgressCallbacks = { * * @example * ```typescript - * const output = await runExecution( + * const output = await runExecution({ * analysis, * scenarios, * config, - * verboseProgress - * ); + * progress: verboseProgress, + * }); * ``` */ export const verboseProgress: ProgressCallbacks = { @@ -106,12 +106,12 @@ export const verboseProgress: ProgressCallbacks = { * * @example * ```typescript - * const output = await runExecution( + * const output = await runExecution({ * analysis, * scenarios, * config, - * silentProgress - * ); + * progress: silentProgress, + * }); * ``` */ export const silentProgress: ProgressCallbacks = { @@ -126,12 +126,12 @@ export const silentProgress: ProgressCallbacks = { * * @example * ```typescript - * const output = await runExecution( + * const output = await runExecution({ * analysis, * scenarios, * config, - * jsonProgress - * ); + * progress: jsonProgress, + * }); * ``` */ export const jsonProgress: ProgressCallbacks = { diff --git a/src/stages/4-evaluation/aggregation/scenario-results.ts b/src/stages/4-evaluation/aggregation/scenario-results.ts index 5da8d40..2755d25 100644 --- a/src/stages/4-evaluation/aggregation/scenario-results.ts +++ b/src/stages/4-evaluation/aggregation/scenario-results.ts @@ -17,25 +17,42 @@ import type { calculateConflictSeverity } from "../conflict-tracker.js"; import type { ProgrammaticResult, ScenarioEvaluationResult } from "./types.js"; import type { getUniqueDetections } from "../detection/index.js"; +/** + * Options for buildEvaluationResult. + */ +export interface BuildEvaluationResultOptions { + /** Test scenario being evaluated */ + scenario: TestScenario; + /** Whether component was triggered */ + triggered: boolean; + /** Unique component detections */ + uniqueDetections: ReturnType; + /** Conflict analysis result */ + conflictAnalysis: ReturnType; + /** LLM judgment result (null if programmatic-only) */ + judgment: MultiSampleResult | null; + /** How the trigger was detected */ + detectionSource: DetectionSource; +} + /** * Build the evaluation result object. * - * @param scenario - Test scenario being evaluated - * @param triggered - Whether component was triggered - * @param uniqueDetections - Unique component detections - * @param conflictAnalysis - Conflict analysis result - * @param judgment - LLM judgment result (null if programmatic-only) - * @param detectionSource - How the trigger was detected + * @param options - Build evaluation result options * @returns Complete evaluation result */ export function buildEvaluationResult( - scenario: TestScenario, - triggered: boolean, - uniqueDetections: ReturnType, - conflictAnalysis: ReturnType, - judgment: MultiSampleResult | null, - detectionSource: DetectionSource, + options: BuildEvaluationResultOptions, ): EvaluationResult { + const { + scenario, + triggered, + uniqueDetections, + conflictAnalysis, + judgment, + detectionSource, + } = options; + const allTriggeredComponents: TriggeredComponent[] = uniqueDetections.map( (d) => ({ component_type: d.component_type, @@ -94,14 +111,14 @@ export function buildFinalResult( judgeStrategy, } = programmatic; - const result = buildEvaluationResult( - context.scenario, + const result = buildEvaluationResult({ + scenario: context.scenario, triggered, uniqueDetections, conflictAnalysis, judgment, - judgeStrategy.detectionSource, - ); + detectionSource: judgeStrategy.detectionSource, + }); const variance = judgment?.score_variance ?? 0; const isUnanimous = judgment?.is_unanimous ?? true; diff --git a/src/stages/4-evaluation/detection/capture-detection.ts b/src/stages/4-evaluation/detection/capture-detection.ts index 0ea061d..39a9164 100644 --- a/src/stages/4-evaluation/detection/capture-detection.ts +++ b/src/stages/4-evaluation/detection/capture-detection.ts @@ -22,16 +22,25 @@ interface ToolCallLike { input: unknown; } +/** + * Options for createDetection. + */ +interface CreateDetectionOptions { + componentType: ComponentType; + componentName: string; + toolName: string; + evidence: string; + timestamp: number; +} + /** * Create a detection from a tool call. */ function createDetection( - componentType: ComponentType, - componentName: string, - toolName: string, - evidence: string, - timestamp: number, + options: CreateDetectionOptions, ): ProgrammaticDetection { + const { componentType, componentName, toolName, evidence, timestamp } = + options; return { component_type: componentType, component_name: componentName, @@ -53,13 +62,13 @@ function processSkillTool( if (!isSkillInput(tc.input)) { return null; } - return createDetection( - "skill", - tc.input.skill, - tc.name, - `Skill tool invoked: ${tc.input.skill}${evidenceSuffix}`, + return createDetection({ + componentType: "skill", + componentName: tc.input.skill, + toolName: tc.name, + evidence: `Skill tool invoked: ${tc.input.skill}${evidenceSuffix}`, timestamp, - ); + }); } /** @@ -73,13 +82,13 @@ function processTaskTool( if (!isTaskInput(tc.input)) { return null; } - return createDetection( - "agent", - tc.input.subagent_type, - tc.name, - `Task tool invoked: ${tc.input.subagent_type}${evidenceSuffix}`, + return createDetection({ + componentType: "agent", + componentName: tc.input.subagent_type, + toolName: tc.name, + evidence: `Task tool invoked: ${tc.input.subagent_type}${evidenceSuffix}`, timestamp, - ); + }); } /** @@ -93,13 +102,13 @@ function processCommandTool( if (!isSkillInput(tc.input)) { return null; } - return createDetection( - "command", - tc.input.skill, - tc.name, - `SlashCommand invoked: ${tc.input.skill}${evidenceSuffix}`, + return createDetection({ + componentType: "command", + componentName: tc.input.skill, + toolName: tc.name, + evidence: `SlashCommand invoked: ${tc.input.skill}${evidenceSuffix}`, timestamp, - ); + }); } /** @@ -114,13 +123,13 @@ function processMcpTool( if (!parsed) { return null; } - return createDetection( - "mcp_server", - parsed.serverName, - tc.name, - `MCP tool invoked: ${tc.name} (server: ${parsed.serverName}, tool: ${parsed.toolName})${evidenceSuffix}`, + return createDetection({ + componentType: "mcp_server", + componentName: parsed.serverName, + toolName: tc.name, + evidence: `MCP tool invoked: ${tc.name} (server: ${parsed.serverName}, tool: ${parsed.toolName})${evidenceSuffix}`, timestamp, - ); + }); } /** diff --git a/src/stages/4-evaluation/detection/orchestrator.ts b/src/stages/4-evaluation/detection/orchestrator.ts index 532f0fa..cd7b6c7 100644 --- a/src/stages/4-evaluation/detection/orchestrator.ts +++ b/src/stages/4-evaluation/detection/orchestrator.ts @@ -29,6 +29,22 @@ import type { Transcript, } from "../../../types/index.js"; +/** + * Options for detectAllComponentsWithHooks. + */ +export interface DetectAllComponentsWithHooksOptions { + /** Tool captures from execution */ + captures: ToolCapture[]; + /** Execution transcript */ + transcript: Transcript; + /** Test scenario */ + scenario: TestScenario; + /** Optional hook response captures */ + hookResponses?: HookResponseCapture[]; + /** Optional subagent lifecycle captures */ + subagentCaptures?: SubagentCapture[]; +} + /** * Detect all components using all detection methods. * @@ -108,12 +124,11 @@ export function detectAllComponents( * @returns Array of all detected components including hooks, agents, and MCP servers */ export function detectAllComponentsWithHooks( - captures: ToolCapture[], - transcript: Transcript, - scenario: TestScenario, - hookResponses?: HookResponseCapture[], - subagentCaptures?: SubagentCapture[], + options: DetectAllComponentsWithHooksOptions, ): ProgrammaticDetection[] { + const { captures, transcript, scenario, hookResponses, subagentCaptures } = + options; + // Get standard component detections (now includes MCP servers) const detections = detectAllComponents(captures, transcript, scenario); diff --git a/src/stages/4-evaluation/index.ts b/src/stages/4-evaluation/index.ts index 2660c69..70e0166 100644 --- a/src/stages/4-evaluation/index.ts +++ b/src/stages/4-evaluation/index.ts @@ -69,6 +69,68 @@ import type { } from "../../types/index.js"; import type Anthropic from "@anthropic-ai/sdk"; +/** + * Sample data entry for multi-sampling metrics. + */ +interface SampleDataEntry { + scenarioId: string; + variance: number; + numSamples: number; + hasConsensus: boolean; +} + +/** + * Options for runSynchronousEvaluation. + */ +interface RunSynchronousEvaluationOptions { + /** Anthropic client */ + client: Anthropic; + /** Programmatic detection results */ + programmaticResults: ProgrammaticResult[]; + /** Evaluation configuration */ + config: EvalConfig; + /** Progress callbacks */ + progress: ProgressCallbacks; + /** Sample data array (mutated during evaluation) */ + sampleData: SampleDataEntry[]; +} + +/** + * Options for calculateAndSaveMetrics. + */ +interface CalculateAndSaveMetricsOptions { + /** Plugin name */ + pluginName: string; + /** Results with scenario and execution context */ + resultsWithContext: { + result: EvaluationResult; + scenario: TestScenario; + execution: ExecutionResult; + }[]; + /** Execution results */ + executions: ExecutionResult[]; + /** Evaluation configuration */ + config: EvalConfig; + /** Sample data for multi-sampling metrics */ + sampleData: SampleDataEntry[]; +} + +/** + * Options for runEvaluation. + */ +export interface RunEvaluationOptions { + /** Plugin name */ + pluginName: string; + /** Test scenarios */ + scenarios: TestScenario[]; + /** Execution results */ + executions: ExecutionResult[]; + /** Evaluation configuration */ + config: EvalConfig; + /** Progress callbacks */ + progress?: ProgressCallbacks; +} + /** * Output from Stage 4: Evaluation. */ @@ -127,13 +189,17 @@ function runProgrammaticDetection( // Otherwise use the simpler detectAllComponents const detections = scenario.component_type === "hook" || scenario.component_type === "agent" - ? detectAllComponentsWithHooks( - execution.detected_tools, - execution.transcript, + ? detectAllComponentsWithHooks({ + captures: execution.detected_tools, + transcript: execution.transcript, scenario, - execution.hook_responses, - execution.subagent_captures, - ) + ...(execution.hook_responses !== undefined && { + hookResponses: execution.hook_responses, + }), + ...(execution.subagent_captures !== undefined && { + subagentCaptures: execution.subagent_captures, + }), + }) : detectAllComponents( execution.detected_tools, execution.transcript, @@ -256,17 +322,9 @@ async function runBatchedEvaluation( * Run synchronous LLM evaluation (original behavior). */ async function runSynchronousEvaluation( - client: Anthropic, - programmaticResults: ProgrammaticResult[], - config: EvalConfig, - progress: ProgressCallbacks, - sampleData: { - scenarioId: string; - variance: number; - numSamples: number; - hasConsensus: boolean; - }[], + options: RunSynchronousEvaluationOptions, ): Promise { + const { client, programmaticResults, config, progress, sampleData } = options; const evalConfig = config.evaluation; const parallelResult = await parallel< @@ -280,13 +338,13 @@ async function runSynchronousEvaluation( if (pr.judgeStrategy.needsLLMJudge) { try { - judgment = await runJudgment( + judgment = await runJudgment({ client, - pr.context.scenario, - pr.context.execution.transcript, - pr.uniqueDetections, - evalConfig, - ); + scenario: pr.context.scenario, + transcript: pr.context.execution.transcript, + programmaticResult: pr.uniqueDetections, + config: evalConfig, + }); } catch (err) { const errorResponse = createErrorJudgeResponse( formatErrorWithRequestId(err), @@ -348,21 +406,11 @@ async function runSynchronousEvaluation( * @returns Calculated metrics */ async function calculateAndSaveMetrics( - pluginName: string, - resultsWithContext: { - result: EvaluationResult; - scenario: TestScenario; - execution: ExecutionResult; - }[], - executions: ExecutionResult[], - config: EvalConfig, - sampleData: { - scenarioId: string; - variance: number; - numSamples: number; - hasConsensus: boolean; - }[], + options: CalculateAndSaveMetricsOptions, ): Promise { + const { pluginName, resultsWithContext, executions, config, sampleData } = + options; + // Build metrics options const metricsOptions: { numSamples?: number; @@ -406,12 +454,10 @@ async function calculateAndSaveMetrics( * @returns Evaluation output */ export async function runEvaluation( - pluginName: string, - scenarios: TestScenario[], - executions: ExecutionResult[], - config: EvalConfig, - progress: ProgressCallbacks = {}, + options: RunEvaluationOptions, ): Promise { + const { pluginName, scenarios, executions, config, progress = {} } = options; + logger.stageHeader("Stage 4: Evaluation", executions.length); const startTime = Date.now(); @@ -470,12 +516,7 @@ export async function runEvaluation( }); // Track sample data for metrics - const sampleData: { - scenarioId: string; - variance: number; - numSamples: number; - hasConsensus: boolean; - }[] = []; + const sampleData: SampleDataEntry[] = []; let evalResults: ScenarioEvaluationResult[]; @@ -506,13 +547,13 @@ export async function runEvaluation( ); // Phase 2b: Run synchronous LLM evaluation - evalResults = await runSynchronousEvaluation( + evalResults = await runSynchronousEvaluation({ client, programmaticResults, config, progress, sampleData, - ); + }); } const results = evalResults.map((r) => r.result); @@ -536,13 +577,13 @@ export async function runEvaluation( }); // Calculate metrics and save results - const metrics = await calculateAndSaveMetrics( + const metrics = await calculateAndSaveMetrics({ pluginName, resultsWithContext, executions, config, sampleData, - ); + }); const totalDuration = Date.now() - startTime; diff --git a/src/stages/4-evaluation/llm-judge.ts b/src/stages/4-evaluation/llm-judge.ts index f14b19a..2c77700 100644 --- a/src/stages/4-evaluation/llm-judge.ts +++ b/src/stages/4-evaluation/llm-judge.ts @@ -24,6 +24,22 @@ import type { } from "../../types/index.js"; import type Anthropic from "@anthropic-ai/sdk"; +/** + * Options for LLM judge evaluation functions. + */ +export interface EvaluateJudgeOptions { + /** Anthropic client */ + client: Anthropic; + /** Test scenario being evaluated */ + scenario: TestScenario; + /** Execution transcript */ + transcript: Transcript; + /** Programmatic detection results */ + programmaticResult: ProgrammaticDetection[]; + /** Evaluation configuration */ + config: EvaluationConfig; +} + /** * Judge response schema for structured output. * @@ -268,12 +284,10 @@ export function buildJudgePrompt( * ``` */ export async function evaluateWithLLMJudge( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, + options: EvaluateJudgeOptions, ): Promise { + const { client, scenario, transcript, programmaticResult, config } = options; + const userPrompt = buildJudgePrompt( scenario, transcript, @@ -336,30 +350,14 @@ export async function evaluateWithLLMJudge( * @returns Judge response */ export async function evaluateWithFallback( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, + options: EvaluateJudgeOptions, ): Promise { try { // Try structured output first - return await evaluateWithLLMJudge( - client, - scenario, - transcript, - programmaticResult, - config, - ); + return await evaluateWithLLMJudge(options); } catch { // Fallback to regular JSON parsing - return evaluateWithJsonFallback( - client, - scenario, - transcript, - programmaticResult, - config, - ); + return evaluateWithJsonFallback(options); } } @@ -395,12 +393,10 @@ No markdown, no explanation - just the JSON.`; * @returns Judge response */ async function evaluateWithJsonFallback( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, + options: EvaluateJudgeOptions, ): Promise { + const { client, scenario, transcript, programmaticResult, config } = options; + const userPrompt = buildJudgePrompt( scenario, transcript, diff --git a/src/stages/4-evaluation/multi-sampler.ts b/src/stages/4-evaluation/multi-sampler.ts index c7ab7df..b0cac6e 100644 --- a/src/stages/4-evaluation/multi-sampler.ts +++ b/src/stages/4-evaluation/multi-sampler.ts @@ -27,6 +27,30 @@ import type { } from "../../types/index.js"; import type Anthropic from "@anthropic-ai/sdk"; +/** + * Options for multi-sample evaluation functions. + */ +export interface EvaluateMultiSampleOptions { + /** Anthropic client */ + client: Anthropic; + /** Test scenario being evaluated */ + scenario: TestScenario; + /** Execution transcript */ + transcript: Transcript; + /** Programmatic detection results */ + programmaticResult: ProgrammaticDetection[]; + /** Evaluation configuration */ + config: EvaluationConfig; +} + +/** + * Options for evaluateWithMultiSampling. + */ +export interface EvaluateWithMultiSamplingOptions extends EvaluateMultiSampleOptions { + /** Maximum concurrent samples (default: 10) */ + maxConcurrent?: number; +} + /** * Aggregate scores using the specified method. * @@ -171,13 +195,17 @@ const DEFAULT_MULTI_SAMPLE_CONCURRENCY = 10; * ``` */ export async function evaluateWithMultiSampling( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, - maxConcurrent: number = DEFAULT_MULTI_SAMPLE_CONCURRENCY, + options: EvaluateWithMultiSamplingOptions, ): Promise { + const { + client, + scenario, + transcript, + programmaticResult, + config, + maxConcurrent = DEFAULT_MULTI_SAMPLE_CONCURRENCY, + } = options; + const numSamples = config.num_samples || 1; // Run judge multiple times in parallel for improved performance @@ -186,13 +214,13 @@ export async function evaluateWithMultiSampling( items: Array.from({ length: numSamples }), concurrency: Math.min(numSamples, maxConcurrent), fn: async () => - evaluateWithFallback( + evaluateWithFallback({ client, scenario, transcript, programmaticResult, config, - ), + }), continueOnError: false, }); const responses = result.results; @@ -302,19 +330,17 @@ export function getConfidenceLevel( * @returns Multi-sample result with single sample */ export async function evaluateSingleSample( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, + options: EvaluateMultiSampleOptions, ): Promise { - const response = await evaluateWithFallback( + const { client, scenario, transcript, programmaticResult, config } = options; + + const response = await evaluateWithFallback({ client, scenario, transcript, programmaticResult, config, - ); + }); return { individual_scores: [response.quality_score], @@ -338,27 +364,11 @@ export async function evaluateSingleSample( * @returns Multi-sample result */ export async function runJudgment( - client: Anthropic, - scenario: TestScenario, - transcript: Transcript, - programmaticResult: ProgrammaticDetection[], - config: EvaluationConfig, + options: EvaluateMultiSampleOptions, ): Promise { - if (config.num_samples <= 1) { - return evaluateSingleSample( - client, - scenario, - transcript, - programmaticResult, - config, - ); + if (options.config.num_samples <= 1) { + return evaluateSingleSample(options); } - return evaluateWithMultiSampling( - client, - scenario, - transcript, - programmaticResult, - config, - ); + return evaluateWithMultiSampling(options); } diff --git a/tests/integration/stages/4-evaluation/index.test.ts b/tests/integration/stages/4-evaluation/index.test.ts index f128763..819dfbf 100644 --- a/tests/integration/stages/4-evaluation/index.test.ts +++ b/tests/integration/stages/4-evaluation/index.test.ts @@ -536,7 +536,12 @@ describe("Stage 4: Evaluation Integration", () => { it("handles empty execution results gracefully", async () => { const config = createTestConfig(); - const output = await runEvaluation("test-plugin", [], [], config); + const output = await runEvaluation({ + pluginName: "test-plugin", + scenarios: [], + executions: [], + config, + }); expect(output.plugin_name).toBe("test-plugin"); expect(output.results).toHaveLength(0); @@ -580,12 +585,12 @@ describe("Stage 4: Evaluation Integration", () => { // Use programmatic detection only (skip LLM for true negatives) config.evaluation.detection_mode = "programmatic_first"; - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.plugin_name).toBe("test-plugin"); expect(output.results).toHaveLength(2); @@ -640,12 +645,12 @@ describe("Stage 4: Evaluation Integration", () => { ]; const config = createTestConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); const result = output.results[0]; expect(result?.triggered).toBe(true); @@ -686,13 +691,13 @@ describe("Stage 4: Evaluation Integration", () => { onError: vi.fn(), }; - await runEvaluation( - "test-plugin", + await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, progress, - ); + }); expect(progress.onStageStart).toHaveBeenCalledWith("evaluation", 1); expect(progress.onStageComplete).toHaveBeenCalledWith( @@ -721,12 +726,12 @@ describe("Stage 4: Evaluation Integration", () => { ]; const config = createTestConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); // Should produce no results since scenario not found expect(output.results).toHaveLength(0); diff --git a/tests/unit/stages/2-generation/agent-scenario-generator.test.ts b/tests/unit/stages/2-generation/agent-scenario-generator.test.ts index ccf13cc..6e2c65f 100644 --- a/tests/unit/stages/2-generation/agent-scenario-generator.test.ts +++ b/tests/unit/stages/2-generation/agent-scenario-generator.test.ts @@ -649,11 +649,11 @@ describe("generateAllAgentScenarios", () => { ], }); - const scenarios = await generateAllAgentScenarios( - mockClient as unknown as Anthropic, + const scenarios = await generateAllAgentScenarios({ + client: mockClient as unknown as Anthropic, agents, config, - ); + }); expect(mockClient.messages.create).toHaveBeenCalledTimes(2); expect(scenarios).toHaveLength(2); @@ -667,12 +667,12 @@ describe("generateAllAgentScenarios", () => { }); const progressCallback = vi.fn(); - await generateAllAgentScenarios( - mockClient as unknown as Anthropic, + await generateAllAgentScenarios({ + client: mockClient as unknown as Anthropic, agents, config, - progressCallback, - ); + onProgress: progressCallback, + }); // Called once per completed agent (parallel execution) expect(progressCallback).toHaveBeenCalledTimes(2); @@ -681,11 +681,11 @@ describe("generateAllAgentScenarios", () => { }); it("should return empty array for empty agents list", async () => { - const scenarios = await generateAllAgentScenarios( - mockClient as unknown as Anthropic, - [], + const scenarios = await generateAllAgentScenarios({ + client: mockClient as unknown as Anthropic, + agents: [], config, - ); + }); expect(scenarios).toEqual([]); expect(mockClient.messages.create).not.toHaveBeenCalled(); @@ -731,11 +731,11 @@ describe("generateAllAgentScenarios", () => { ], }); - const scenarios = await generateAllAgentScenarios( - mockClient as unknown as Anthropic, + const scenarios = await generateAllAgentScenarios({ + client: mockClient as unknown as Anthropic, agents, config, - ); + }); expect(scenarios).toHaveLength(3); expect(scenarios[1].setup_messages).toBeDefined(); diff --git a/tests/unit/stages/2-generation/cost-estimator.test.ts b/tests/unit/stages/2-generation/cost-estimator.test.ts index a1f73f5..befa422 100644 --- a/tests/unit/stages/2-generation/cost-estimator.test.ts +++ b/tests/unit/stages/2-generation/cost-estimator.test.ts @@ -558,11 +558,11 @@ describe("countPromptTokens", () => { it("should count tokens for a prompt", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 150 }); - const count = await countPromptTokens( - mockClient as unknown as Anthropic, - "haiku", - "Test prompt content", - ); + const count = await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "haiku", + prompt: "Test prompt content", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(1); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( @@ -578,11 +578,11 @@ describe("countPromptTokens", () => { it("should resolve model shorthand before counting", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 }); - await countPromptTokens( - mockClient as unknown as Anthropic, - "sonnet", - "Test", - ); + await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "sonnet", + prompt: "Test", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( { @@ -597,11 +597,11 @@ describe("countPromptTokens", () => { const longPrompt = "word ".repeat(1000); mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 5000 }); - const count = await countPromptTokens( - mockClient as unknown as Anthropic, - "haiku", - longPrompt, - ); + const count = await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "haiku", + prompt: longPrompt, + }); expect(count).toBe(5000); }); @@ -609,13 +609,12 @@ describe("countPromptTokens", () => { it("should include system prompt in token count when provided as string", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 650 }); - const count = await countPromptTokens( - mockClient as unknown as Anthropic, - "haiku", - "Test prompt", - undefined, // timeout - "You are a helpful assistant.", - ); + const count = await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "haiku", + prompt: "Test prompt", + system: "You are a helpful assistant.", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( { @@ -640,13 +639,12 @@ describe("countPromptTokens", () => { }, ]; - const count = await countPromptTokens( - mockClient as unknown as Anthropic, - "haiku", - "Test prompt", - undefined, // timeout - systemPromptArray, - ); + const count = await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "haiku", + prompt: "Test prompt", + system: systemPromptArray, + }); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( { @@ -662,11 +660,11 @@ describe("countPromptTokens", () => { it("should not include system in request when not provided", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 }); - await countPromptTokens( - mockClient as unknown as Anthropic, - "haiku", - "Test prompt", - ); + await countPromptTokens({ + client: mockClient as unknown as Anthropic, + model: "haiku", + prompt: "Test prompt", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( { @@ -699,11 +697,11 @@ describe("estimateGenerationCost (async)", () => { .mockResolvedValueOnce({ input_tokens: 150 }) .mockResolvedValueOnce({ input_tokens: 200 }); - const estimate = await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["prompt1", "prompt2", "prompt3"], - "haiku", - ); + const estimate = await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["prompt1", "prompt2", "prompt3"], + model: "haiku", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(3); expect(estimate.stage).toBe("generation"); @@ -713,11 +711,11 @@ describe("estimateGenerationCost (async)", () => { }); it("should return zero cost for empty prompts", async () => { - const estimate = await estimateGenerationCost( - mockClient as unknown as Anthropic, - [], - "haiku", - ); + const estimate = await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: [], + model: "haiku", + }); expect(mockClient.messages.countTokens).not.toHaveBeenCalled(); expect(estimate.input_tokens).toBe(0); @@ -728,20 +726,20 @@ describe("estimateGenerationCost (async)", () => { it("should use correct model for cost calculation", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 1000 }); - const haikuEstimate = await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["test"], - "haiku", - ); + const haikuEstimate = await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["test"], + model: "haiku", + }); mockClient.messages.countTokens.mockClear(); mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 1000 }); - const opusEstimate = await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["test"], - "opus", - ); + const opusEstimate = await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["test"], + model: "opus", + }); // Opus should be more expensive than Haiku expect(opusEstimate.estimated_cost_usd).toBeGreaterThan( @@ -752,11 +750,11 @@ describe("estimateGenerationCost (async)", () => { it("should call countTokens for each prompt", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 50 }); - await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["prompt A", "prompt B"], - "haiku", - ); + await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["prompt A", "prompt B"], + model: "haiku", + }); expect(mockClient.messages.countTokens).toHaveBeenCalledWith( { @@ -781,11 +779,11 @@ describe("estimateGenerationCost (async)", () => { .mockResolvedValueOnce({ input_tokens: 200 }); await expect( - estimateGenerationCost( - mockClient as unknown as Anthropic, - ["prompt1", "prompt2", "prompt3"], - "haiku", - ), + estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["prompt1", "prompt2", "prompt3"], + model: "haiku", + }), ).rejects.toThrow("API error"); }); @@ -793,14 +791,13 @@ describe("estimateGenerationCost (async)", () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 500 }); const systemPrompt = "You are a test generator."; - await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["prompt1", "prompt2"], - "haiku", - 5, - undefined, - systemPrompt, - ); + await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["prompt1", "prompt2"], + model: "haiku", + concurrency: 5, + system: systemPrompt, + }); // Verify all calls include the system prompt expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(2); @@ -825,11 +822,11 @@ describe("estimateGenerationCost (async)", () => { it("should not include system when not provided", async () => { mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 }); - await estimateGenerationCost( - mockClient as unknown as Anthropic, - ["prompt"], - "haiku", - ); + await estimateGenerationCost({ + client: mockClient as unknown as Anthropic, + prompts: ["prompt"], + model: "haiku", + }); const callArgs = mockClient.messages.countTokens.mock.calls[0][0]; expect(callArgs).not.toHaveProperty("system"); diff --git a/tests/unit/stages/2-generation/diversity-manager.test.ts b/tests/unit/stages/2-generation/diversity-manager.test.ts index 888435b..797146a 100644 --- a/tests/unit/stages/2-generation/diversity-manager.test.ts +++ b/tests/unit/stages/2-generation/diversity-manager.test.ts @@ -263,13 +263,13 @@ describe("calculateDiversityMetrics", () => { describe("createBaseScenario", () => { it("should create a base scenario with correct properties", () => { - const result = createBaseScenario( - "test-skill", - "skill", - "create a hook", - "I want to create a hook", - 0, - ); + const result = createBaseScenario({ + componentRef: "test-skill", + componentType: "skill", + coreIntent: "create a hook", + basePrompt: "I want to create a hook", + index: 0, + }); expect(result.id).toBe("test-skill-base-0"); expect(result.component_ref).toBe("test-skill"); @@ -281,28 +281,30 @@ describe("createBaseScenario", () => { describe("baseToTestScenario", () => { it("should convert base scenario to test scenario", () => { - const base = createBaseScenario( - "test-skill", - "skill", - "create a hook", - "I want to create a hook", - 0, - ); + const base = createBaseScenario({ + componentRef: "test-skill", + componentType: "skill", + coreIntent: "create a hook", + basePrompt: "I want to create a hook", + index: 0, + }); const result = baseToTestScenario(base, "direct", true, "Test reasoning"); expect(result.id).toBe("test-skill-base-0"); - expect(result.component_ref).toBe("test-skill"); - expect(result.component_type).toBe("skill"); expect(result.scenario_type).toBe("direct"); - expect(result.user_prompt).toBe("I want to create a hook"); expect(result.expected_trigger).toBe(true); - expect(result.expected_component).toBe("test-skill"); expect(result.reasoning).toBe("Test reasoning"); }); it("should omit reasoning when not provided", () => { - const base = createBaseScenario("test-skill", "skill", "test", "test", 0); + const base = createBaseScenario({ + componentRef: "test-skill", + componentType: "skill", + coreIntent: "test", + basePrompt: "test", + index: 0, + }); const result = baseToTestScenario(base, "direct", true); diff --git a/tests/unit/stages/2-generation/skill-scenario-generator.test.ts b/tests/unit/stages/2-generation/skill-scenario-generator.test.ts index f07f8e7..600d0bd 100644 --- a/tests/unit/stages/2-generation/skill-scenario-generator.test.ts +++ b/tests/unit/stages/2-generation/skill-scenario-generator.test.ts @@ -565,11 +565,11 @@ describe("generateAllSkillScenarios", () => { ], }); - const scenarios = await generateAllSkillScenarios( - mockClient as unknown as Anthropic, + const scenarios = await generateAllSkillScenarios({ + client: mockClient as unknown as Anthropic, skills, config, - ); + }); expect(mockClient.messages.create).toHaveBeenCalledTimes(2); expect(scenarios).toHaveLength(2); @@ -583,12 +583,12 @@ describe("generateAllSkillScenarios", () => { }); const progressCallback = vi.fn(); - await generateAllSkillScenarios( - mockClient as unknown as Anthropic, + await generateAllSkillScenarios({ + client: mockClient as unknown as Anthropic, skills, config, - progressCallback, - ); + onProgress: progressCallback, + }); // Called once per completed skill (parallel execution) expect(progressCallback).toHaveBeenCalledTimes(2); @@ -597,11 +597,11 @@ describe("generateAllSkillScenarios", () => { }); it("should return empty array for empty skills list", async () => { - const scenarios = await generateAllSkillScenarios( - mockClient as unknown as Anthropic, - [], + const scenarios = await generateAllSkillScenarios({ + client: mockClient as unknown as Anthropic, + skills: [], config, - ); + }); expect(scenarios).toEqual([]); expect(mockClient.messages.create).not.toHaveBeenCalled(); @@ -646,11 +646,11 @@ describe("generateAllSkillScenarios", () => { ], }); - const scenarios = await generateAllSkillScenarios( - mockClient as unknown as Anthropic, + const scenarios = await generateAllSkillScenarios({ + client: mockClient as unknown as Anthropic, skills, config, - ); + }); expect(scenarios).toHaveLength(3); }); diff --git a/tests/unit/stages/3-execution/index.test.ts b/tests/unit/stages/3-execution/index.test.ts index 0272553..8268da3 100644 --- a/tests/unit/stages/3-execution/index.test.ts +++ b/tests/unit/stages/3-execution/index.test.ts @@ -273,7 +273,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig(); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(verifyPluginLoad).toHaveBeenCalledWith({ pluginPath: config.plugin.path, @@ -298,7 +298,7 @@ describe("runExecution", () => { ]; const config = createConfig(); - const result = await runExecution(analysis, scenarios, config); + const result = await runExecution({ analysis, scenarios, config }); expect(result.results).toHaveLength(0); expect(result.error_count).toBe(2); // All scenarios failed @@ -331,7 +331,7 @@ describe("runExecution", () => { scope: { ...createConfig().scope, mcp_servers: true }, }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); // The parallel mock receives the filtered scenarios expect(parallel).toHaveBeenCalled(); @@ -353,7 +353,7 @@ describe("runExecution", () => { const analysis = createAnalysis(); const config = createConfig(); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(resolveExecutionStrategy).toHaveBeenCalledWith( config.execution, @@ -379,7 +379,7 @@ describe("runExecution", () => { scenarios, }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(resolveExecutionStrategy).toHaveBeenCalledWith( config.execution, @@ -408,7 +408,7 @@ describe("runExecution", () => { }); try { - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); } catch { // Expected to fail due to incomplete mocking of batched execution } @@ -432,7 +432,7 @@ describe("runExecution", () => { .mockResolvedValueOnce(createExecutionResult({ scenario_id: "s2" })) .mockResolvedValueOnce(createExecutionResult({ scenario_id: "s3" })); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(parallel).toHaveBeenCalledWith( expect.objectContaining({ @@ -455,7 +455,12 @@ describe("runExecution", () => { onError: vi.fn(), }; - await runExecution(analysis, scenarios, config, mockProgress); + await runExecution({ + analysis, + scenarios, + config, + progress: mockProgress, + }); expect(mockProgress.onStageStart).toHaveBeenCalledWith("execution", 1); expect(mockProgress.onStageComplete).toHaveBeenCalledWith( @@ -481,7 +486,7 @@ describe("runExecution", () => { createExecutionResult({ scenario_id: "s2", cost_usd: 0.02 }), ); - const result = await runExecution(analysis, scenarios, config); + const result = await runExecution({ analysis, scenarios, config }); expect(result.results).toHaveLength(2); expect(result.total_cost_usd).toBeCloseTo(0.03); @@ -516,7 +521,7 @@ describe("runExecution", () => { }), ); - const result = await runExecution(analysis, scenarios, config); + const result = await runExecution({ analysis, scenarios, config }); expect(result.success_count).toBe(1); expect(result.error_count).toBe(1); @@ -527,7 +532,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig(); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); // Verify parallel is called with continueOnError: true expect(parallel).toHaveBeenCalledWith( @@ -544,7 +549,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig(); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(logger.warn).toHaveBeenCalledWith( expect.stringContaining("exceed budget"), @@ -558,7 +563,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig(); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(writeJsonAsync).toHaveBeenCalled(); }); @@ -576,7 +581,7 @@ describe("runExecution", () => { }, }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); // Verify execution completes with sanitization enabled expect(writeJsonAsync).toHaveBeenCalled(); @@ -623,7 +628,7 @@ describe("runExecution", () => { }), ); - const result = await runExecution(analysis, scenarios, config); + const result = await runExecution({ analysis, scenarios, config }); expect(result.total_tools_captured).toBe(3); }); @@ -633,7 +638,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig(); - const result = await runExecution(analysis, scenarios, config); + const result = await runExecution({ analysis, scenarios, config }); expect(result.total_duration_ms).toBeGreaterThanOrEqual(0); expect(result.total_duration_ms).toBeLessThan(5000); // Should be fast in tests @@ -651,7 +656,7 @@ describe("runExecution", () => { }, }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); // Rate limiting is applied inside executeAllScenariosIsolated // We verify the config is passed correctly @@ -669,7 +674,7 @@ describe("runExecution", () => { const scenarios = [createScenario()]; const config = createConfig({ rewind_file_changes: true }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); // The parallel mock will call the fn, which internally uses executeScenarioWithCheckpoint // We verify by checking the mock was called (via parallel's fn) @@ -685,7 +690,7 @@ describe("runExecution", () => { scope: { ...createConfig().scope, mcp_servers: true }, }); - await runExecution(analysis, scenarios, config); + await runExecution({ analysis, scenarios, config }); expect(verifyPluginLoad).toHaveBeenCalledWith( expect.objectContaining({ diff --git a/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts b/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts index 736aafc..43f87b1 100644 --- a/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts +++ b/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts @@ -122,14 +122,14 @@ describe("buildEvaluationResult", () => { const conflictAnalysis = createConflictAnalysis(); const judgment = createMultiSampleResult({ aggregated_score: 9 }); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - true, - detections, + triggered: true, + uniqueDetections: detections, conflictAnalysis, judgment, - "both", - ); + detectionSource: "both", + }); expect(result.scenario_id).toBe("test-scenario-1"); expect(result.triggered).toBe(true); @@ -152,14 +152,14 @@ describe("buildEvaluationResult", () => { const scenario = createScenario({ expected_trigger: false }); const conflictAnalysis = createConflictAnalysis(); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - false, - [], // no detections + triggered: false, + uniqueDetections: [], // no detections conflictAnalysis, - null, // no judgment - "programmatic", - ); + judgment: null, // no judgment + detectionSource: "programmatic", + }); expect(result.triggered).toBe(false); expect(result.confidence).toBe(0); @@ -174,14 +174,14 @@ describe("buildEvaluationResult", () => { const detections = [createDetection()]; const conflictAnalysis = createConflictAnalysis(); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - true, - detections, + triggered: true, + uniqueDetections: detections, conflictAnalysis, - null, - "programmatic", - ); + judgment: null, + detectionSource: "programmatic", + }); expect(result.quality_score).toBe(7); expect(result.summary).toBe("Correctly triggered component"); @@ -191,14 +191,14 @@ describe("buildEvaluationResult", () => { const scenario = createScenario({ expected_trigger: true }); const conflictAnalysis = createConflictAnalysis(); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - false, // didn't trigger but was expected - [], + triggered: false, // didn't trigger but was expected + uniqueDetections: [], conflictAnalysis, - null, - "programmatic", - ); + judgment: null, + detectionSource: "programmatic", + }); expect(result.summary).toBe("Incorrectly did not trigger component"); }); @@ -213,14 +213,14 @@ describe("buildEvaluationResult", () => { }), }); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - true, - detections, + triggered: true, + uniqueDetections: detections, conflictAnalysis, judgment, - "both", - ); + detectionSource: "both", + }); expect(result.summary).toBe("Custom LLM summary"); }); @@ -233,14 +233,14 @@ describe("buildEvaluationResult", () => { all_issues: ["Minor issue 1", "Minor issue 2"], }); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - true, - detections, + triggered: true, + uniqueDetections: detections, conflictAnalysis, judgment, - "both", - ); + detectionSource: "both", + }); expect(result.issues).toEqual(["Minor issue 1", "Minor issue 2"]); }); @@ -260,14 +260,14 @@ describe("buildEvaluationResult", () => { ], }); - const result = buildEvaluationResult( + const result = buildEvaluationResult({ scenario, - true, - detections, + triggered: true, + uniqueDetections: detections, conflictAnalysis, - null, - "programmatic", - ); + judgment: null, + detectionSource: "programmatic", + }); expect(result.has_conflict).toBe(true); expect(result.conflict_severity).toBe("major"); diff --git a/tests/unit/stages/4-evaluation/index.test.ts b/tests/unit/stages/4-evaluation/index.test.ts index 8c6dcce..ff7b22f 100644 --- a/tests/unit/stages/4-evaluation/index.test.ts +++ b/tests/unit/stages/4-evaluation/index.test.ts @@ -236,12 +236,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.plugin_name).toBe("test-plugin"); expect(output.results).toHaveLength(1); @@ -254,12 +254,12 @@ describe("runEvaluation", () => { const executions: ExecutionResult[] = []; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results).toHaveLength(0); expect(output.metrics.total_scenarios).toBe(0); @@ -271,7 +271,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - await runEvaluation("test-plugin", scenarios, executions, config); + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + }); expect(writeJsonAsync).toHaveBeenCalledTimes(1); expect(writeJsonAsync).toHaveBeenCalledWith( @@ -295,12 +300,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.triggered).toBe(true); expect(output.results[0]?.confidence).toBe(100); @@ -340,12 +345,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.triggered).toBe(false); expect(output.results[0]?.confidence).toBe(0); @@ -367,12 +372,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.triggered).toBe(true); expect(output.results[0]?.all_triggered_components).toContainEqual( @@ -399,12 +404,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.triggered).toBe(true); expect(output.results[0]?.all_triggered_components).toContainEqual( @@ -435,12 +440,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(runJudgment).toHaveBeenCalled(); expect(output.results[0]?.quality_score).toBe(9); @@ -477,12 +482,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(runJudgment).toHaveBeenCalled(); expect(output.results[0]?.issues).toContain( @@ -516,7 +521,12 @@ describe("runEvaluation", () => { }, }); - await runEvaluation("test-plugin", scenarios, executions, config); + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + }); // True negative with direct scenario type - no LLM needed expect(runJudgment).not.toHaveBeenCalled(); @@ -543,7 +553,12 @@ describe("runEvaluation", () => { }, }); - await runEvaluation("test-plugin", scenarios, executions, config); + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + }); expect(runJudgment).toHaveBeenCalled(); }); @@ -555,12 +570,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); // Should still return result with error captured expect(output.results).toHaveLength(1); @@ -579,12 +594,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.has_conflict).toBe(false); expect(output.results[0]?.conflict_severity).toBe("none"); @@ -599,12 +614,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.has_conflict).toBe(true); expect(output.results[0]?.conflict_severity).toBe("major"); @@ -624,12 +639,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.has_conflict).toBe(true); expect(output.results[0]?.conflict_severity).toBe("minor"); @@ -660,12 +675,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.trigger_rate).toBeCloseTo(2 / 3); expect(output.metrics.triggered_count).toBe(2); @@ -698,12 +713,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.accuracy).toBe(1); // Both correct }); @@ -719,12 +734,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.total_cost_usd).toBe(0.08); expect(output.metrics.avg_cost_per_scenario).toBe(0.04); @@ -748,12 +763,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.conflict_count).toBe(1); expect(output.metrics.major_conflicts).toBe(1); @@ -770,8 +785,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - await runEvaluation("test-plugin", scenarios, executions, config, { - onStageStart, + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + progress: { onStageStart }, }); expect(onStageStart).toHaveBeenCalledWith("evaluation", 2); @@ -783,8 +802,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - await runEvaluation("test-plugin", scenarios, executions, config, { - onStageComplete, + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + progress: { onStageComplete }, }); expect(onStageComplete).toHaveBeenCalledWith( @@ -802,8 +825,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - await runEvaluation("test-plugin", scenarios, executions, config, { - onError, + await runEvaluation({ + pluginName: "test-plugin", + scenarios, + executions, + config, + progress: { onError }, }); // onError is called by parallel utility, but our mock doesn't trigger it @@ -848,12 +875,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results).toHaveLength(3); @@ -881,12 +908,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results).toHaveLength(1); expect(logger.warn).toHaveBeenCalledWith( @@ -917,12 +944,12 @@ describe("runEvaluation", () => { ]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.detection_source).toBe("programmatic"); }); @@ -932,12 +959,12 @@ describe("runEvaluation", () => { const executions = [createExecutionResult()]; const config = createConfig(); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.detection_source).toBe("both"); }); @@ -952,12 +979,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.results[0]?.detection_source).toBe("llm"); }); @@ -983,12 +1010,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.multi_sample_stats).toBeDefined(); expect(output.metrics.multi_sample_stats?.avg_score_variance).toBeCloseTo( @@ -1035,12 +1062,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.multi_sample_stats).toBeDefined(); expect( @@ -1063,12 +1090,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.multi_sample_stats).toBeUndefined(); }); @@ -1114,12 +1141,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.multi_sample_stats).toBeDefined(); // 1 out of 2 scenarios had unanimous trigger_accuracy agreement @@ -1168,12 +1195,12 @@ describe("runEvaluation", () => { }, }); - const output = await runEvaluation( - "test-plugin", + const output = await runEvaluation({ + pluginName: "test-plugin", scenarios, executions, config, - ); + }); expect(output.metrics.multi_sample_stats).toBeDefined(); // s2 has high variance (> 1.0) diff --git a/tests/unit/stages/4-evaluation/llm-judge.test.ts b/tests/unit/stages/4-evaluation/llm-judge.test.ts index c6f8b8b..7b5d4ee 100644 --- a/tests/unit/stages/4-evaluation/llm-judge.test.ts +++ b/tests/unit/stages/4-evaluation/llm-judge.test.ts @@ -366,13 +366,13 @@ describe("evaluateWithLLMJudge", () => { const detections = createDetections([{ name: "commit", type: "skill" }]); const config = createConfig({ model: "sonnet", max_tokens: 2048 }); - await evaluateWithLLMJudge( - mockClient, + await evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(mockClient.beta.messages.create).toHaveBeenCalledTimes(1); const callArgs = mockClient.beta.messages.create.mock @@ -403,13 +403,13 @@ describe("evaluateWithLLMJudge", () => { const detections = createDetections([{ name: "commit", type: "skill" }]); const config = createConfig(); - const result = await evaluateWithLLMJudge( - mockClient, + const result = await evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.quality_score).toBe(9); expect(result.response_relevance).toBe(8); @@ -436,13 +436,13 @@ describe("evaluateWithLLMJudge", () => { const detections = createDetections([]); const config = createConfig(); - const result = await evaluateWithLLMJudge( - mockClient, + const result = await evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.highlights).toHaveLength(1); expect(result.highlights?.[0]?.description).toBe("Good trigger"); @@ -467,13 +467,13 @@ describe("evaluateWithLLMJudge", () => { const detections = createDetections([]); const config = createConfig(); - const result = await evaluateWithLLMJudge( - mockClient, + const result = await evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.highlights).toBeUndefined(); }); @@ -486,13 +486,13 @@ describe("evaluateWithLLMJudge", () => { const config = createConfig(); await expect( - evaluateWithLLMJudge( - mockClient, + evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ), + }), ).rejects.toThrow("Failed to parse structured output"); }); @@ -512,13 +512,13 @@ describe("evaluateWithLLMJudge", () => { const config = createConfig(); await expect( - evaluateWithLLMJudge( - mockClient, + evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ), + }), ).rejects.toThrow("Failed to parse structured output"); }); @@ -538,13 +538,13 @@ describe("evaluateWithLLMJudge", () => { const config = createConfig(); await expect( - evaluateWithLLMJudge( - mockClient, + evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ), + }), ).rejects.toThrow("Failed to parse structured output"); }); @@ -571,13 +571,13 @@ describe("evaluateWithLLMJudge", () => { const config = createConfig(); await expect( - evaluateWithLLMJudge( - mockClient, + evaluateWithLLMJudge({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ), + }), ).rejects.toThrow("No text block"); }); }); @@ -595,13 +595,13 @@ describe("evaluateWithFallback", () => { const detections = createDetections([{ name: "commit", type: "skill" }]); const config = createConfig(); - const result = await evaluateWithFallback( - mockClient, + const result = await evaluateWithFallback({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.quality_score).toBe(9); expect(mockClient.beta.messages.create).toHaveBeenCalledTimes(1); @@ -625,13 +625,13 @@ describe("evaluateWithFallback", () => { const detections = createDetections([]); const config = createConfig(); - const result = await evaluateWithFallback( - mockClient, + const result = await evaluateWithFallback({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.quality_score).toBe(7); expect(betaCreateMock).toHaveBeenCalledTimes(1); @@ -656,13 +656,13 @@ describe("evaluateWithFallback", () => { const detections = createDetections([]); const config = createConfig(); - const result = await evaluateWithFallback( - mockClient, + const result = await evaluateWithFallback({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.quality_score).toBe(6); }); @@ -684,13 +684,13 @@ describe("evaluateWithFallback", () => { const detections = createDetections([]); const config = createConfig(); - const result = await evaluateWithFallback( - mockClient, + const result = await evaluateWithFallback({ + client: mockClient, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); // Should return default error response expect(result.quality_score).toBe(1); diff --git a/tests/unit/stages/4-evaluation/multi-sampler.test.ts b/tests/unit/stages/4-evaluation/multi-sampler.test.ts index b2a2b5e..c07f245 100644 --- a/tests/unit/stages/4-evaluation/multi-sampler.test.ts +++ b/tests/unit/stages/4-evaluation/multi-sampler.test.ts @@ -399,13 +399,13 @@ describe("evaluateSingleSample", () => { const detections = createDetections(); const config = createConfig(); - const result = await evaluateSingleSample( + const result = await evaluateSingleSample({ client, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(result.individual_scores).toEqual([8]); expect(result.aggregated_score).toBe(8); @@ -422,13 +422,13 @@ describe("evaluateSingleSample", () => { }); (evaluateWithFallback as Mock).mockResolvedValue(mockResponse); - const result = await evaluateSingleSample( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), - createConfig(), - ); + const result = await evaluateSingleSample({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), + config: createConfig(), + }); expect(result.all_issues).toEqual([ "Minor formatting issue", @@ -446,35 +446,35 @@ describe("evaluateSingleSample", () => { const detections = createDetections(); const config = createConfig({ model: "sonnet", max_tokens: 2048 }); - await evaluateSingleSample( + await evaluateSingleSample({ client, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); expect(evaluateWithFallback).toHaveBeenCalledTimes(1); - expect(evaluateWithFallback).toHaveBeenCalledWith( + expect(evaluateWithFallback).toHaveBeenCalledWith({ client, scenario, transcript, - detections, + programmaticResult: detections, config, - ); + }); }); it("should always set is_unanimous to true (single sample is trivially unanimous)", async () => { const mockResponse = createJudgeResponse({ trigger_accuracy: "correct" }); (evaluateWithFallback as Mock).mockResolvedValue(mockResponse); - const result = await evaluateSingleSample( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), - createConfig(), - ); + const result = await evaluateSingleSample({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), + config: createConfig(), + }); expect(result.is_unanimous).toBe(true); }); @@ -491,13 +491,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); - await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(evaluateWithFallback).toHaveBeenCalledTimes(3); }); @@ -518,13 +518,13 @@ describe("evaluateWithMultiSampling", () => { aggregate_method: "average", }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.individual_scores).toEqual([7, 8, 9]); expect(result.aggregated_score).toBe(8); // (7+8+9)/3 @@ -544,13 +544,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3, aggregate_method: "median" }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.aggregated_score).toBe(8); // median of [5, 8, 10] }); @@ -568,13 +568,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.consensus_trigger_accuracy).toBe("correct"); }); @@ -592,13 +592,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.is_unanimous).toBe(true); expect(result.consensus_trigger_accuracy).toBe("correct"); @@ -617,13 +617,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.is_unanimous).toBe(false); // Majority vote should still work @@ -643,13 +643,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(result.all_issues).toHaveLength(3); expect(result.all_issues).toContain("Issue A"); @@ -682,13 +682,13 @@ describe("evaluateWithMultiSampling", () => { aggregate_method: "average", }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); // Representative should have aggregated values expect(result.representative_response.quality_score).toBe(8); // (7+9)/2 @@ -704,13 +704,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 1 }); - const result = await evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(evaluateWithFallback).toHaveBeenCalledTimes(1); expect(result.individual_scores).toEqual([7]); @@ -726,13 +726,13 @@ describe("evaluateWithMultiSampling", () => { const config = createConfig({ num_samples: 3 }); await expect( - evaluateWithMultiSampling( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + evaluateWithMultiSampling({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ), + }), ).rejects.toThrow("API error"); }); }); @@ -748,13 +748,13 @@ describe("runJudgment", () => { const config = createConfig({ num_samples: 1 }); - const result = await runJudgment( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await runJudgment({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(evaluateWithFallback).toHaveBeenCalledTimes(1); expect(result.individual_scores).toEqual([8]); @@ -767,13 +767,13 @@ describe("runJudgment", () => { const config = createConfig({ num_samples: 3 }); - const result = await runJudgment( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await runJudgment({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); expect(evaluateWithFallback).toHaveBeenCalledTimes(3); expect(result.individual_scores).toHaveLength(3); @@ -785,13 +785,13 @@ describe("runJudgment", () => { const config = createConfig({ num_samples: 0 }); - const result = await runJudgment( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + const result = await runJudgment({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ); + }); // Should treat 0 as single sample (via evaluateSingleSample path) expect(evaluateWithFallback).toHaveBeenCalledTimes(1); @@ -804,13 +804,13 @@ describe("runJudgment", () => { const config = createConfig({ num_samples: 1 }); await expect( - runJudgment( - createMockClient(), - createScenario(), - createTranscript(), - createDetections(), + runJudgment({ + client: createMockClient(), + scenario: createScenario(), + transcript: createTranscript(), + programmaticResult: createDetections(), config, - ), + }), ).rejects.toThrow("API Error"); }); }); diff --git a/tests/unit/stages/4-evaluation/programmatic-detector.test.ts b/tests/unit/stages/4-evaluation/programmatic-detector.test.ts index 9f04074..0aa6a17 100644 --- a/tests/unit/stages/4-evaluation/programmatic-detector.test.ts +++ b/tests/unit/stages/4-evaluation/programmatic-detector.test.ts @@ -960,12 +960,12 @@ describe("detectAllComponentsWithHooks", () => { }, ]; - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, hookResponses, - ); + }); expect(detections.length).toBeGreaterThan(0); expect(detections.some((d) => d.component_type === "skill")).toBe(true); @@ -996,12 +996,12 @@ describe("detectAllComponentsWithHooks", () => { }, ]; - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, hookResponses, - ); + }); const hookDetections = detections.filter( (d) => d.component_type === "hook", @@ -1024,12 +1024,12 @@ describe("detectAllComponentsWithHooks", () => { }, ]; - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, hookResponses, - ); + }); expect(detections.every((d) => d.component_type !== "hook")).toBe(true); }); @@ -1039,11 +1039,11 @@ describe("detectAllComponentsWithHooks", () => { const transcript = createTranscript([]); const scenario = createScenario({ component_type: "skill" }); - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, - ); + }); expect(detections).toHaveLength(1); expect(detections[0]?.component_type).toBe("skill"); @@ -1057,11 +1057,11 @@ describe("detectAllComponentsWithHooks", () => { const transcript = createTranscript([]); const scenario = createScenario({ component_type: "skill" }); - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, - ); + }); expect(detections).toHaveLength(1); }); @@ -1082,13 +1082,12 @@ describe("detectAllComponentsWithHooks", () => { }, ]; - const detections = detectAllComponentsWithHooks( + const detections = detectAllComponentsWithHooks({ captures, transcript, scenario, - undefined, subagentCaptures, - ); + }); expect(detections).toHaveLength(1); expect(detections[0]?.component_type).toBe("agent");