diff --git a/CLAUDE.md b/CLAUDE.md
index 0113e0d..fd8cecb 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,14 +6,14 @@ This file provides guidance to Claude Code when working with this repository.
 
 This project has Serena configured. **You MUST follow these rules:**
 
-| Instead of...              | USE THIS                             | Cost     |
-| -------------------------- | ------------------------------------ | -------- |
-| Built-in `Grep`, `grep`    | `rg "pattern"`                       | **FREE** |
-| Built-in `Edit` tool       | Morph `edit_file`                    | **FREE** |
-| Reading entire files       | Serena `get_symbols_overview`        | **FREE** |
-| Searching for symbols      | Serena `find_symbol`                 | **FREE** |
-| Finding usages             | Serena `find_referencing_symbols`    | **FREE** |
-| Semantic/fuzzy search      | `warpgrep_codebase_search`           | **$$$**  |
+| Instead of...           | USE THIS                          | Cost     |
+| ----------------------- | --------------------------------- | -------- |
+| Built-in `Grep`, `grep` | `rg "pattern"`                    | **FREE** |
+| Built-in `Edit` tool    | Morph `edit_file`                 | **FREE** |
+| Reading entire files    | Serena `get_symbols_overview`     | **FREE** |
+| Searching for symbols   | Serena `find_symbol`              | **FREE** |
+| Finding usages          | Serena `find_referencing_symbols` | **FREE** |
+| Semantic/fuzzy search   | `warpgrep_codebase_search`        | **$$$**  |
 
 > **FREE tools first. `warpgrep` costs real money - only use when `rg` and Serena cannot answer the question.**
 
diff --git a/src/cli/commands/execute.ts b/src/cli/commands/execute.ts
index 10b473a..95a7746 100644
--- a/src/cli/commands/execute.ts
+++ b/src/cli/commands/execute.ts
@@ -76,12 +76,12 @@ export function registerExecuteCommand(program: Command): void {
         writeJson(`${resultsDir}/scenarios.json`, generation.scenarios);
 
         // Stage 3: Execution
-        const execution = await runExecution(
+        const execution = await runExecution({
           analysis,
-          generation.scenarios,
+          scenarios: generation.scenarios,
           config,
-          consoleProgress,
-        );
+          progress: consoleProgress,
+        });
         state = updateStateAfterExecution(state, execution.results);
         await saveState(state);
 
diff --git a/src/cli/commands/resume.ts b/src/cli/commands/resume.ts
index 72402cb..c11e1f2 100644
--- a/src/cli/commands/resume.ts
+++ b/src/cli/commands/resume.ts
@@ -58,20 +58,20 @@ async function resumeFromAnalysis(
   const generation = await runGeneration(analysis, config);
   writeJson(`${resultsDir}/scenarios.json`, generation.scenarios);
 
-  const execution = await runExecution(
+  const execution = await runExecution({
     analysis,
-    generation.scenarios,
+    scenarios: generation.scenarios,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
-  const evaluation = await runEvaluation(
-    analysis.plugin_name,
-    generation.scenarios,
-    execution.results,
+  const evaluation = await runEvaluation({
+    pluginName: analysis.plugin_name,
+    scenarios: generation.scenarios,
+    executions: execution.results,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
   // Chain state updates
   let currentState = updateStateAfterAnalysis(initialState, analysis);
@@ -107,20 +107,20 @@ async function resumeFromGeneration(
   const generation = await runGeneration(analysisData, config);
   writeJson(`${resultsDir}/scenarios.json`, generation.scenarios);
 
-  const execution = await runExecution(
-    analysisData,
-    generation.scenarios,
+  const execution = await runExecution({
+    analysis: analysisData,
+    scenarios: generation.scenarios,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
-  const evaluation = await runEvaluation(
-    analysisData.plugin_name,
-    generation.scenarios,
-    execution.results,
+  const evaluation = await runEvaluation({
+    pluginName: analysisData.plugin_name,
+    scenarios: generation.scenarios,
+    executions: execution.results,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
   // Chain state updates
   let currentState = updateStateAfterGeneration(
@@ -158,20 +158,20 @@ async function resumeFromExecution(
     );
   }
 
-  const execution = await runExecution(
-    analysisData,
-    scenarioData,
+  const execution = await runExecution({
+    analysis: analysisData,
+    scenarios: scenarioData,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
-  const evaluation = await runEvaluation(
-    analysisData.plugin_name,
-    scenarioData,
-    execution.results,
+  const evaluation = await runEvaluation({
+    pluginName: analysisData.plugin_name,
+    scenarios: scenarioData,
+    executions: execution.results,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
   // Chain state updates
   let currentState = updateStateAfterExecution(initialState, execution.results);
@@ -206,13 +206,13 @@ async function resumeFromEvaluation(
     );
   }
 
-  const evaluation = await runEvaluation(
-    analysisData.plugin_name,
-    scenarioData,
-    executionData,
+  const evaluation = await runEvaluation({
+    pluginName: analysisData.plugin_name,
+    scenarios: scenarioData,
+    executions: executionData,
     config,
-    consoleProgress,
-  );
+    progress: consoleProgress,
+  });
 
   // Chain state updates
   let currentState = updateStateAfterEvaluation(
diff --git a/src/cli/commands/run.ts b/src/cli/commands/run.ts
index 7e1a3bf..2e59bae 100644
--- a/src/cli/commands/run.ts
+++ b/src/cli/commands/run.ts
@@ -145,25 +145,25 @@ export function registerRunCommand(program: Command): void {
         }
 
         // Stage 3: Execution
-        const execution = await runExecution(
+        const execution = await runExecution({
           analysis,
-          scenariosToRun,
+          scenarios: scenariosToRun,
           config,
-          consoleProgress,
-        );
+          progress: consoleProgress,
+        });
         state = updateStateAfterExecution(state, execution.results);
         await saveState(state);
 
         writeExecutionMetadata(resultsDir, execution);
 
         // Stage 4: Evaluation
-        const evaluation = await runEvaluation(
-          analysis.plugin_name,
-          scenariosToRun,
-          execution.results,
+        const evaluation = await runEvaluation({
+          pluginName: analysis.plugin_name,
+          scenarios: scenariosToRun,
+          executions: execution.results,
           config,
-          consoleProgress,
-        );
+          progress: consoleProgress,
+        });
         state = updateStateAfterEvaluation(state, evaluation.results);
         await saveState(state);
 
diff --git a/src/stages/2-generation/agent-scenario-generator.ts b/src/stages/2-generation/agent-scenario-generator.ts
index 94b1f30..67cab31 100644
--- a/src/stages/2-generation/agent-scenario-generator.ts
+++ b/src/stages/2-generation/agent-scenario-generator.ts
@@ -306,25 +306,35 @@ export async function generateAgentScenarios(
   return parseAgentScenarioResponse(response, agent);
 }
 
+/**
+ * Options for generateAllAgentScenarios.
+ */
+export interface GenerateAllAgentScenariosOptions {
+  /** Anthropic client */
+  client: Anthropic;
+  /** Array of agent components */
+  agents: AgentComponent[];
+  /** Generation config */
+  config: GenerationConfig;
+  /** Optional progress callback */
+  onProgress?: (completed: number, total: number, agent: string) => void;
+  /** Maximum concurrent LLM calls (defaults to 10) */
+  maxConcurrent?: number;
+}
+
 /**
  * Generate scenarios for all agents.
  *
  * Uses parallel execution with optional rate limiting.
  *
- * @param client - Anthropic client
- * @param agents - Array of agent components
- * @param config - Generation config
- * @param onProgress - Optional progress callback
- * @param maxConcurrent - Maximum concurrent LLM calls (defaults to 10)
+ * @param options - Named arguments; see {@link GenerateAllAgentScenariosOptions}
  * @returns Array of all test scenarios
  */
 export async function generateAllAgentScenarios(
-  client: Anthropic,
-  agents: AgentComponent[],
-  config: GenerationConfig,
-  onProgress?: (completed: number, total: number, agent: string) => void,
-  maxConcurrent = 10,
+  options: GenerateAllAgentScenariosOptions,
 ): Promise<TestScenario[]> {
+  const { client, agents, config, onProgress, maxConcurrent = 10 } = options;
+
   // Create rate limiter if configured
   const rateLimiter = setupRateLimiter(config);
 
diff --git a/src/stages/2-generation/cost-estimator.ts b/src/stages/2-generation/cost-estimator.ts
index e9c6dcc..810bf4f 100644
--- a/src/stages/2-generation/cost-estimator.ts
+++ b/src/stages/2-generation/cost-estimator.ts
@@ -27,6 +27,40 @@ import type { TextBlockParam } from "@anthropic-ai/sdk/resources/messages/messag
  */
 export type SystemPrompt = string | TextBlockParam[];
 
+/**
+ * Options for countPromptTokens.
+ */
+export interface CountPromptTokensOptions {
+  /** Anthropic client */
+  client: Anthropic;
+  /** Model to use */
+  model: string;
+  /** Prompt text */
+  prompt: string;
+  /** Optional per-request timeout in milliseconds (default: 30000 = 30s) */
+  timeout?: number;
+  /** Optional system prompt (string or array of text blocks for caching) */
+  system?: SystemPrompt;
+}
+
+/**
+ * Options for estimateGenerationCost.
+ */
+export interface EstimateGenerationCostOptions {
+  /** Anthropic client */
+  client: Anthropic;
+  /** Prompts to estimate */
+  prompts: string[];
+  /** Model to use */
+  model: string;
+  /** Maximum concurrent token counting operations (default: 10) */
+  concurrency?: number;
+  /** Timeout for each token counting request in ms (default: 30000) */
+  tokenCountingTimeout?: number;
+  /** Optional system prompt to include in token count */
+  system?: SystemPrompt;
+}
+
 /**
  * Default SDK timeout in milliseconds (2 minutes).
  * This is a conservative default for client-level timeout.
@@ -74,12 +108,9 @@ export function createAnthropicClient(timeout?: number): Anthropic {
  * @returns Token count
  */
 export async function countPromptTokens(
-  client: Anthropic,
-  model: string,
-  prompt: string,
-  timeout?: number,
-  system?: SystemPrompt,
+  options: CountPromptTokensOptions,
 ): Promise<number> {
+  const { client, model, prompt, timeout, system } = options;
   const result = await client.messages.countTokens(
     {
       model: resolveModelId(model),
@@ -111,20 +142,32 @@ const DEFAULT_TOKEN_COUNTING_CONCURRENCY = 10;
  * @returns Token estimate
  */
 export async function estimateGenerationCost(
-  client: Anthropic,
-  prompts: string[],
-  model: string,
-  concurrency: number = DEFAULT_TOKEN_COUNTING_CONCURRENCY,
-  tokenCountingTimeout?: number,
-  system?: SystemPrompt,
+  options: EstimateGenerationCostOptions,
 ): Promise<TokenEstimate> {
+  const {
+    client,
+    prompts,
+    model,
+    concurrency = DEFAULT_TOKEN_COUNTING_CONCURRENCY,
+    tokenCountingTimeout,
+    system,
+  } = options;
+
   // Count tokens in parallel for improved performance
   // Use continueOnError: false to fail fast - partial results would underestimate costs
   const result = await parallel({
     items: prompts,
     concurrency,
     fn: async (prompt) =>
-      countPromptTokens(client, model, prompt, tokenCountingTimeout, system),
+      countPromptTokens({
+        client,
+        model,
+        prompt,
+        ...(tokenCountingTimeout !== undefined && {
+          timeout: tokenCountingTimeout,
+        }),
+        ...(system !== undefined && { system }),
+      }),
     continueOnError: false,
   });
   const totalInputTokens = result.results.reduce((a, b) => a + b, 0);
diff --git a/src/stages/2-generation/diversity-manager.ts b/src/stages/2-generation/diversity-manager.ts
index 44bb8de..e02934a 100644
--- a/src/stages/2-generation/diversity-manager.ts
+++ b/src/stages/2-generation/diversity-manager.ts
@@ -58,23 +58,33 @@ export function calculateScenarioDistribution(
   };
 }
 
+/**
+ * Options for createBaseScenario.
+ */
+export interface CreateBaseScenarioOptions {
+  /** Component reference (name) */
+  componentRef: string;
+  /** Type of component */
+  componentType: ComponentType;
+  /** The triggering mechanism to preserve */
+  coreIntent: string;
+  /** Original prompt */
+  basePrompt: string;
+  /** Index for ID generation */
+  index: number;
+}
+
 /**
  * Create a base scenario from a component.
  *
- * @param componentRef - Component reference (name)
- * @param componentType - Type of component
- * @param coreIntent - The triggering mechanism to preserve
- * @param basePrompt - Original prompt
- * @param index - Index for ID generation
+ * @param options - Named arguments; see {@link CreateBaseScenarioOptions}
  * @returns Base scenario
  */
 export function createBaseScenario(
-  componentRef: string,
-  componentType: ComponentType,
-  coreIntent: string,
-  basePrompt: string,
-  index: number,
+  options: CreateBaseScenarioOptions,
 ): BaseScenario {
+  const { componentRef, componentType, coreIntent, basePrompt, index } =
+    options;
   return {
     id: `${componentRef}-base-${String(index)}`,
     component_ref: componentRef,
diff --git a/src/stages/2-generation/index.ts b/src/stages/2-generation/index.ts
index 3dbaa3a..8ea5515 100644
--- a/src/stages/2-generation/index.ts
+++ b/src/stages/2-generation/index.ts
@@ -201,13 +201,13 @@ export async function runGeneration(
       componentType: "skills",
       components: analysis.components.skills,
       generator: async (onProgress) =>
-        generateAllSkillScenarios(
+        generateAllSkillScenarios({
           client,
-          analysis.components.skills,
-          config.generation,
+          skills: analysis.components.skills,
+          config: config.generation,
           onProgress,
-          config.max_concurrent,
-        ),
+          maxConcurrent: config.max_concurrent,
+        }),
       createFallback: createFallbackSkillScenarios,
       onProgress,
     });
@@ -220,13 +220,13 @@ export async function runGeneration(
       componentType: "agents",
       components: analysis.components.agents,
       generator: async (onProgress) =>
-        generateAllAgentScenarios(
+        generateAllAgentScenarios({
           client,
-          analysis.components.agents,
-          config.generation,
+          agents: analysis.components.agents,
+          config: config.generation,
           onProgress,
-          config.max_concurrent,
-        ),
+          maxConcurrent: config.max_concurrent,
+        }),
       createFallback: createFallbackAgentScenarios,
       onProgress,
     });
diff --git a/src/stages/2-generation/skill-scenario-generator.ts b/src/stages/2-generation/skill-scenario-generator.ts
index bb1985c..765549a 100644
--- a/src/stages/2-generation/skill-scenario-generator.ts
+++ b/src/stages/2-generation/skill-scenario-generator.ts
@@ -192,25 +192,35 @@ export async function generateSkillScenarios(
   return parseSkillScenarioResponse(response, skill);
 }
 
+/**
+ * Options for generateAllSkillScenarios.
+ */
+export interface GenerateAllSkillScenariosOptions {
+  /** Anthropic client */
+  client: Anthropic;
+  /** Array of skill components */
+  skills: SkillComponent[];
+  /** Generation config */
+  config: GenerationConfig;
+  /** Optional progress callback */
+  onProgress?: (completed: number, total: number, skill: string) => void;
+  /** Maximum concurrent LLM calls (defaults to 10) */
+  maxConcurrent?: number;
+}
+
 /**
  * Generate scenarios for all skills.
  *
  * Uses parallel execution with optional rate limiting.
  *
- * @param client - Anthropic client
- * @param skills - Array of skill components
- * @param config - Generation config
- * @param onProgress - Optional progress callback
- * @param maxConcurrent - Maximum concurrent LLM calls (defaults to 10)
+ * @param options - Named arguments; see {@link GenerateAllSkillScenariosOptions}
  * @returns Array of all test scenarios
  */
 export async function generateAllSkillScenarios(
-  client: Anthropic,
-  skills: SkillComponent[],
-  config: GenerationConfig,
-  onProgress?: (completed: number, total: number, skill: string) => void,
-  maxConcurrent = 10,
+  options: GenerateAllSkillScenariosOptions,
 ): Promise<TestScenario[]> {
+  const { client, skills, config, onProgress, maxConcurrent = 10 } = options;
+
   // Create rate limiter if configured
   const rateLimiter = setupRateLimiter(config);
 
diff --git a/src/stages/3-execution/index.ts b/src/stages/3-execution/index.ts
index 3f2ec5b..66d0b08 100644
--- a/src/stages/3-execution/index.ts
+++ b/src/stages/3-execution/index.ts
@@ -71,6 +71,22 @@ export interface ExecutionOutput {
   total_tools_captured: number;
 }
 
+/**
+ * Options for runExecution.
+ */
+export interface RunExecutionOptions {
+  /** Output from Stage 1 (plugin analysis) */
+  analysis: AnalysisOutput;
+  /** Output from Stage 2 (test scenarios) */
+  scenarios: TestScenario[];
+  /** Evaluation configuration */
+  config: EvalConfig;
+  /** Optional progress callbacks (defaults to consoleProgress) */
+  progress?: ProgressCallbacks;
+  /** Optional query function (for testing) */
+  queryFn?: QueryFunction;
+}
+
 /**
  * Run Stage 3: Execution.
  *
@@ -99,12 +115,16 @@ export interface ExecutionOutput {
  * ```
  */
 export async function runExecution(
-  analysis: AnalysisOutput,
-  scenarios: TestScenario[],
-  config: EvalConfig,
-  progress: ProgressCallbacks = consoleProgress,
-  queryFn?: QueryFunction,
+  options: RunExecutionOptions,
 ): Promise<ExecutionOutput> {
+  const {
+    analysis,
+    scenarios,
+    config,
+    progress = consoleProgress,
+    queryFn,
+  } = options;
+
   logger.stageHeader("Stage 3: Execution", scenarios.length);
 
   const pluginPath = config.plugin.path;
diff --git a/src/stages/3-execution/plugin-loader.ts b/src/stages/3-execution/plugin-loader.ts
index 3092631..118b2ca 100644
--- a/src/stages/3-execution/plugin-loader.ts
+++ b/src/stages/3-execution/plugin-loader.ts
@@ -94,14 +94,22 @@ export interface PluginLoaderOptions {
   enableMcpDiscovery?: boolean | undefined;
 }
 
+/**
+ * Options for buildPluginQueryInput.
+ */
+interface BuildPluginQueryInputOptions {
+  pluginPath: string;
+  config: ExecutionConfig;
+  settingSources: SettingSource[];
+  controller: AbortController;
+  startTime: number;
+}
+
 /** Build query input for plugin verification */
 function buildPluginQueryInput(
-  pluginPath: string,
-  config: ExecutionConfig,
-  settingSources: SettingSource[],
-  controller: AbortController,
-  startTime: number,
+  options: BuildPluginQueryInputOptions,
 ): QueryInput {
+  const { pluginPath, config, settingSources, controller, startTime } = options;
   return {
     prompt: "Plugin initialization check - respond with OK",
     options: {
@@ -223,13 +231,13 @@ export async function verifyPluginLoad(
   const settingSources: SettingSource[] = enableMcpDiscovery ? ["project"] : [];
 
   try {
-    const queryInput = buildPluginQueryInput(
+    const queryInput = buildPluginQueryInput({
       pluginPath,
       config,
       settingSources,
       controller,
       startTime,
-    );
+    });
     const q = queryFn ? queryFn(queryInput) : executeQuery(queryInput);
 
     return await processQueryMessages(q, pluginPath, timings);
diff --git a/src/stages/3-execution/progress-reporters.ts b/src/stages/3-execution/progress-reporters.ts
index 9faa420..0c72443 100644
--- a/src/stages/3-execution/progress-reporters.ts
+++ b/src/stages/3-execution/progress-reporters.ts
@@ -29,12 +29,12 @@ import type { OutputConfig, ProgressCallbacks } from "../../types/index.js";
  *
  * @example
  * ```typescript
- * const output = await runExecution(
+ * const output = await runExecution({
  *   analysis,
  *   scenarios,
  *   config,
- *   consoleProgress
- * );
+ *   progress: consoleProgress,
+ * });
  * ```
  */
 export const consoleProgress: ProgressCallbacks = {
@@ -64,12 +64,12 @@ export const consoleProgress: ProgressCallbacks = {
  *
  * @example
  * ```typescript
- * const output = await runExecution(
+ * const output = await runExecution({
  *   analysis,
  *   scenarios,
  *   config,
- *   verboseProgress
- * );
+ *   progress: verboseProgress,
+ * });
  * ```
  */
 export const verboseProgress: ProgressCallbacks = {
@@ -106,12 +106,12 @@ export const verboseProgress: ProgressCallbacks = {
  *
  * @example
  * ```typescript
- * const output = await runExecution(
+ * const output = await runExecution({
  *   analysis,
  *   scenarios,
  *   config,
- *   silentProgress
- * );
+ *   progress: silentProgress,
+ * });
  * ```
  */
 export const silentProgress: ProgressCallbacks = {
@@ -126,12 +126,12 @@ export const silentProgress: ProgressCallbacks = {
  *
  * @example
  * ```typescript
- * const output = await runExecution(
+ * const output = await runExecution({
  *   analysis,
  *   scenarios,
  *   config,
- *   jsonProgress
- * );
+ *   progress: jsonProgress,
+ * });
  * ```
  */
 export const jsonProgress: ProgressCallbacks = {
diff --git a/src/stages/4-evaluation/aggregation/scenario-results.ts b/src/stages/4-evaluation/aggregation/scenario-results.ts
index 5da8d40..2755d25 100644
--- a/src/stages/4-evaluation/aggregation/scenario-results.ts
+++ b/src/stages/4-evaluation/aggregation/scenario-results.ts
@@ -17,25 +17,42 @@ import type { calculateConflictSeverity } from "../conflict-tracker.js";
 import type { ProgrammaticResult, ScenarioEvaluationResult } from "./types.js";
 import type { getUniqueDetections } from "../detection/index.js";
 
+/**
+ * Options for buildEvaluationResult.
+ */
+export interface BuildEvaluationResultOptions {
+  /** Test scenario being evaluated */
+  scenario: TestScenario;
+  /** Whether component was triggered */
+  triggered: boolean;
+  /** Unique component detections */
+  uniqueDetections: ReturnType<typeof getUniqueDetections>;
+  /** Conflict analysis result */
+  conflictAnalysis: ReturnType<typeof calculateConflictSeverity>;
+  /** LLM judgment result (null if programmatic-only) */
+  judgment: MultiSampleResult | null;
+  /** How the trigger was detected */
+  detectionSource: DetectionSource;
+}
+
 /**
  * Build the evaluation result object.
  *
- * @param scenario - Test scenario being evaluated
- * @param triggered - Whether component was triggered
- * @param uniqueDetections - Unique component detections
- * @param conflictAnalysis - Conflict analysis result
- * @param judgment - LLM judgment result (null if programmatic-only)
- * @param detectionSource - How the trigger was detected
+ * @param options - Named arguments; see {@link BuildEvaluationResultOptions}
  * @returns Complete evaluation result
  */
 export function buildEvaluationResult(
-  scenario: TestScenario,
-  triggered: boolean,
-  uniqueDetections: ReturnType<typeof getUniqueDetections>,
-  conflictAnalysis: ReturnType<typeof calculateConflictSeverity>,
-  judgment: MultiSampleResult | null,
-  detectionSource: DetectionSource,
+  options: BuildEvaluationResultOptions,
 ): EvaluationResult {
+  const {
+    scenario,
+    triggered,
+    uniqueDetections,
+    conflictAnalysis,
+    judgment,
+    detectionSource,
+  } = options;
+
   const allTriggeredComponents: TriggeredComponent[] = uniqueDetections.map(
     (d) => ({
       component_type: d.component_type,
@@ -94,14 +111,14 @@ export function buildFinalResult(
     judgeStrategy,
   } = programmatic;
 
-  const result = buildEvaluationResult(
-    context.scenario,
+  const result = buildEvaluationResult({
+    scenario: context.scenario,
     triggered,
     uniqueDetections,
     conflictAnalysis,
     judgment,
-    judgeStrategy.detectionSource,
-  );
+    detectionSource: judgeStrategy.detectionSource,
+  });
 
   const variance = judgment?.score_variance ?? 0;
   const isUnanimous = judgment?.is_unanimous ?? true;
diff --git a/src/stages/4-evaluation/detection/capture-detection.ts b/src/stages/4-evaluation/detection/capture-detection.ts
index 0ea061d..39a9164 100644
--- a/src/stages/4-evaluation/detection/capture-detection.ts
+++ b/src/stages/4-evaluation/detection/capture-detection.ts
@@ -22,16 +22,25 @@ interface ToolCallLike {
   input: unknown;
 }
 
+/**
+ * Options for createDetection.
+ */
+interface CreateDetectionOptions {
+  componentType: ComponentType;
+  componentName: string;
+  toolName: string;
+  evidence: string;
+  timestamp: number;
+}
+
 /**
  * Create a detection from a tool call.
  */
 function createDetection(
-  componentType: ComponentType,
-  componentName: string,
-  toolName: string,
-  evidence: string,
-  timestamp: number,
+  options: CreateDetectionOptions,
 ): ProgrammaticDetection {
+  const { componentType, componentName, toolName, evidence, timestamp } =
+    options;
   return {
     component_type: componentType,
     component_name: componentName,
@@ -53,13 +62,13 @@ function processSkillTool(
   if (!isSkillInput(tc.input)) {
     return null;
   }
-  return createDetection(
-    "skill",
-    tc.input.skill,
-    tc.name,
-    `Skill tool invoked: ${tc.input.skill}${evidenceSuffix}`,
+  return createDetection({
+    componentType: "skill",
+    componentName: tc.input.skill,
+    toolName: tc.name,
+    evidence: `Skill tool invoked: ${tc.input.skill}${evidenceSuffix}`,
     timestamp,
-  );
+  });
 }
 
 /**
@@ -73,13 +82,13 @@ function processTaskTool(
   if (!isTaskInput(tc.input)) {
     return null;
   }
-  return createDetection(
-    "agent",
-    tc.input.subagent_type,
-    tc.name,
-    `Task tool invoked: ${tc.input.subagent_type}${evidenceSuffix}`,
+  return createDetection({
+    componentType: "agent",
+    componentName: tc.input.subagent_type,
+    toolName: tc.name,
+    evidence: `Task tool invoked: ${tc.input.subagent_type}${evidenceSuffix}`,
     timestamp,
-  );
+  });
 }
 
 /**
@@ -93,13 +102,13 @@ function processCommandTool(
   if (!isSkillInput(tc.input)) {
     return null;
   }
-  return createDetection(
-    "command",
-    tc.input.skill,
-    tc.name,
-    `SlashCommand invoked: ${tc.input.skill}${evidenceSuffix}`,
+  return createDetection({
+    componentType: "command",
+    componentName: tc.input.skill,
+    toolName: tc.name,
+    evidence: `SlashCommand invoked: ${tc.input.skill}${evidenceSuffix}`,
     timestamp,
-  );
+  });
 }
 
 /**
@@ -114,13 +123,13 @@ function processMcpTool(
   if (!parsed) {
     return null;
   }
-  return createDetection(
-    "mcp_server",
-    parsed.serverName,
-    tc.name,
-    `MCP tool invoked: ${tc.name} (server: ${parsed.serverName}, tool: ${parsed.toolName})${evidenceSuffix}`,
+  return createDetection({
+    componentType: "mcp_server",
+    componentName: parsed.serverName,
+    toolName: tc.name,
+    evidence: `MCP tool invoked: ${tc.name} (server: ${parsed.serverName}, tool: ${parsed.toolName})${evidenceSuffix}`,
     timestamp,
-  );
+  });
 }
 
 /**
diff --git a/src/stages/4-evaluation/detection/orchestrator.ts b/src/stages/4-evaluation/detection/orchestrator.ts
index 532f0fa..cd7b6c7 100644
--- a/src/stages/4-evaluation/detection/orchestrator.ts
+++ b/src/stages/4-evaluation/detection/orchestrator.ts
@@ -29,6 +29,22 @@ import type {
   Transcript,
 } from "../../../types/index.js";
 
+/**
+ * Options for detectAllComponentsWithHooks.
+ */
+export interface DetectAllComponentsWithHooksOptions {
+  /** Tool captures from execution */
+  captures: ToolCapture[];
+  /** Execution transcript */
+  transcript: Transcript;
+  /** Test scenario */
+  scenario: TestScenario;
+  /** Optional hook response captures */
+  hookResponses?: HookResponseCapture[];
+  /** Optional subagent lifecycle captures */
+  subagentCaptures?: SubagentCapture[];
+}
+
 /**
  * Detect all components using all detection methods.
  *
@@ -108,12 +124,11 @@ export function detectAllComponents(
  * @returns Array of all detected components including hooks, agents, and MCP servers
  */
 export function detectAllComponentsWithHooks(
-  captures: ToolCapture[],
-  transcript: Transcript,
-  scenario: TestScenario,
-  hookResponses?: HookResponseCapture[],
-  subagentCaptures?: SubagentCapture[],
+  options: DetectAllComponentsWithHooksOptions,
 ): ProgrammaticDetection[] {
+  const { captures, transcript, scenario, hookResponses, subagentCaptures } =
+    options;
+
   // Get standard component detections (now includes MCP servers)
   const detections = detectAllComponents(captures, transcript, scenario);
 
diff --git a/src/stages/4-evaluation/index.ts b/src/stages/4-evaluation/index.ts
index 2660c69..70e0166 100644
--- a/src/stages/4-evaluation/index.ts
+++ b/src/stages/4-evaluation/index.ts
@@ -69,6 +69,68 @@ import type {
 } from "../../types/index.js";
 import type Anthropic from "@anthropic-ai/sdk";
 
+/**
+ * Sample data entry for multi-sampling metrics.
+ */
+interface SampleDataEntry {
+  scenarioId: string;
+  variance: number;
+  numSamples: number;
+  hasConsensus: boolean;
+}
+
+/**
+ * Options for runSynchronousEvaluation.
+ */
+interface RunSynchronousEvaluationOptions {
+  /** Anthropic client */
+  client: Anthropic;
+  /** Programmatic detection results */
+  programmaticResults: ProgrammaticResult[];
+  /** Evaluation configuration */
+  config: EvalConfig;
+  /** Progress callbacks */
+  progress: ProgressCallbacks;
+  /** Sample data array (mutated during evaluation) */
+  sampleData: SampleDataEntry[];
+}
+
+/**
+ * Options for calculateAndSaveMetrics.
+ */
+interface CalculateAndSaveMetricsOptions {
+  /** Plugin name */
+  pluginName: string;
+  /** Results with scenario and execution context */
+  resultsWithContext: {
+    result: EvaluationResult;
+    scenario: TestScenario;
+    execution: ExecutionResult;
+  }[];
+  /** Execution results */
+  executions: ExecutionResult[];
+  /** Evaluation configuration */
+  config: EvalConfig;
+  /** Sample data for multi-sampling metrics */
+  sampleData: SampleDataEntry[];
+}
+
+/**
+ * Options for runEvaluation.
+ */
+export interface RunEvaluationOptions {
+  /** Plugin name */
+  pluginName: string;
+  /** Test scenarios */
+  scenarios: TestScenario[];
+  /** Execution results */
+  executions: ExecutionResult[];
+  /** Evaluation configuration */
+  config: EvalConfig;
+  /** Optional progress callbacks (defaults to `{}`, i.e. no callbacks) */
+  progress?: ProgressCallbacks;
+}
+
 /**
  * Output from Stage 4: Evaluation.
  */
@@ -127,13 +189,17 @@ function runProgrammaticDetection(
   // Otherwise use the simpler detectAllComponents
   const detections =
     scenario.component_type === "hook" || scenario.component_type === "agent"
-      ? detectAllComponentsWithHooks(
-          execution.detected_tools,
-          execution.transcript,
+      ? detectAllComponentsWithHooks({
+          captures: execution.detected_tools,
+          transcript: execution.transcript,
           scenario,
-          execution.hook_responses,
-          execution.subagent_captures,
-        )
+          ...(execution.hook_responses !== undefined && {
+            hookResponses: execution.hook_responses,
+          }),
+          ...(execution.subagent_captures !== undefined && {
+            subagentCaptures: execution.subagent_captures,
+          }),
+        })
       : detectAllComponents(
           execution.detected_tools,
           execution.transcript,
@@ -256,17 +322,9 @@ async function runBatchedEvaluation(
  * Run synchronous LLM evaluation (original behavior).
  */
 async function runSynchronousEvaluation(
-  client: Anthropic,
-  programmaticResults: ProgrammaticResult[],
-  config: EvalConfig,
-  progress: ProgressCallbacks,
-  sampleData: {
-    scenarioId: string;
-    variance: number;
-    numSamples: number;
-    hasConsensus: boolean;
-  }[],
+  options: RunSynchronousEvaluationOptions,
 ): Promise<ScenarioEvaluationResult[]> {
+  const { client, programmaticResults, config, progress, sampleData } = options;
   const evalConfig = config.evaluation;
 
   const parallelResult = await parallel<
@@ -280,13 +338,13 @@ async function runSynchronousEvaluation(
 
       if (pr.judgeStrategy.needsLLMJudge) {
         try {
-          judgment = await runJudgment(
+          judgment = await runJudgment({
             client,
-            pr.context.scenario,
-            pr.context.execution.transcript,
-            pr.uniqueDetections,
-            evalConfig,
-          );
+            scenario: pr.context.scenario,
+            transcript: pr.context.execution.transcript,
+            programmaticResult: pr.uniqueDetections,
+            config: evalConfig,
+          });
         } catch (err) {
           const errorResponse = createErrorJudgeResponse(
             formatErrorWithRequestId(err),
@@ -348,21 +406,11 @@ async function runSynchronousEvaluation(
  * @returns Calculated metrics
  */
 async function calculateAndSaveMetrics(
-  pluginName: string,
-  resultsWithContext: {
-    result: EvaluationResult;
-    scenario: TestScenario;
-    execution: ExecutionResult;
-  }[],
-  executions: ExecutionResult[],
-  config: EvalConfig,
-  sampleData: {
-    scenarioId: string;
-    variance: number;
-    numSamples: number;
-    hasConsensus: boolean;
-  }[],
+  options: CalculateAndSaveMetricsOptions,
 ): Promise<EvalMetrics> {
+  const { pluginName, resultsWithContext, executions, config, sampleData } =
+    options;
+
   // Build metrics options
   const metricsOptions: {
     numSamples?: number;
@@ -406,12 +454,10 @@ async function calculateAndSaveMetrics(
  * @returns Evaluation output
  */
 export async function runEvaluation(
-  pluginName: string,
-  scenarios: TestScenario[],
-  executions: ExecutionResult[],
-  config: EvalConfig,
-  progress: ProgressCallbacks = {},
+  options: RunEvaluationOptions,
 ): Promise<EvaluationOutput> {
+  const { pluginName, scenarios, executions, config, progress = {} } = options;
+
   logger.stageHeader("Stage 4: Evaluation", executions.length);
 
   const startTime = Date.now();
@@ -470,12 +516,7 @@ export async function runEvaluation(
   });
 
   // Track sample data for metrics
-  const sampleData: {
-    scenarioId: string;
-    variance: number;
-    numSamples: number;
-    hasConsensus: boolean;
-  }[] = [];
+  const sampleData: SampleDataEntry[] = [];
 
   let evalResults: ScenarioEvaluationResult[];
 
@@ -506,13 +547,13 @@ export async function runEvaluation(
     );
 
     // Phase 2b: Run synchronous LLM evaluation
-    evalResults = await runSynchronousEvaluation(
+    evalResults = await runSynchronousEvaluation({
       client,
       programmaticResults,
       config,
       progress,
       sampleData,
-    );
+    });
   }
 
   const results = evalResults.map((r) => r.result);
@@ -536,13 +577,13 @@ export async function runEvaluation(
   });
 
   // Calculate metrics and save results
-  const metrics = await calculateAndSaveMetrics(
+  const metrics = await calculateAndSaveMetrics({
     pluginName,
     resultsWithContext,
     executions,
     config,
     sampleData,
-  );
+  });
 
   const totalDuration = Date.now() - startTime;
 
diff --git a/src/stages/4-evaluation/llm-judge.ts b/src/stages/4-evaluation/llm-judge.ts
index f14b19a..2c77700 100644
--- a/src/stages/4-evaluation/llm-judge.ts
+++ b/src/stages/4-evaluation/llm-judge.ts
@@ -24,6 +24,22 @@ import type {
 } from "../../types/index.js";
 import type Anthropic from "@anthropic-ai/sdk";
 
+/**
+ * Options shared by evaluateWithLLMJudge, evaluateWithFallback and evaluateWithJsonFallback.
+ */
+export interface EvaluateJudgeOptions {
+  /** Anthropic SDK client */
+  client: Anthropic;
+  /** Test scenario being judged; passed to buildJudgePrompt */
+  scenario: TestScenario;
+  /** Execution transcript; passed to buildJudgePrompt */
+  transcript: Transcript;
+  /** Programmatic detection results for the scenario */
+  programmaticResult: ProgrammaticDetection[];
+  /** Evaluation configuration */
+  config: EvaluationConfig;
+}
+
 /**
  * Judge response schema for structured output.
  *
@@ -268,12 +284,10 @@ export function buildJudgePrompt(
  * ```
  */
 export async function evaluateWithLLMJudge(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
+  options: EvaluateJudgeOptions,
 ): Promise<JudgeResponse> {
+  const { client, scenario, transcript, programmaticResult, config } = options;
+
   const userPrompt = buildJudgePrompt(
     scenario,
     transcript,
@@ -336,30 +350,14 @@ export async function evaluateWithLLMJudge(
  * @returns Judge response
  */
 export async function evaluateWithFallback(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
+  options: EvaluateJudgeOptions,
 ): Promise<JudgeResponse> {
   try {
     // Try structured output first
-    return await evaluateWithLLMJudge(
-      client,
-      scenario,
-      transcript,
-      programmaticResult,
-      config,
-    );
+    return await evaluateWithLLMJudge(options); // await keeps a rejection inside this try so the catch can fall back
   } catch {
     // Fallback to regular JSON parsing
-    return evaluateWithJsonFallback(
-      client,
-      scenario,
-      transcript,
-      programmaticResult,
-      config,
-    );
+    return evaluateWithJsonFallback(options); // same options object, forwarded unchanged
   }
 }
 
@@ -395,12 +393,10 @@ No markdown, no explanation - just the JSON.`;
  * @returns Judge response
  */
 async function evaluateWithJsonFallback(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
+  options: EvaluateJudgeOptions,
 ): Promise<JudgeResponse> {
+  const { client, scenario, transcript, programmaticResult, config } = options;
+
   const userPrompt = buildJudgePrompt(
     scenario,
     transcript,
diff --git a/src/stages/4-evaluation/multi-sampler.ts b/src/stages/4-evaluation/multi-sampler.ts
index c7ab7df..b0cac6e 100644
--- a/src/stages/4-evaluation/multi-sampler.ts
+++ b/src/stages/4-evaluation/multi-sampler.ts
@@ -27,6 +27,30 @@ import type {
 } from "../../types/index.js";
 import type Anthropic from "@anthropic-ai/sdk";
 
+/**
+ * Options for evaluateSingleSample and runJudgment (same fields as EvaluateJudgeOptions).
+ */
+export interface EvaluateMultiSampleOptions {
+  /** Anthropic SDK client */
+  client: Anthropic;
+  /** Test scenario being judged */
+  scenario: TestScenario;
+  /** Execution transcript for the scenario */
+  transcript: Transcript;
+  /** Programmatic detection results for the scenario */
+  programmaticResult: ProgrammaticDetection[];
+  /** Evaluation configuration; runJudgment reads `num_samples` to pick the single or multi-sample path */
+  config: EvaluationConfig;
+}
+
+/**
+ * Options for evaluateWithMultiSampling: the shared judge options plus a concurrency cap.
+ */
+export interface EvaluateWithMultiSamplingOptions extends EvaluateMultiSampleOptions {
+  /** Max concurrent judge calls (default: DEFAULT_MULTI_SAMPLE_CONCURRENCY, 10); capped at the sample count */
+  maxConcurrent?: number;
+}
+
 /**
  * Aggregate scores using the specified method.
  *
@@ -171,13 +195,17 @@ const DEFAULT_MULTI_SAMPLE_CONCURRENCY = 10;
  * ```
  */
 export async function evaluateWithMultiSampling(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
-  maxConcurrent: number = DEFAULT_MULTI_SAMPLE_CONCURRENCY,
+  options: EvaluateWithMultiSamplingOptions,
 ): Promise<MultiSampleResult> {
+  const {
+    client,
+    scenario,
+    transcript,
+    programmaticResult,
+    config,
+    maxConcurrent = DEFAULT_MULTI_SAMPLE_CONCURRENCY,
+  } = options;
+
   const numSamples = config.num_samples || 1;
 
   // Run judge multiple times in parallel for improved performance
@@ -186,13 +214,13 @@ export async function evaluateWithMultiSampling(
     items: Array.from({ length: numSamples }),
     concurrency: Math.min(numSamples, maxConcurrent),
     fn: async () =>
-      evaluateWithFallback(
+      evaluateWithFallback({
         client,
         scenario,
         transcript,
         programmaticResult,
         config,
-      ),
+      }),
     continueOnError: false,
   });
   const responses = result.results;
@@ -302,19 +330,17 @@ export function getConfidenceLevel(
  * @returns Multi-sample result with single sample
  */
 export async function evaluateSingleSample(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
+  options: EvaluateMultiSampleOptions,
 ): Promise<MultiSampleResult> {
-  const response = await evaluateWithFallback(
+  const { client, scenario, transcript, programmaticResult, config } = options;
+
+  const response = await evaluateWithFallback({
     client,
     scenario,
     transcript,
     programmaticResult,
     config,
-  );
+  });
 
   return {
     individual_scores: [response.quality_score],
@@ -338,27 +364,11 @@ export async function evaluateSingleSample(
  * @returns Multi-sample result
  */
 export async function runJudgment(
-  client: Anthropic,
-  scenario: TestScenario,
-  transcript: Transcript,
-  programmaticResult: ProgrammaticDetection[],
-  config: EvaluationConfig,
+  options: EvaluateMultiSampleOptions,
 ): Promise<MultiSampleResult> {
-  if (config.num_samples <= 1) {
-    return evaluateSingleSample(
-      client,
-      scenario,
-      transcript,
-      programmaticResult,
-      config,
-    );
+  if (options.config.num_samples <= 1) { // at most one sample requested: take the single-sample path
+    return evaluateSingleSample(options); // options forwarded unchanged
   }
 
-  return evaluateWithMultiSampling(
-    client,
-    scenario,
-    transcript,
-    programmaticResult,
-    config,
-  );
+  return evaluateWithMultiSampling(options); // this options type has no maxConcurrent, so the callee's default applies
 }
diff --git a/tests/integration/stages/4-evaluation/index.test.ts b/tests/integration/stages/4-evaluation/index.test.ts
index f128763..819dfbf 100644
--- a/tests/integration/stages/4-evaluation/index.test.ts
+++ b/tests/integration/stages/4-evaluation/index.test.ts
@@ -536,7 +536,12 @@ describe("Stage 4: Evaluation Integration", () => {
 
     it("handles empty execution results gracefully", async () => {
       const config = createTestConfig();
-      const output = await runEvaluation("test-plugin", [], [], config);
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios: [],
+        executions: [],
+        config,
+      });
 
       expect(output.plugin_name).toBe("test-plugin");
       expect(output.results).toHaveLength(0);
@@ -580,12 +585,12 @@ describe("Stage 4: Evaluation Integration", () => {
       // Use programmatic detection only (skip LLM for true negatives)
       config.evaluation.detection_mode = "programmatic_first";
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.plugin_name).toBe("test-plugin");
       expect(output.results).toHaveLength(2);
@@ -640,12 +645,12 @@ describe("Stage 4: Evaluation Integration", () => {
       ];
 
       const config = createTestConfig();
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       const result = output.results[0];
       expect(result?.triggered).toBe(true);
@@ -686,13 +691,13 @@ describe("Stage 4: Evaluation Integration", () => {
         onError: vi.fn(),
       };
 
-      await runEvaluation(
-        "test-plugin",
+      await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
         progress,
-      );
+      });
 
       expect(progress.onStageStart).toHaveBeenCalledWith("evaluation", 1);
       expect(progress.onStageComplete).toHaveBeenCalledWith(
@@ -721,12 +726,12 @@ describe("Stage 4: Evaluation Integration", () => {
       ];
 
       const config = createTestConfig();
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       // Should produce no results since scenario not found
       expect(output.results).toHaveLength(0);
diff --git a/tests/unit/stages/2-generation/agent-scenario-generator.test.ts b/tests/unit/stages/2-generation/agent-scenario-generator.test.ts
index ccf13cc..6e2c65f 100644
--- a/tests/unit/stages/2-generation/agent-scenario-generator.test.ts
+++ b/tests/unit/stages/2-generation/agent-scenario-generator.test.ts
@@ -649,11 +649,11 @@ describe("generateAllAgentScenarios", () => {
         ],
       });
 
-    const scenarios = await generateAllAgentScenarios(
-      mockClient as unknown as Anthropic,
+    const scenarios = await generateAllAgentScenarios({
+      client: mockClient as unknown as Anthropic,
       agents,
       config,
-    );
+    });
 
     expect(mockClient.messages.create).toHaveBeenCalledTimes(2);
     expect(scenarios).toHaveLength(2);
@@ -667,12 +667,12 @@ describe("generateAllAgentScenarios", () => {
     });
 
     const progressCallback = vi.fn();
-    await generateAllAgentScenarios(
-      mockClient as unknown as Anthropic,
+    await generateAllAgentScenarios({
+      client: mockClient as unknown as Anthropic,
       agents,
       config,
-      progressCallback,
-    );
+      onProgress: progressCallback,
+    });
 
     // Called once per completed agent (parallel execution)
     expect(progressCallback).toHaveBeenCalledTimes(2);
@@ -681,11 +681,11 @@ describe("generateAllAgentScenarios", () => {
   });
 
   it("should return empty array for empty agents list", async () => {
-    const scenarios = await generateAllAgentScenarios(
-      mockClient as unknown as Anthropic,
-      [],
+    const scenarios = await generateAllAgentScenarios({
+      client: mockClient as unknown as Anthropic,
+      agents: [],
       config,
-    );
+    });
 
     expect(scenarios).toEqual([]);
     expect(mockClient.messages.create).not.toHaveBeenCalled();
@@ -731,11 +731,11 @@ describe("generateAllAgentScenarios", () => {
         ],
       });
 
-    const scenarios = await generateAllAgentScenarios(
-      mockClient as unknown as Anthropic,
+    const scenarios = await generateAllAgentScenarios({
+      client: mockClient as unknown as Anthropic,
       agents,
       config,
-    );
+    });
 
     expect(scenarios).toHaveLength(3);
     expect(scenarios[1].setup_messages).toBeDefined();
diff --git a/tests/unit/stages/2-generation/cost-estimator.test.ts b/tests/unit/stages/2-generation/cost-estimator.test.ts
index a1f73f5..befa422 100644
--- a/tests/unit/stages/2-generation/cost-estimator.test.ts
+++ b/tests/unit/stages/2-generation/cost-estimator.test.ts
@@ -558,11 +558,11 @@ describe("countPromptTokens", () => {
   it("should count tokens for a prompt", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 150 });
 
-    const count = await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "haiku",
-      "Test prompt content",
-    );
+    const count = await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "haiku",
+      prompt: "Test prompt content",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(1);
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
@@ -578,11 +578,11 @@ describe("countPromptTokens", () => {
   it("should resolve model shorthand before counting", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 });
 
-    await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "sonnet",
-      "Test",
-    );
+    await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "sonnet",
+      prompt: "Test",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
       {
@@ -597,11 +597,11 @@ describe("countPromptTokens", () => {
     const longPrompt = "word ".repeat(1000);
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 5000 });
 
-    const count = await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "haiku",
-      longPrompt,
-    );
+    const count = await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "haiku",
+      prompt: longPrompt,
+    });
 
     expect(count).toBe(5000);
   });
@@ -609,13 +609,12 @@ describe("countPromptTokens", () => {
   it("should include system prompt in token count when provided as string", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 650 });
 
-    const count = await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "haiku",
-      "Test prompt",
-      undefined, // timeout
-      "You are a helpful assistant.",
-    );
+    const count = await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "haiku",
+      prompt: "Test prompt",
+      system: "You are a helpful assistant.",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
       {
@@ -640,13 +639,12 @@ describe("countPromptTokens", () => {
       },
     ];
 
-    const count = await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "haiku",
-      "Test prompt",
-      undefined, // timeout
-      systemPromptArray,
-    );
+    const count = await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "haiku",
+      prompt: "Test prompt",
+      system: systemPromptArray,
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
       {
@@ -662,11 +660,11 @@ describe("countPromptTokens", () => {
   it("should not include system in request when not provided", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 });
 
-    await countPromptTokens(
-      mockClient as unknown as Anthropic,
-      "haiku",
-      "Test prompt",
-    );
+    await countPromptTokens({
+      client: mockClient as unknown as Anthropic,
+      model: "haiku",
+      prompt: "Test prompt",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
       {
@@ -699,11 +697,11 @@ describe("estimateGenerationCost (async)", () => {
       .mockResolvedValueOnce({ input_tokens: 150 })
       .mockResolvedValueOnce({ input_tokens: 200 });
 
-    const estimate = await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["prompt1", "prompt2", "prompt3"],
-      "haiku",
-    );
+    const estimate = await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["prompt1", "prompt2", "prompt3"],
+      model: "haiku",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(3);
     expect(estimate.stage).toBe("generation");
@@ -713,11 +711,11 @@ describe("estimateGenerationCost (async)", () => {
   });
 
   it("should return zero cost for empty prompts", async () => {
-    const estimate = await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      [],
-      "haiku",
-    );
+    const estimate = await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: [],
+      model: "haiku",
+    });
 
     expect(mockClient.messages.countTokens).not.toHaveBeenCalled();
     expect(estimate.input_tokens).toBe(0);
@@ -728,20 +726,20 @@ describe("estimateGenerationCost (async)", () => {
   it("should use correct model for cost calculation", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 1000 });
 
-    const haikuEstimate = await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["test"],
-      "haiku",
-    );
+    const haikuEstimate = await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["test"],
+      model: "haiku",
+    });
 
     mockClient.messages.countTokens.mockClear();
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 1000 });
 
-    const opusEstimate = await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["test"],
-      "opus",
-    );
+    const opusEstimate = await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["test"],
+      model: "opus",
+    });
 
     // Opus should be more expensive than Haiku
     expect(opusEstimate.estimated_cost_usd).toBeGreaterThan(
@@ -752,11 +750,11 @@ describe("estimateGenerationCost (async)", () => {
   it("should call countTokens for each prompt", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 50 });
 
-    await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["prompt A", "prompt B"],
-      "haiku",
-    );
+    await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["prompt A", "prompt B"],
+      model: "haiku",
+    });
 
     expect(mockClient.messages.countTokens).toHaveBeenCalledWith(
       {
@@ -781,11 +779,11 @@ describe("estimateGenerationCost (async)", () => {
       .mockResolvedValueOnce({ input_tokens: 200 });
 
     await expect(
-      estimateGenerationCost(
-        mockClient as unknown as Anthropic,
-        ["prompt1", "prompt2", "prompt3"],
-        "haiku",
-      ),
+      estimateGenerationCost({
+        client: mockClient as unknown as Anthropic,
+        prompts: ["prompt1", "prompt2", "prompt3"],
+        model: "haiku",
+      }),
     ).rejects.toThrow("API error");
   });
 
@@ -793,14 +791,13 @@ describe("estimateGenerationCost (async)", () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 500 });
     const systemPrompt = "You are a test generator.";
 
-    await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["prompt1", "prompt2"],
-      "haiku",
-      5,
-      undefined,
-      systemPrompt,
-    );
+    await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["prompt1", "prompt2"],
+      model: "haiku",
+      concurrency: 5,
+      system: systemPrompt,
+    });
 
     // Verify all calls include the system prompt
     expect(mockClient.messages.countTokens).toHaveBeenCalledTimes(2);
@@ -825,11 +822,11 @@ describe("estimateGenerationCost (async)", () => {
   it("should not include system when not provided", async () => {
     mockClient.messages.countTokens.mockResolvedValue({ input_tokens: 100 });
 
-    await estimateGenerationCost(
-      mockClient as unknown as Anthropic,
-      ["prompt"],
-      "haiku",
-    );
+    await estimateGenerationCost({
+      client: mockClient as unknown as Anthropic,
+      prompts: ["prompt"],
+      model: "haiku",
+    });
 
     const callArgs = mockClient.messages.countTokens.mock.calls[0][0];
     expect(callArgs).not.toHaveProperty("system");
diff --git a/tests/unit/stages/2-generation/diversity-manager.test.ts b/tests/unit/stages/2-generation/diversity-manager.test.ts
index 888435b..797146a 100644
--- a/tests/unit/stages/2-generation/diversity-manager.test.ts
+++ b/tests/unit/stages/2-generation/diversity-manager.test.ts
@@ -263,13 +263,13 @@ describe("calculateDiversityMetrics", () => {
 
 describe("createBaseScenario", () => {
   it("should create a base scenario with correct properties", () => {
-    const result = createBaseScenario(
-      "test-skill",
-      "skill",
-      "create a hook",
-      "I want to create a hook",
-      0,
-    );
+    const result = createBaseScenario({
+      componentRef: "test-skill",
+      componentType: "skill",
+      coreIntent: "create a hook",
+      basePrompt: "I want to create a hook",
+      index: 0,
+    });
 
     expect(result.id).toBe("test-skill-base-0");
     expect(result.component_ref).toBe("test-skill");
@@ -281,28 +281,30 @@ describe("createBaseScenario", () => {
 
 describe("baseToTestScenario", () => {
   it("should convert base scenario to test scenario", () => {
-    const base = createBaseScenario(
-      "test-skill",
-      "skill",
-      "create a hook",
-      "I want to create a hook",
-      0,
-    );
+    const base = createBaseScenario({
+      componentRef: "test-skill",
+      componentType: "skill",
+      coreIntent: "create a hook",
+      basePrompt: "I want to create a hook",
+      index: 0,
+    });
 
     const result = baseToTestScenario(base, "direct", true, "Test reasoning");
 
     expect(result.id).toBe("test-skill-base-0");
-    expect(result.component_ref).toBe("test-skill");
-    expect(result.component_type).toBe("skill");
     expect(result.scenario_type).toBe("direct");
-    expect(result.user_prompt).toBe("I want to create a hook");
     expect(result.expected_trigger).toBe(true);
-    expect(result.expected_component).toBe("test-skill");
     expect(result.reasoning).toBe("Test reasoning");
   });
 
   it("should omit reasoning when not provided", () => {
-    const base = createBaseScenario("test-skill", "skill", "test", "test", 0);
+    const base = createBaseScenario({
+      componentRef: "test-skill",
+      componentType: "skill",
+      coreIntent: "test",
+      basePrompt: "test",
+      index: 0,
+    });
 
     const result = baseToTestScenario(base, "direct", true);
 
diff --git a/tests/unit/stages/2-generation/skill-scenario-generator.test.ts b/tests/unit/stages/2-generation/skill-scenario-generator.test.ts
index f07f8e7..600d0bd 100644
--- a/tests/unit/stages/2-generation/skill-scenario-generator.test.ts
+++ b/tests/unit/stages/2-generation/skill-scenario-generator.test.ts
@@ -565,11 +565,11 @@ describe("generateAllSkillScenarios", () => {
         ],
       });
 
-    const scenarios = await generateAllSkillScenarios(
-      mockClient as unknown as Anthropic,
+    const scenarios = await generateAllSkillScenarios({
+      client: mockClient as unknown as Anthropic,
       skills,
       config,
-    );
+    });
 
     expect(mockClient.messages.create).toHaveBeenCalledTimes(2);
     expect(scenarios).toHaveLength(2);
@@ -583,12 +583,12 @@ describe("generateAllSkillScenarios", () => {
     });
 
     const progressCallback = vi.fn();
-    await generateAllSkillScenarios(
-      mockClient as unknown as Anthropic,
+    await generateAllSkillScenarios({
+      client: mockClient as unknown as Anthropic,
       skills,
       config,
-      progressCallback,
-    );
+      onProgress: progressCallback,
+    });
 
     // Called once per completed skill (parallel execution)
     expect(progressCallback).toHaveBeenCalledTimes(2);
@@ -597,11 +597,11 @@ describe("generateAllSkillScenarios", () => {
   });
 
   it("should return empty array for empty skills list", async () => {
-    const scenarios = await generateAllSkillScenarios(
-      mockClient as unknown as Anthropic,
-      [],
+    const scenarios = await generateAllSkillScenarios({
+      client: mockClient as unknown as Anthropic,
+      skills: [],
       config,
-    );
+    });
 
     expect(scenarios).toEqual([]);
     expect(mockClient.messages.create).not.toHaveBeenCalled();
@@ -646,11 +646,11 @@ describe("generateAllSkillScenarios", () => {
         ],
       });
 
-    const scenarios = await generateAllSkillScenarios(
-      mockClient as unknown as Anthropic,
+    const scenarios = await generateAllSkillScenarios({
+      client: mockClient as unknown as Anthropic,
       skills,
       config,
-    );
+    });
 
     expect(scenarios).toHaveLength(3);
   });
diff --git a/tests/unit/stages/3-execution/index.test.ts b/tests/unit/stages/3-execution/index.test.ts
index 0272553..8268da3 100644
--- a/tests/unit/stages/3-execution/index.test.ts
+++ b/tests/unit/stages/3-execution/index.test.ts
@@ -273,7 +273,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig();
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(verifyPluginLoad).toHaveBeenCalledWith({
         pluginPath: config.plugin.path,
@@ -298,7 +298,7 @@ describe("runExecution", () => {
       ];
       const config = createConfig();
 
-      const result = await runExecution(analysis, scenarios, config);
+      const result = await runExecution({ analysis, scenarios, config });
 
       expect(result.results).toHaveLength(0);
       expect(result.error_count).toBe(2); // All scenarios failed
@@ -331,7 +331,7 @@ describe("runExecution", () => {
         scope: { ...createConfig().scope, mcp_servers: true },
       });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       // The parallel mock receives the filtered scenarios
       expect(parallel).toHaveBeenCalled();
@@ -353,7 +353,7 @@ describe("runExecution", () => {
       const analysis = createAnalysis();
       const config = createConfig();
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(resolveExecutionStrategy).toHaveBeenCalledWith(
         config.execution,
@@ -379,7 +379,7 @@ describe("runExecution", () => {
         scenarios,
       });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(resolveExecutionStrategy).toHaveBeenCalledWith(
         config.execution,
@@ -408,7 +408,7 @@ describe("runExecution", () => {
       });
 
       try {
-        await runExecution(analysis, scenarios, config);
+        await runExecution({ analysis, scenarios, config });
       } catch {
         // Expected to fail due to incomplete mocking of batched execution
       }
@@ -432,7 +432,7 @@ describe("runExecution", () => {
         .mockResolvedValueOnce(createExecutionResult({ scenario_id: "s2" }))
         .mockResolvedValueOnce(createExecutionResult({ scenario_id: "s3" }));
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(parallel).toHaveBeenCalledWith(
         expect.objectContaining({
@@ -455,7 +455,12 @@ describe("runExecution", () => {
         onError: vi.fn(),
       };
 
-      await runExecution(analysis, scenarios, config, mockProgress);
+      await runExecution({
+        analysis,
+        scenarios,
+        config,
+        progress: mockProgress,
+      });
 
       expect(mockProgress.onStageStart).toHaveBeenCalledWith("execution", 1);
       expect(mockProgress.onStageComplete).toHaveBeenCalledWith(
@@ -481,7 +486,7 @@ describe("runExecution", () => {
           createExecutionResult({ scenario_id: "s2", cost_usd: 0.02 }),
         );
 
-      const result = await runExecution(analysis, scenarios, config);
+      const result = await runExecution({ analysis, scenarios, config });
 
       expect(result.results).toHaveLength(2);
       expect(result.total_cost_usd).toBeCloseTo(0.03);
@@ -516,7 +521,7 @@ describe("runExecution", () => {
           }),
         );
 
-      const result = await runExecution(analysis, scenarios, config);
+      const result = await runExecution({ analysis, scenarios, config });
 
       expect(result.success_count).toBe(1);
       expect(result.error_count).toBe(1);
@@ -527,7 +532,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig();
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       // Verify parallel is called with continueOnError: true
       expect(parallel).toHaveBeenCalledWith(
@@ -544,7 +549,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig();
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(logger.warn).toHaveBeenCalledWith(
         expect.stringContaining("exceed budget"),
@@ -558,7 +563,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig();
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(writeJsonAsync).toHaveBeenCalled();
     });
@@ -576,7 +581,7 @@ describe("runExecution", () => {
         },
       });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       // Verify execution completes with sanitization enabled
       expect(writeJsonAsync).toHaveBeenCalled();
@@ -623,7 +628,7 @@ describe("runExecution", () => {
           }),
         );
 
-      const result = await runExecution(analysis, scenarios, config);
+      const result = await runExecution({ analysis, scenarios, config });
 
       expect(result.total_tools_captured).toBe(3);
     });
@@ -633,7 +638,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig();
 
-      const result = await runExecution(analysis, scenarios, config);
+      const result = await runExecution({ analysis, scenarios, config });
 
       expect(result.total_duration_ms).toBeGreaterThanOrEqual(0);
       expect(result.total_duration_ms).toBeLessThan(5000); // Should be fast in tests
@@ -651,7 +656,7 @@ describe("runExecution", () => {
         },
       });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       // Rate limiting is applied inside executeAllScenariosIsolated
       // We verify the config is passed correctly
@@ -669,7 +674,7 @@ describe("runExecution", () => {
       const scenarios = [createScenario()];
       const config = createConfig({ rewind_file_changes: true });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       // The parallel mock will call the fn, which internally uses executeScenarioWithCheckpoint
       // We verify by checking the mock was called (via parallel's fn)
@@ -685,7 +690,7 @@ describe("runExecution", () => {
         scope: { ...createConfig().scope, mcp_servers: true },
       });
 
-      await runExecution(analysis, scenarios, config);
+      await runExecution({ analysis, scenarios, config });
 
       expect(verifyPluginLoad).toHaveBeenCalledWith(
         expect.objectContaining({
diff --git a/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts b/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts
index 736aafc..43f87b1 100644
--- a/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts
+++ b/tests/unit/stages/4-evaluation/aggregation/scenario-results.test.ts
@@ -122,14 +122,14 @@ describe("buildEvaluationResult", () => {
     const conflictAnalysis = createConflictAnalysis();
     const judgment = createMultiSampleResult({ aggregated_score: 9 });
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      true,
-      detections,
+      triggered: true,
+      uniqueDetections: detections,
       conflictAnalysis,
       judgment,
-      "both",
-    );
+      detectionSource: "both",
+    });
 
     expect(result.scenario_id).toBe("test-scenario-1");
     expect(result.triggered).toBe(true);
@@ -152,14 +152,14 @@ describe("buildEvaluationResult", () => {
     const scenario = createScenario({ expected_trigger: false });
     const conflictAnalysis = createConflictAnalysis();
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      false,
-      [], // no detections
+      triggered: false,
+      uniqueDetections: [], // no detections
       conflictAnalysis,
-      null, // no judgment
-      "programmatic",
-    );
+      judgment: null, // no judgment
+      detectionSource: "programmatic",
+    });
 
     expect(result.triggered).toBe(false);
     expect(result.confidence).toBe(0);
@@ -174,14 +174,14 @@ describe("buildEvaluationResult", () => {
     const detections = [createDetection()];
     const conflictAnalysis = createConflictAnalysis();
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      true,
-      detections,
+      triggered: true,
+      uniqueDetections: detections,
       conflictAnalysis,
-      null,
-      "programmatic",
-    );
+      judgment: null,
+      detectionSource: "programmatic",
+    });
 
     expect(result.quality_score).toBe(7);
     expect(result.summary).toBe("Correctly triggered component");
@@ -191,14 +191,14 @@ describe("buildEvaluationResult", () => {
     const scenario = createScenario({ expected_trigger: true });
     const conflictAnalysis = createConflictAnalysis();
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      false, // didn't trigger but was expected
-      [],
+      triggered: false, // didn't trigger but was expected
+      uniqueDetections: [],
       conflictAnalysis,
-      null,
-      "programmatic",
-    );
+      judgment: null,
+      detectionSource: "programmatic",
+    });
 
     expect(result.summary).toBe("Incorrectly did not trigger component");
   });
@@ -213,14 +213,14 @@ describe("buildEvaluationResult", () => {
       }),
     });
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      true,
-      detections,
+      triggered: true,
+      uniqueDetections: detections,
       conflictAnalysis,
       judgment,
-      "both",
-    );
+      detectionSource: "both",
+    });
 
     expect(result.summary).toBe("Custom LLM summary");
   });
@@ -233,14 +233,14 @@ describe("buildEvaluationResult", () => {
       all_issues: ["Minor issue 1", "Minor issue 2"],
     });
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      true,
-      detections,
+      triggered: true,
+      uniqueDetections: detections,
       conflictAnalysis,
       judgment,
-      "both",
-    );
+      detectionSource: "both",
+    });
 
     expect(result.issues).toEqual(["Minor issue 1", "Minor issue 2"]);
   });
@@ -260,14 +260,14 @@ describe("buildEvaluationResult", () => {
       ],
     });
 
-    const result = buildEvaluationResult(
+    const result = buildEvaluationResult({
       scenario,
-      true,
-      detections,
+      triggered: true,
+      uniqueDetections: detections,
       conflictAnalysis,
-      null,
-      "programmatic",
-    );
+      judgment: null,
+      detectionSource: "programmatic",
+    });
 
     expect(result.has_conflict).toBe(true);
     expect(result.conflict_severity).toBe("major");
diff --git a/tests/unit/stages/4-evaluation/index.test.ts b/tests/unit/stages/4-evaluation/index.test.ts
index 8c6dcce..ff7b22f 100644
--- a/tests/unit/stages/4-evaluation/index.test.ts
+++ b/tests/unit/stages/4-evaluation/index.test.ts
@@ -236,12 +236,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.plugin_name).toBe("test-plugin");
       expect(output.results).toHaveLength(1);
@@ -254,12 +254,12 @@ describe("runEvaluation", () => {
       const executions: ExecutionResult[] = [];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results).toHaveLength(0);
       expect(output.metrics.total_scenarios).toBe(0);
@@ -271,7 +271,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      await runEvaluation("test-plugin", scenarios, executions, config);
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+      });
 
       expect(writeJsonAsync).toHaveBeenCalledTimes(1);
       expect(writeJsonAsync).toHaveBeenCalledWith(
@@ -295,12 +300,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.triggered).toBe(true);
       expect(output.results[0]?.confidence).toBe(100);
@@ -340,12 +345,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.triggered).toBe(false);
       expect(output.results[0]?.confidence).toBe(0);
@@ -367,12 +372,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.triggered).toBe(true);
       expect(output.results[0]?.all_triggered_components).toContainEqual(
@@ -399,12 +404,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.triggered).toBe(true);
       expect(output.results[0]?.all_triggered_components).toContainEqual(
@@ -435,12 +440,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(runJudgment).toHaveBeenCalled();
       expect(output.results[0]?.quality_score).toBe(9);
@@ -477,12 +482,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(runJudgment).toHaveBeenCalled();
       expect(output.results[0]?.issues).toContain(
@@ -516,7 +521,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      await runEvaluation("test-plugin", scenarios, executions, config);
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+      });
 
       // True negative with direct scenario type - no LLM needed
       expect(runJudgment).not.toHaveBeenCalled();
@@ -543,7 +553,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      await runEvaluation("test-plugin", scenarios, executions, config);
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+      });
 
       expect(runJudgment).toHaveBeenCalled();
     });
@@ -555,12 +570,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       // Should still return result with error captured
       expect(output.results).toHaveLength(1);
@@ -579,12 +594,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.has_conflict).toBe(false);
       expect(output.results[0]?.conflict_severity).toBe("none");
@@ -599,12 +614,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.has_conflict).toBe(true);
       expect(output.results[0]?.conflict_severity).toBe("major");
@@ -624,12 +639,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.has_conflict).toBe(true);
       expect(output.results[0]?.conflict_severity).toBe("minor");
@@ -660,12 +675,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.trigger_rate).toBeCloseTo(2 / 3);
       expect(output.metrics.triggered_count).toBe(2);
@@ -698,12 +713,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.accuracy).toBe(1); // Both correct
     });
@@ -719,12 +734,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.total_cost_usd).toBe(0.08);
       expect(output.metrics.avg_cost_per_scenario).toBe(0.04);
@@ -748,12 +763,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.conflict_count).toBe(1);
       expect(output.metrics.major_conflicts).toBe(1);
@@ -770,8 +785,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      await runEvaluation("test-plugin", scenarios, executions, config, {
-        onStageStart,
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+        progress: { onStageStart },
       });
 
       expect(onStageStart).toHaveBeenCalledWith("evaluation", 2);
@@ -783,8 +802,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      await runEvaluation("test-plugin", scenarios, executions, config, {
-        onStageComplete,
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+        progress: { onStageComplete },
       });
 
       expect(onStageComplete).toHaveBeenCalledWith(
@@ -802,8 +825,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      await runEvaluation("test-plugin", scenarios, executions, config, {
-        onError,
+      await runEvaluation({
+        pluginName: "test-plugin",
+        scenarios,
+        executions,
+        config,
+        progress: { onError },
       });
 
       // onError is called by parallel utility, but our mock doesn't trigger it
@@ -848,12 +875,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results).toHaveLength(3);
 
@@ -881,12 +908,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results).toHaveLength(1);
       expect(logger.warn).toHaveBeenCalledWith(
@@ -917,12 +944,12 @@ describe("runEvaluation", () => {
       ];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.detection_source).toBe("programmatic");
     });
@@ -932,12 +959,12 @@ describe("runEvaluation", () => {
       const executions = [createExecutionResult()];
       const config = createConfig();
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.detection_source).toBe("both");
     });
@@ -952,12 +979,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.results[0]?.detection_source).toBe("llm");
     });
@@ -983,12 +1010,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.multi_sample_stats).toBeDefined();
       expect(output.metrics.multi_sample_stats?.avg_score_variance).toBeCloseTo(
@@ -1035,12 +1062,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.multi_sample_stats).toBeDefined();
       expect(
@@ -1063,12 +1090,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.multi_sample_stats).toBeUndefined();
     });
@@ -1114,12 +1141,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.multi_sample_stats).toBeDefined();
       // 1 out of 2 scenarios had unanimous trigger_accuracy agreement
@@ -1168,12 +1195,12 @@ describe("runEvaluation", () => {
         },
       });
 
-      const output = await runEvaluation(
-        "test-plugin",
+      const output = await runEvaluation({
+        pluginName: "test-plugin",
         scenarios,
         executions,
         config,
-      );
+      });
 
       expect(output.metrics.multi_sample_stats).toBeDefined();
       // s2 has high variance (> 1.0)
diff --git a/tests/unit/stages/4-evaluation/llm-judge.test.ts b/tests/unit/stages/4-evaluation/llm-judge.test.ts
index c6f8b8b..7b5d4ee 100644
--- a/tests/unit/stages/4-evaluation/llm-judge.test.ts
+++ b/tests/unit/stages/4-evaluation/llm-judge.test.ts
@@ -366,13 +366,13 @@ describe("evaluateWithLLMJudge", () => {
     const detections = createDetections([{ name: "commit", type: "skill" }]);
     const config = createConfig({ model: "sonnet", max_tokens: 2048 });
 
-    await evaluateWithLLMJudge(
-      mockClient,
+    await evaluateWithLLMJudge({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(mockClient.beta.messages.create).toHaveBeenCalledTimes(1);
     const callArgs = mockClient.beta.messages.create.mock
@@ -403,13 +403,13 @@ describe("evaluateWithLLMJudge", () => {
     const detections = createDetections([{ name: "commit", type: "skill" }]);
     const config = createConfig();
 
-    const result = await evaluateWithLLMJudge(
-      mockClient,
+    const result = await evaluateWithLLMJudge({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.quality_score).toBe(9);
     expect(result.response_relevance).toBe(8);
@@ -436,13 +436,13 @@ describe("evaluateWithLLMJudge", () => {
     const detections = createDetections([]);
     const config = createConfig();
 
-    const result = await evaluateWithLLMJudge(
-      mockClient,
+    const result = await evaluateWithLLMJudge({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.highlights).toHaveLength(1);
     expect(result.highlights?.[0]?.description).toBe("Good trigger");
@@ -467,13 +467,13 @@ describe("evaluateWithLLMJudge", () => {
     const detections = createDetections([]);
     const config = createConfig();
 
-    const result = await evaluateWithLLMJudge(
-      mockClient,
+    const result = await evaluateWithLLMJudge({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.highlights).toBeUndefined();
   });
@@ -486,13 +486,13 @@ describe("evaluateWithLLMJudge", () => {
     const config = createConfig();
 
     await expect(
-      evaluateWithLLMJudge(
-        mockClient,
+      evaluateWithLLMJudge({
+        client: mockClient,
         scenario,
         transcript,
-        detections,
+        programmaticResult: detections,
         config,
-      ),
+      }),
     ).rejects.toThrow("Failed to parse structured output");
   });
 
@@ -512,13 +512,13 @@ describe("evaluateWithLLMJudge", () => {
     const config = createConfig();
 
     await expect(
-      evaluateWithLLMJudge(
-        mockClient,
+      evaluateWithLLMJudge({
+        client: mockClient,
         scenario,
         transcript,
-        detections,
+        programmaticResult: detections,
         config,
-      ),
+      }),
     ).rejects.toThrow("Failed to parse structured output");
   });
 
@@ -538,13 +538,13 @@ describe("evaluateWithLLMJudge", () => {
     const config = createConfig();
 
     await expect(
-      evaluateWithLLMJudge(
-        mockClient,
+      evaluateWithLLMJudge({
+        client: mockClient,
         scenario,
         transcript,
-        detections,
+        programmaticResult: detections,
         config,
-      ),
+      }),
     ).rejects.toThrow("Failed to parse structured output");
   });
 
@@ -571,13 +571,13 @@ describe("evaluateWithLLMJudge", () => {
     const config = createConfig();
 
     await expect(
-      evaluateWithLLMJudge(
-        mockClient,
+      evaluateWithLLMJudge({
+        client: mockClient,
         scenario,
         transcript,
-        detections,
+        programmaticResult: detections,
         config,
-      ),
+      }),
     ).rejects.toThrow("No text block");
   });
 });
@@ -595,13 +595,13 @@ describe("evaluateWithFallback", () => {
     const detections = createDetections([{ name: "commit", type: "skill" }]);
     const config = createConfig();
 
-    const result = await evaluateWithFallback(
-      mockClient,
+    const result = await evaluateWithFallback({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.quality_score).toBe(9);
     expect(mockClient.beta.messages.create).toHaveBeenCalledTimes(1);
@@ -625,13 +625,13 @@ describe("evaluateWithFallback", () => {
     const detections = createDetections([]);
     const config = createConfig();
 
-    const result = await evaluateWithFallback(
-      mockClient,
+    const result = await evaluateWithFallback({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.quality_score).toBe(7);
     expect(betaCreateMock).toHaveBeenCalledTimes(1);
@@ -656,13 +656,13 @@ describe("evaluateWithFallback", () => {
     const detections = createDetections([]);
     const config = createConfig();
 
-    const result = await evaluateWithFallback(
-      mockClient,
+    const result = await evaluateWithFallback({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.quality_score).toBe(6);
   });
@@ -684,13 +684,13 @@ describe("evaluateWithFallback", () => {
     const detections = createDetections([]);
     const config = createConfig();
 
-    const result = await evaluateWithFallback(
-      mockClient,
+    const result = await evaluateWithFallback({
+      client: mockClient,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     // Should return default error response
     expect(result.quality_score).toBe(1);
diff --git a/tests/unit/stages/4-evaluation/multi-sampler.test.ts b/tests/unit/stages/4-evaluation/multi-sampler.test.ts
index b2a2b5e..c07f245 100644
--- a/tests/unit/stages/4-evaluation/multi-sampler.test.ts
+++ b/tests/unit/stages/4-evaluation/multi-sampler.test.ts
@@ -399,13 +399,13 @@ describe("evaluateSingleSample", () => {
     const detections = createDetections();
     const config = createConfig();
 
-    const result = await evaluateSingleSample(
+    const result = await evaluateSingleSample({
       client,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(result.individual_scores).toEqual([8]);
     expect(result.aggregated_score).toBe(8);
@@ -422,13 +422,13 @@ describe("evaluateSingleSample", () => {
     });
     (evaluateWithFallback as Mock).mockResolvedValue(mockResponse);
 
-    const result = await evaluateSingleSample(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
-      createConfig(),
-    );
+    const result = await evaluateSingleSample({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
+      config: createConfig(),
+    });
 
     expect(result.all_issues).toEqual([
       "Minor formatting issue",
@@ -446,35 +446,35 @@ describe("evaluateSingleSample", () => {
     const detections = createDetections();
     const config = createConfig({ model: "sonnet", max_tokens: 2048 });
 
-    await evaluateSingleSample(
+    await evaluateSingleSample({
       client,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
 
     expect(evaluateWithFallback).toHaveBeenCalledTimes(1);
-    expect(evaluateWithFallback).toHaveBeenCalledWith(
+    expect(evaluateWithFallback).toHaveBeenCalledWith({
       client,
       scenario,
       transcript,
-      detections,
+      programmaticResult: detections,
       config,
-    );
+    });
   });
 
   it("should always set is_unanimous to true (single sample is trivially unanimous)", async () => {
     const mockResponse = createJudgeResponse({ trigger_accuracy: "correct" });
     (evaluateWithFallback as Mock).mockResolvedValue(mockResponse);
 
-    const result = await evaluateSingleSample(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
-      createConfig(),
-    );
+    const result = await evaluateSingleSample({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
+      config: createConfig(),
+    });
 
     expect(result.is_unanimous).toBe(true);
   });
@@ -491,13 +491,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(evaluateWithFallback).toHaveBeenCalledTimes(3);
   });
@@ -518,13 +518,13 @@ describe("evaluateWithMultiSampling", () => {
       aggregate_method: "average",
     });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.individual_scores).toEqual([7, 8, 9]);
     expect(result.aggregated_score).toBe(8); // (7+8+9)/3
@@ -544,13 +544,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3, aggregate_method: "median" });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.aggregated_score).toBe(8); // median of [5, 8, 10]
   });
@@ -568,13 +568,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.consensus_trigger_accuracy).toBe("correct");
   });
@@ -592,13 +592,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.is_unanimous).toBe(true);
     expect(result.consensus_trigger_accuracy).toBe("correct");
@@ -617,13 +617,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.is_unanimous).toBe(false);
     // Majority vote should still work
@@ -643,13 +643,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(result.all_issues).toHaveLength(3);
     expect(result.all_issues).toContain("Issue A");
@@ -682,13 +682,13 @@ describe("evaluateWithMultiSampling", () => {
       aggregate_method: "average",
     });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     // Representative should have aggregated values
     expect(result.representative_response.quality_score).toBe(8); // (7+9)/2
@@ -704,13 +704,13 @@ describe("evaluateWithMultiSampling", () => {
 
     const config = createConfig({ num_samples: 1 });
 
-    const result = await evaluateWithMultiSampling(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await evaluateWithMultiSampling({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(evaluateWithFallback).toHaveBeenCalledTimes(1);
     expect(result.individual_scores).toEqual([7]);
@@ -726,13 +726,13 @@ describe("evaluateWithMultiSampling", () => {
     const config = createConfig({ num_samples: 3 });
 
     await expect(
-      evaluateWithMultiSampling(
-        createMockClient(),
-        createScenario(),
-        createTranscript(),
-        createDetections(),
+      evaluateWithMultiSampling({
+        client: createMockClient(),
+        scenario: createScenario(),
+        transcript: createTranscript(),
+        programmaticResult: createDetections(),
         config,
-      ),
+      }),
     ).rejects.toThrow("API error");
   });
 });
@@ -748,13 +748,13 @@ describe("runJudgment", () => {
 
     const config = createConfig({ num_samples: 1 });
 
-    const result = await runJudgment(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await runJudgment({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(evaluateWithFallback).toHaveBeenCalledTimes(1);
     expect(result.individual_scores).toEqual([8]);
@@ -767,13 +767,13 @@ describe("runJudgment", () => {
 
     const config = createConfig({ num_samples: 3 });
 
-    const result = await runJudgment(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await runJudgment({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     expect(evaluateWithFallback).toHaveBeenCalledTimes(3);
     expect(result.individual_scores).toHaveLength(3);
@@ -785,13 +785,13 @@ describe("runJudgment", () => {
 
     const config = createConfig({ num_samples: 0 });
 
-    const result = await runJudgment(
-      createMockClient(),
-      createScenario(),
-      createTranscript(),
-      createDetections(),
+    const result = await runJudgment({
+      client: createMockClient(),
+      scenario: createScenario(),
+      transcript: createTranscript(),
+      programmaticResult: createDetections(),
       config,
-    );
+    });
 
     // Should treat 0 as single sample (via evaluateSingleSample path)
     expect(evaluateWithFallback).toHaveBeenCalledTimes(1);
@@ -804,13 +804,13 @@ describe("runJudgment", () => {
     const config = createConfig({ num_samples: 1 });
 
     await expect(
-      runJudgment(
-        createMockClient(),
-        createScenario(),
-        createTranscript(),
-        createDetections(),
+      runJudgment({
+        client: createMockClient(),
+        scenario: createScenario(),
+        transcript: createTranscript(),
+        programmaticResult: createDetections(),
         config,
-      ),
+      }),
     ).rejects.toThrow("API Error");
   });
 });
diff --git a/tests/unit/stages/4-evaluation/programmatic-detector.test.ts b/tests/unit/stages/4-evaluation/programmatic-detector.test.ts
index 9f04074..0aa6a17 100644
--- a/tests/unit/stages/4-evaluation/programmatic-detector.test.ts
+++ b/tests/unit/stages/4-evaluation/programmatic-detector.test.ts
@@ -960,12 +960,12 @@ describe("detectAllComponentsWithHooks", () => {
       },
     ];
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
       hookResponses,
-    );
+    });
 
     expect(detections.length).toBeGreaterThan(0);
     expect(detections.some((d) => d.component_type === "skill")).toBe(true);
@@ -996,12 +996,12 @@ describe("detectAllComponentsWithHooks", () => {
       },
     ];
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
       hookResponses,
-    );
+    });
 
     const hookDetections = detections.filter(
       (d) => d.component_type === "hook",
@@ -1024,12 +1024,12 @@ describe("detectAllComponentsWithHooks", () => {
       },
     ];
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
       hookResponses,
-    );
+    });
 
     expect(detections.every((d) => d.component_type !== "hook")).toBe(true);
   });
@@ -1039,11 +1039,11 @@ describe("detectAllComponentsWithHooks", () => {
     const transcript = createTranscript([]);
     const scenario = createScenario({ component_type: "skill" });
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
-    );
+    });
 
     expect(detections).toHaveLength(1);
     expect(detections[0]?.component_type).toBe("skill");
@@ -1057,11 +1057,11 @@ describe("detectAllComponentsWithHooks", () => {
     const transcript = createTranscript([]);
     const scenario = createScenario({ component_type: "skill" });
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
-    );
+    });
 
     expect(detections).toHaveLength(1);
   });
@@ -1082,13 +1082,12 @@ describe("detectAllComponentsWithHooks", () => {
       },
     ];
 
-    const detections = detectAllComponentsWithHooks(
+    const detections = detectAllComponentsWithHooks({
       captures,
       transcript,
       scenario,
-      undefined,
       subagentCaptures,
-    );
+    });
 
     expect(detections).toHaveLength(1);
     expect(detections[0]?.component_type).toBe("agent");