diff --git a/bench/eval-framework.mjs b/bench/eval-framework.mjs
new file mode 100644
index 00000000..b12a3d41
--- /dev/null
+++ b/bench/eval-framework.mjs
@@ -0,0 +1,783 @@
+import { mkdirSync, readFileSync, writeFileSync, readdirSync, existsSync } from "node:fs";
+import { basename, resolve } from "node:path";
+import { randomUUID } from "node:crypto";
+
+const TASK_TYPES = Object.freeze([
+  "code-generation",
+  "bug-fix",
+  "refactor",
+  "test-writing",
+  "code-review",
+]);
+
+const BUILTIN_TASK_METRICS = Object.freeze({
+  "code-generation": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "ContextUtilization"],
+  "bug-fix": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "TestPassRate"],
+  refactor: ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "FalsePositiveRate"],
+  "test-writing": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "TestPassRate"],
+  "code-review": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "FalsePositiveRate"],
+});
+
+const BUILTIN_METRICS = Object.freeze([
+  "TaskSuccess",
+  "TokenEfficiency",
+  "TimeToComplete",
+  "TestPassRate",
+  "FalsePositiveRate",
+  "ContextUtilization",
+]);
+
+const DEFAULT_RESULTS_DIR = ".cache/eval-results";
+const DEFAULT_BENCHMARKS_DIR = "bench/benchmarks";
+
+function mean(values = []) {
+  if (!Array.isArray(values) || values.length === 0) return 0;
+  return values.reduce((sum, value) => sum + Number(value || 0), 0) / values.length;
+}
+
+function percentile(values = [], p = 95) {
+  if (!Array.isArray(values) || values.length === 0) return 0;
+  const sorted = values.map((value) => Number(value || 0)).sort((a, b) => a - b);
+  const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((p / 100) * sorted.length) - 1));
+  return sorted[index] ?? 0;
+}
+
+function ensureArray(value) {
+  return Array.isArray(value) ? value : value == null ? [] : [value];
+}
+
+function toNumber(value, fallback = 0) {
+  const numeric = Number(value);
+  return Number.isFinite(numeric) ? numeric : fallback;
+}
+
+function normalizeTaskType(type) {
+  const normalized = String(type || "").trim().toLowerCase();
+  if (TASK_TYPES.includes(normalized)) return normalized;
+  return "code-generation";
+}
+
+function defaultMetricsForTask(type) {
+  return [...(BUILTIN_TASK_METRICS[normalizeTaskType(type)] || BUILTIN_METRICS)];
+}
+
+function parseJsonFile(filePath) {
+  return JSON.parse(readFileSync(resolve(filePath), "utf8"));
+}
+
+function dedupeStrings(values = []) {
+  const output = [];
+  for (const value of ensureArray(values)) {
+    const normalized = String(value || "").trim();
+    if (!normalized || output.includes(normalized)) continue;
+    output.push(normalized);
+  }
+  return output;
+}
+
+function escapeRegex(text) {
+  return String(text || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+function normalCdf(value) {
+  const x = Number(value || 0);
+  const sign = x < 0 ? -1 : 1;
+  const abs = Math.abs(x) / Math.sqrt(2);
+  const t = 1 / (1 + 0.3275911 * abs);
+  const a1 = 0.254829592;
+  const a2 = -0.284496736;
+  const a3 = 1.421413741;
+  const a4 = -1.453152027;
+  const a5 = 1.061405429;
+  const erf = 1 - (((((a5 * t) + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-(abs * abs));
+  return 0.5 * (1 + sign * erf);
+}
+
+function zScoreToPValue(score) {
+  const normalized = Math.abs(Number(score || 0));
+  return Math.max(0, Math.min(1, 2 * (1 - normalCdf(normalized))));
+}
+
+function computeSignificance(baseValues = [], candidateValues = []) {
+  const baseline = ensureArray(baseValues).map((value) => toNumber(value)).filter((value) => Number.isFinite(value));
+  const candidate = ensureArray(candidateValues).map((value) => toNumber(value)).filter((value) => Number.isFinite(value));
+  if (baseline.length === 0 || candidate.length === 0) {
+    return { score: 0, pValue: 1, method: "insufficient-data" };
+  }
+  const baselineMean = mean(baseline);
+  const candidateMean = mean(candidate);
+  const baselineVariance = mean(baseline.map((value) => (value - baselineMean) ** 2));
+  const candidateVariance = mean(candidate.map((value) => (value - candidateMean) ** 2));
+  const denominator = Math.sqrt(
+    (baselineVariance / Math.max(1, baseline.length)) +
+    (candidateVariance / Math.max(1, candidate.length)),
+  );
+  if (!Number.isFinite(denominator) || denominator === 0) {
+    return {
+      score: candidateMean === baselineMean ? 0 : Number.POSITIVE_INFINITY,
+      pValue: candidateMean === baselineMean ? 1 : 0,
+      method: "welch-z",
+    };
+  }
+  const score = (candidateMean - baselineMean) / denominator;
+  return {
+    score,
+    pValue: zScoreToPValue(score),
+    method: "welch-z",
+  };
+}
+
+function normalizeMetricName(value) {
+  return String(value || "").trim();
+}
+
+function calculateCostFromOutcome(outcome = {}, strategy = {}) {
+  const totalTokens = toNumber(outcome.totalTokens, toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput));
+  const rate = toNumber(
+    strategy.costPerMillionTokens ?? strategy.costRatePerMillion ?? strategy.tokenCostPerMillion,
+    0,
+  );
+  if (!rate || totalTokens <= 0) return 0;
+  return (totalTokens / 1_000_000) * rate;
+}
+
+export class Task {
+  constructor(raw = {}) {
+    this.id = String(raw.id || "").trim() || `task-${randomUUID()}`;
+    this.type = normalizeTaskType(raw.type);
+    this.input = raw.input && typeof raw.input === "object" ? { ...raw.input } : { prompt: "", repoState: {} };
+    this.groundTruth = raw.groundTruth && typeof raw.groundTruth === "object" ? { ...raw.groundTruth } : {};
+    this.metrics = ensureArray(raw.metrics).length > 0
+      ? dedupeStrings(ensureArray(raw.metrics).map((metric) => normalizeMetricName(metric)))
+      : defaultMetricsForTask(this.type);
+    this.tags = raw.tags && typeof raw.tags === "object" && !Array.isArray(raw.tags) ? { ...raw.tags } : {};
+  }
+}
+
+export class Benchmark {
+  constructor(raw = {}) {
+    this.id = String(raw.id || raw.name || "benchmark").trim() || "benchmark";
+    this.name = String(raw.name || raw.id || "benchmark").trim() || "benchmark";
+    this.description = String(raw.description || "").trim();
+    this.tasks = ensureArray(raw.tasks).map((task) => task instanceof Task ? task : new Task(task));
+    this.tags = raw.tags && typeof raw.tags === "object" && !Array.isArray(raw.tags) ? { ...raw.tags } : {};
+    this.sourcePath = String(raw.sourcePath || "").trim();
+  }
+}
+
+export class Metric {
+  constructor(name, evaluator, options = {}) {
+    this.name = normalizeMetricName(name);
+    this.evaluator = typeof evaluator === "function" ? evaluator : (() => null);
+    this.kind = String(options.kind || "metric");
+    this.description = String(options.description || "").trim();
+  }
+
+  evaluate(context) {
+    return this.evaluator(context);
+  }
+}
+
+export class CategoryMetric extends Metric {
+  constructor(name, evaluator, options = {}) {
+    super(name, evaluator, { ...options, kind: "category" });
+  }
+}
+
+export class NumericalMetric extends Metric {
+  constructor(name, evaluator, options = {}) {
+    super(name, evaluator, { ...options, kind: "numerical" });
+  }
+}
+
+function defaultRunner() {
+  return async () => ({
+    success: false,
+    durationMs: 0,
+    tokensInput: 0,
+    tokensOutput: 0,
+    filesChanged: 0,
+    testsPassed: 0,
+    testsTotal: 0,
+    falsePositives: 0,
+    contextBytes: 0,
+    contextBudgetBytes: 0,
+    costUsd: 0,
+  });
+}
+
+export function builtInMetricInstances() {
+  return [
+    new CategoryMetric("TaskSuccess", ({ outcome }) => outcome.success ? "pass" : "fail"),
+    new NumericalMetric("TokenEfficiency", ({ outcome }) => {
+      const totalTokens = toNumber(outcome.totalTokens, toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput));
+      return totalTokens / Math.max(1, toNumber(outcome.filesChanged, 1));
+    }),
+    new NumericalMetric("TimeToComplete", ({ outcome }) => toNumber(outcome.durationMs)),
+    new NumericalMetric("TestPassRate", ({ outcome }) => {
+      const total = Math.max(0, toNumber(outcome.testsTotal));
+      if (total === 0) return outcome.success ? 1 : 0;
+      return toNumber(outcome.testsPassed) / total;
+    }),
+    new NumericalMetric("FalsePositiveRate", ({ outcome }) => {
+      const reviewedCount = Math.max(1, toNumber(outcome.findingsTotal, toNumber(outcome.filesChanged, 1)));
+      return toNumber(outcome.falsePositives) / reviewedCount;
+    }),
+    new NumericalMetric("ContextUtilization", ({ outcome }) => {
+      const budget = Math.max(0, toNumber(outcome.contextBudgetBytes));
+      if (budget === 0) return 0;
+      return toNumber(outcome.contextBytes) / budget;
+    }),
+  ];
+}
+
+function mapMetrics(metrics = []) {
+  const metricMap = new Map();
+  for (const metric of [...builtInMetricInstances(), ...metrics]) {
+    if (!metric?.name) continue;
+    metricMap.set(metric.name, metric);
+  }
+  return metricMap;
+}
+
+function buildTaskSummaryEntries(results = []) {
+  const byTask = new Map();
+  for (const result of ensureArray(results)) {
+    const taskId = String(result?.taskId || "").trim();
+    if (!taskId) continue;
+    if (!byTask.has(taskId)) {
+      byTask.set(taskId, {
+        taskId,
+        taskType: result.taskType || "code-generation",
+        repeats: 0,
+        passCount: 0,
+        tokenValues: [],
+        timeValues: [],
+        costValues: [],
+        resultIndexes: [],
+      });
+    }
+    const entry = byTask.get(taskId);
+    entry.repeats += 1;
+    if (result.metrics?.TaskSuccess === "pass") entry.passCount += 1;
+    entry.tokenValues.push(toNumber(result.outcome?.totalTokens, toNumber(result.outcome?.tokensInput) + toNumber(result.outcome?.tokensOutput)));
+    entry.timeValues.push(toNumber(result.outcome?.durationMs));
+    entry.costValues.push(toNumber(result.outcome?.costUsd));
+    entry.resultIndexes.push(result.resultIndex);
+  }
+  return [...byTask.values()].map((entry) => ({
+    taskId: entry.taskId,
+    taskType: entry.taskType,
+    repeats: entry.repeats,
+    passRate: entry.repeats > 0 ? entry.passCount / entry.repeats : 0,
+    avgTokens: mean(entry.tokenValues),
+    avgTimeMs: mean(entry.timeValues),
+    avgCostUsd: mean(entry.costValues),
+  }));
+}
+
+function summarizeRun(run) {
+  const results = ensureArray(run.results);
+  const timeValues = [];
+  const tokenValues = [];
+  const costValues = [];
+  let passCount = 0;
+  for (const result of results) {
+    const metrics = result.metrics || {};
+    const totalTokens = toNumber(result.outcome?.totalTokens, toNumber(result.outcome?.tokensInput) + toNumber(result.outcome?.tokensOutput));
+    const totalCost = toNumber(result.outcome?.costUsd);
+    if (metrics.TaskSuccess === "pass") passCount += 1;
+    timeValues.push(toNumber(metrics.TimeToComplete, result.outcome?.durationMs));
+    tokenValues.push(totalTokens);
+    costValues.push(totalCost);
+  }
+  return {
+    totalTasks: results.length,
+    passRate: results.length > 0 ? passCount / results.length : 0,
+    avgTokens: mean(tokenValues),
+    p95Tokens: percentile(tokenValues, 95),
+    avgTimeMs: mean(timeValues),
+    p95TimeMs: percentile(timeValues, 95),
+    totalCostUsd: costValues.reduce((sum, value) => sum + value, 0),
+    avgCostUsd: mean(costValues),
+    perTask: buildTaskSummaryEntries(results),
+  };
+}
+
+function normalizeStrategy(raw = {}, index = 0) {
+  const strategy = raw && typeof raw === "object" ? { ...raw } : { id: String(raw || "") };
+  const id = String(strategy.id || strategy.name || `strategy-${index + 1}`).trim() || `strategy-${index + 1}`;
+  return {
+    id,
+    label: String(strategy.label || strategy.name || id).trim() || id,
+    sdk: String(strategy.sdk || "").trim(),
+    model: String(strategy.model || "").trim(),
+    promptStrategy: String(strategy.promptStrategy || strategy.prompt || "").trim(),
+    codebaseProfile: String(strategy.codebaseProfile || strategy.repoProfile || "").trim(),
+    annotated: strategy.annotated === true,
+    unannotated: strategy.unannotated === true,
+    config: strategy.config && typeof strategy.config === "object" ? { ...strategy.config } : {},
+    costPerMillionTokens: toNumber(strategy.costPerMillionTokens ?? strategy.costRatePerMillion, 0),
+    metadata: strategy.metadata && typeof strategy.metadata === "object" ? { ...strategy.metadata } : {},
+  };
+}
+
+function loadRunFromFile(runPath) {
+  return parseJsonFile(runPath);
+}
+
+function readRunById(resultsDir, runId) {
+  return loadRunFromFile(resolveEvalResultPath(resultsDir, runId));
+}
+
+export class Evaluator {
+  constructor(options = {}) {
+    this.resultsDir = resolve(options.resultsDir || DEFAULT_RESULTS_DIR);
+    this.metrics = ensureArray(options.metrics);
+    this.runner = options.runner || defaultRunner();
+    this.parallelism = Math.max(1, toNumber(options.parallelism, 1));
+    this.storageAdapter = options.storageAdapter || null;
+  }
+
+  async evaluate({ benchmark, repeats = 1, strategies = [] } = {}) {
+    const normalizedBenchmark = benchmark instanceof Benchmark ? benchmark : new Benchmark(benchmark || {});
+    const normalizedStrategies = ensureArray(strategies).length > 0
+      ? ensureArray(strategies).map((strategy, index) => normalizeStrategy(strategy, index))
+      : [normalizeStrategy({ id: "default", label: "Default" })];
+    const metricMap = mapMetrics(this.metrics);
+    const runId = `eval-${Date.now()}-${randomUUID()}`;
+    const results = [];
+    let resultIndex = 0;
+
+    for (const strategy of normalizedStrategies) {
+      for (const task of normalizedBenchmark.tasks) {
+        for (let repeatIndex = 0; repeatIndex < Math.max(1, repeats); repeatIndex += 1) {
+          const outcome = {
+            ...(await this.runner({ benchmark: normalizedBenchmark, task, strategy, repeatIndex })),
+          };
+          outcome.totalTokens = toNumber(
+            outcome.totalTokens,
+            toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput),
+          );
+          outcome.costUsd = toNumber(
+            outcome.costUsd,
+            calculateCostFromOutcome(outcome, strategy),
+          );
+          const metricResults = {};
+          for (const metricName of task.metrics) {
+            const metric = metricMap.get(metricName);
+            if (!metric) continue;
+            metricResults[metricName] = metric.evaluate({
+              benchmark: normalizedBenchmark,
+              task,
+              strategy,
+              repeatIndex,
+              outcome,
+            });
+          }
+          results.push({
+            resultIndex: resultIndex++,
+            strategyId: strategy.id,
+            strategyLabel: strategy.label,
+            strategy,
+            taskId: task.id,
+            taskType: task.type,
+            repeatIndex,
+            metrics: metricResults,
+            outcome,
+          });
+        }
+      }
+    }
+
+    const summary = summarizeRun({ results });
+    const run = {
+      runId,
+      benchmarkId: normalizedBenchmark.id,
+      benchmark: normalizedBenchmark.name,
+      benchmarkDescription: normalizedBenchmark.description,
+      repeats: Math.max(1, repeats),
+      strategyIds: normalizedStrategies.map((strategy) => strategy.id),
+      strategies: normalizedStrategies,
+      parallelism: this.parallelism,
+      createdAt: new Date().toISOString(),
+      results,
+      summary,
+    };
+
+    mkdirSync(this.resultsDir, { recursive: true });
+    const resultPath = resolve(this.resultsDir, `${runId}.json`);
+    writeFileSync(resultPath, JSON.stringify(run, null, 2) + "\n", "utf8");
+    if (this.storageAdapter && typeof this.storageAdapter.writeRun === "function") {
+      await this.storageAdapter.writeRun(run, resultPath);
+    }
+    return { ...run, resultPath };
+  }
+}
+
+export async function importBenchmarkFromFile(filePath) {
+  const raw = parseJsonFile(filePath);
+  return new Benchmark({ ...raw, sourcePath: resolve(filePath) });
+}
+
+export function compareEvaluationRuns(baseline, candidate) {
+  const metricKeys = ["passRate", "avgTokens", "p95Tokens", "avgTimeMs", "p95TimeMs", "avgCostUsd", "totalCostUsd"];
+  const metricDeltas = {};
+  for (const key of metricKeys) {
+    const baseValue = toNumber(baseline?.summary?.[key]);
+    const candidateValue = toNumber(candidate?.summary?.[key]);
+    const delta = candidateValue - baseValue;
+    const baselineTaskValues = ensureArray(baseline?.summary?.perTask).map((entry) => toNumber(entry?.[key]));
+    const candidateTaskValues = ensureArray(candidate?.summary?.perTask).map((entry) => toNumber(entry?.[key]));
+    metricDeltas[key] = {
+      baseline: baseValue,
+      candidate: candidateValue,
+      delta,
+      significance: computeSignificance(baselineTaskValues, candidateTaskValues),
+    };
+  }
+
+  const baselineMap = new Map(ensureArray(baseline?.summary?.perTask).map((entry) => [entry.taskId, entry]));
+  const candidateMap = new Map(ensureArray(candidate?.summary?.perTask).map((entry) => [entry.taskId, entry]));
+  const improved = [];
+  const regressed = [];
+  const unchanged = [];
+
+  for (const [taskId, baselineResult] of baselineMap.entries()) {
+    const candidateResult = candidateMap.get(taskId);
+    if (!candidateResult) continue;
+    const passDelta = toNumber(candidateResult.passRate) - toNumber(baselineResult.passRate);
+    const tokenDelta = toNumber(candidateResult.avgTokens) - toNumber(baselineResult.avgTokens);
+    const timeDelta = toNumber(candidateResult.avgTimeMs) - toNumber(baselineResult.avgTimeMs);
+    const record = {
+      taskId,
+      baseline: baselineResult,
+      candidate: candidateResult,
+      passRateDelta: passDelta,
+      avgTokensDelta: tokenDelta,
+      avgTimeMsDelta: timeDelta,
+    };
+    if (passDelta > 0) improved.push(record);
+    else if (passDelta < 0) regressed.push(record);
+    else unchanged.push(record);
+  }
+
+  return {
+    baselineRunId: baseline?.runId || "",
+    candidateRunId: candidate?.runId || "",
+    metricDeltas,
+    perTask: { improved, regressed, unchanged },
+  };
+}
+
+export function summarizeMatrix(runs = []) {
+  const rows = ensureArray(runs).map((run) => ({
+    config: ensureArray(run.strategyIds)[0] || run.strategyId || "default",
+    passRate: toNumber(run.summary?.passRate),
+    avgTokens: toNumber(run.summary?.avgTokens),
+    p95Tokens: toNumber(run.summary?.p95Tokens),
+    avgTimeMs: toNumber(run.summary?.avgTimeMs),
+    p95TimeMs: toNumber(run.summary?.p95TimeMs),
+    cost: toNumber(run.summary?.totalCostUsd, toNumber(run.summary?.avgCostUsd)),
+  }));
+  return { rows };
+}
+
+export function compareAuditImpactRuns(withAnnotations, withoutAnnotations) {
+  const comparison = compareEvaluationRuns(withoutAnnotations, withAnnotations);
+  const rows = [
+    ["Pass Rate", withAnnotations?.summary?.passRate, withoutAnnotations?.summary?.passRate],
+    ["Avg Tokens", withAnnotations?.summary?.avgTokens, withoutAnnotations?.summary?.avgTokens],
+    ["Avg Time (ms)", withAnnotations?.summary?.avgTimeMs, withoutAnnotations?.summary?.avgTimeMs],
+    ["False Positive Rate", mean(ensureArray(withAnnotations?.results).map((entry) => toNumber(entry?.metrics?.FalsePositiveRate))), mean(ensureArray(withoutAnnotations?.results).map((entry) => toNumber(entry?.metrics?.FalsePositiveRate)))],
+  ].map(([metric, withValue, withoutValue]) => ({
+    metric,
+    withAnnotations: toNumber(withValue),
+    withoutAnnotations: toNumber(withoutValue),
+    delta: toNumber(withValue) - toNumber(withoutValue),
+  }));
+  return {
+    comparison,
+    rows,
+  };
+}
+
+export function resolveEvalResultPath(resultsDir, runId) {
+  return resolve(resultsDir || DEFAULT_RESULTS_DIR, `${basename(String(runId || "").replace(/\.json$/i, ""))}.json`);
+}
+
+export function listStoredEvaluationRuns(resultsDir = DEFAULT_RESULTS_DIR) {
+  const dir = resolve(resultsDir);
+  if (!existsSync(dir)) return [];
+  return readdirSync(dir)
+    .filter((name) => name.endsWith(".json"))
+    .sort()
+    .map((name) => ({
+      runId: name.replace(/\.json$/i, ""),
+      path: resolve(dir, name),
+    }));
+}
+
+export function detectRegression(currentRun, baselineRun, thresholds = {}) {
+  const maxTokenRegression = toNumber(thresholds.maxTokenRegression, Infinity);
+  const minPassRate = thresholds.minPassRate == null ? -Infinity : toNumber(thresholds.minPassRate);
+  const tokenRegression = toNumber(currentRun?.summary?.avgTokens) - toNumber(baselineRun?.summary?.avgTokens);
+  const tokenRegressionRatio = toNumber(baselineRun?.summary?.avgTokens) === 0
+    ? (tokenRegression > 0 ? Infinity : 0)
+    : tokenRegression / Math.max(1e-9, toNumber(baselineRun?.summary?.avgTokens));
+  const passRate = toNumber(currentRun?.summary?.passRate);
+  const failures = [];
+  if (Number.isFinite(maxTokenRegression) && tokenRegressionRatio > maxTokenRegression) {
+    failures.push({
+      metric: "avgTokens",
+      actual: tokenRegressionRatio,
+      threshold: maxTokenRegression,
+      message: `Average token regression ${tokenRegressionRatio.toFixed(4)} exceeds ${maxTokenRegression.toFixed(4)}`,
+    });
+  }
+  if (passRate < minPassRate) {
+    failures.push({
+      metric: "passRate",
+      actual: passRate,
+      threshold: minPassRate,
+      message: `Pass rate ${passRate.toFixed(4)} is below ${minPassRate.toFixed(4)}`,
+    });
+  }
+  return {
+    ok: failures.length === 0,
+    failures,
+  };
+}
+
+export function summarizeHistory(runs = []) {
+  const ordered = ensureArray(runs)
+    .map((run) => ({
+      runId: run.runId,
+      createdAt: run.createdAt || "",
+      benchmark: run.benchmark || run.benchmarkId || "",
+      passRate: toNumber(run.summary?.passRate),
+      avgTokens: toNumber(run.summary?.avgTokens),
+      avgTimeMs: toNumber(run.summary?.avgTimeMs),
+      totalCostUsd: toNumber(run.summary?.totalCostUsd),
+    }))
+    .sort((a, b) => String(a.createdAt).localeCompare(String(b.createdAt)));
+  const regressions = [];
+  for (let index = 1; index < ordered.length; index += 1) {
+    const previous = ordered[index - 1];
+    const current = ordered[index];
+    if (current.passRate < previous.passRate || current.avgTokens > previous.avgTokens) {
+      regressions.push({
+        fromRunId: previous.runId,
+        toRunId: current.runId,
+        passRateDelta: current.passRate - previous.passRate,
+        avgTokensDelta: current.avgTokens - previous.avgTokens,
+      });
+    }
+  }
+  return { runs: ordered, regressions };
+}
+
+function renderMatrixTable(rows = []) {
+  const header = ["Config", "Pass Rate", "Avg Tokens", "Avg Time", "Cost"];
+  const tableRows = ensureArray(rows).map((row) => [
+    row.config,
+    `${(toNumber(row.passRate) * 100).toFixed(1)}%`,
+    Math.round(toNumber(row.avgTokens)).toString(),
+    `${(toNumber(row.avgTimeMs) / 1000).toFixed(1)}s`,
+    `$${toNumber(row.cost).toFixed(4)}`,
+  ]);
+  return [header, ...tableRows].map((cells) => `| ${cells.join(" | ")} |`).join("\n");
+}
+
+function printUsage() {
+  console.log(`Bosun evaluation framework\n\nUsage:\n  bosun eval import <tasks.json>\n  bosun eval run --benchmark <file|name> [--repeats N] [--config id] [--results-dir dir]\n  bosun eval compare <run1.json|run1> <run2.json|run2> [--results-dir dir]\n  bosun eval matrix --benchmark <file|name> [--repeats N] [--configs a,b] [--results-dir dir]\n  bosun eval audit-impact --with <run|file> --without <run|file> [--results-dir dir]\n  bosun eval ci --baseline <run|file> --candidate <run|file> [--max-token-regression 0.10] [--min-pass-rate 0.85]\n  bosun eval history [--results-dir dir]\n`);
+}
+
+function getArgValue(args, flag) {
+  const inline = args.find((entry) => entry.startsWith(`${flag}=`));
+  if (inline) return inline.slice(flag.length + 1);
+  const index = args.indexOf(flag);
+  return index >= 0 ? args[index + 1] : "";
+}
+
+function hasFlag(args, flag) {
+  return args.includes(flag);
+}
+
+function parseConfigList(rawValue) {
+  return String(rawValue || "")
+    .split(",")
+    .map((value) => value.trim())
+    .filter(Boolean);
+}
+
+function resolveBenchmarkPath(input = "", options = {}) {
+  const value = String(input || "").trim();
+  if (!value) return "";
+  if (existsSync(resolve(value))) return resolve(value);
+  const benchmarksDir = resolve(options.benchmarksDir || DEFAULT_BENCHMARKS_DIR);
+  const candidates = [
+    resolve(benchmarksDir, value),
+    resolve(benchmarksDir, `${value}.json`),
+  ];
+  return candidates.find((candidate) => existsSync(candidate)) || resolve(value);
+}
+
+function resolveRunInput(input, resultsDir) {
+  const value = String(input || "").trim();
+  if (!value) throw new Error("Run identifier is required");
+  if (existsSync(resolve(value))) return loadRunFromFile(resolve(value));
+  return readRunById(resultsDir, value);
+}
+
+function createSyntheticRunner() {
+  return async ({ task, strategy, repeatIndex }) => {
+    const promptLength = String(task?.input?.prompt || "").length;
+    const taskWeight = TASK_TYPES.indexOf(task?.type) + 1;
+    const strategyWeight = Math.max(1, String(strategy?.id || "default").length % 7);
+    const success = !String(strategy?.id || "").toLowerCase().includes("fail");
+    return {
+      success,
+      durationMs: 1000 + (repeatIndex * 125) + (taskWeight * 200) + (strategyWeight * 50),
+      tokensInput: 800 + promptLength + (taskWeight * 75),
+      tokensOutput: 300 + (repeatIndex * 20) + (strategyWeight * 15),
+      filesChanged: Math.max(1, taskWeight - 1),
+      testsPassed: success ? Math.max(1, taskWeight) : Math.max(0, taskWeight - 1),
+      testsTotal: Math.max(1, taskWeight),
+      falsePositives: task?.type === "code-review" && success ? 0 : (task?.type === "code-review" ? 1 : 0),
+      contextBytes: 2048 + promptLength,
+      contextBudgetBytes: 8192,
+    };
+  };
+}
+
+export async function runEvalCli(args = []) {
+  const [command, ...rest] = args;
+  if (!command || command === "--help" || command === "-h") {
+    printUsage();
+    return { exitCode: 0 };
+  }
+
+  if (command === "import") {
+    const filePath = rest[0];
+    if (!filePath) {
+      console.error("Usage: bosun eval import <tasks.json>");
+      return { exitCode: 1 };
+    }
+    const benchmark = await importBenchmarkFromFile(filePath);
+    console.log(`Imported benchmark ${benchmark.name}: tasks=${benchmark.tasks.length}`);
+    return { exitCode: 0, benchmark };
+  }
+
+  if (command === "run") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const benchmarkPath = resolveBenchmarkPath(getArgValue(rest, "--benchmark"), {
+      benchmarksDir: getArgValue(rest, "--benchmarks-dir") || DEFAULT_BENCHMARKS_DIR,
+    });
+    const repeats = Math.max(1, toNumber(getArgValue(rest, "--repeats"), 1));
+    const configId = getArgValue(rest, "--config") || "default";
+    if (!benchmarkPath) {
+      console.error("Usage: bosun eval run --benchmark <file|name> [--repeats N] [--config id]");
+      return { exitCode: 1 };
+    }
+    const benchmark = await importBenchmarkFromFile(benchmarkPath);
+    const evaluator = new Evaluator({ resultsDir, runner: createSyntheticRunner() });
+    const run = await evaluator.evaluate({
+      benchmark,
+      repeats,
+      strategies: [{ id: configId, label: configId }],
+    });
+    console.log(JSON.stringify({ runId: run.runId, resultPath: run.resultPath, summary: run.summary }, null, 2));
+    return { exitCode: 0, run };
+  }
+
+  if (command === "compare") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const [runAPath, runBPath] = rest.filter((entry) => !/^--/.test(entry));
+    if (!runAPath || !runBPath) {
+      console.error("Usage: bosun eval compare <run1.json|run1> <run2.json|run2>");
+      return { exitCode: 1 };
+    }
+    const baseline = resolveRunInput(runAPath, resultsDir);
+    const candidate = resolveRunInput(runBPath, resultsDir);
+    const comparison = compareEvaluationRuns(baseline, candidate);
+    console.log(JSON.stringify(comparison, null, 2));
+    return { exitCode: 0, comparison };
+  }
+
+  if (command === "matrix") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const benchmarkPath = resolveBenchmarkPath(getArgValue(rest, "--benchmark"), {
+      benchmarksDir: getArgValue(rest, "--benchmarks-dir") || DEFAULT_BENCHMARKS_DIR,
+    });
+    const repeats = Math.max(1, toNumber(getArgValue(rest, "--repeats"), 1));
+    const configs = parseConfigList(getArgValue(rest, "--configs") || "default");
+    if (!benchmarkPath) {
+      console.error("Usage: bosun eval matrix --benchmark <file|name> [--repeats N] [--configs a,b]");
+      return { exitCode: 1 };
+    }
+    const benchmark = await importBenchmarkFromFile(benchmarkPath);
+    const evaluator = new Evaluator({ resultsDir, runner: createSyntheticRunner() });
+    const runs = [];
+    for (const configId of configs) {
+      const run = await evaluator.evaluate({
+        benchmark,
+        repeats,
+        strategies: [{ id: configId, label: configId }],
+      });
+      runs.push(run);
+    }
+    const matrix = summarizeMatrix(runs);
+    console.log(renderMatrixTable(matrix.rows));
+    return { exitCode: 0, runs, matrix };
+  }
+
+  if (command === "audit-impact") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const withInput = getArgValue(rest, "--with");
+    const withoutInput = getArgValue(rest, "--without");
+    if (!withInput || !withoutInput) {
+      console.error("Usage: bosun eval audit-impact --with <run|file> --without <run|file>");
+      return { exitCode: 1 };
+    }
+    const withAnnotations = resolveRunInput(withInput, resultsDir);
+    const withoutAnnotations = resolveRunInput(withoutInput, resultsDir);
+    const impact = compareAuditImpactRuns(withAnnotations, withoutAnnotations);
+    console.log(JSON.stringify(impact, null, 2));
+    return { exitCode: 0, impact };
+  }
+
+  if (command === "ci") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const baselineInput = getArgValue(rest, "--baseline");
+    const candidateInput = getArgValue(rest, "--candidate");
+    if (!baselineInput || !candidateInput) {
+      console.error("Usage: bosun eval ci --baseline <run|file> --candidate <run|file>");
+      return { exitCode: 1 };
+    }
+    const baseline = resolveRunInput(baselineInput, resultsDir);
+    const candidate = resolveRunInput(candidateInput, resultsDir);
+    const regression = detectRegression(candidate, baseline, {
+      maxTokenRegression: toNumber(getArgValue(rest, "--max-token-regression"), 0.1),
+      minPassRate: toNumber(getArgValue(rest, "--min-pass-rate"), 0.85),
+    });
+    if (!regression.ok) {
+      console.error(JSON.stringify(regression, null, 2));
+      return { exitCode: 1, regression };
+    }
+    console.log(JSON.stringify(regression, null, 2));
+    return { exitCode: 0, regression };
+  }
+
+  if (command === "history") {
+    const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;
+    const runs = listStoredEvaluationRuns(resultsDir).map((entry) => loadRunFromFile(entry.path));
+    const history = summarizeHistory(runs);
+    console.log(JSON.stringify(history, null, 2));
+    return { exitCode: 0, history };
+  }
+
+  console.error(`Unknown eval command: ${command}`);
+  printUsage();
+  return { exitCode: 1 };
+}
diff --git a/cli.mjs b/cli.mjs
index 65cb49d6..a95d65be 100755
--- a/cli.mjs
+++ b/cli.mjs
@@ -86,6 +86,7 @@ function showHelp() {
     workflow list              List declarative pipeline workflows
     workflow run <name>        Run a declarative pipeline workflow
     workflow nodes             Inspect custom workflow node plugin health
+    eval <command>             Run agent evaluation and benchmarking tools
     tui                        Launch the terminal UI
     audit <command>            Run codebase annotation audit tools (scan|generate|warn|manifest|index|trim|conformity|migrate)
     --setup                    Launch the web-based setup wizard (default)
@@ -176,6 +177,7 @@ function showHelp() {
     workflow run <name>         Run a declarative fresh-context workflow
 
     Run 'bosun workflow --help' for workflow CLI examples.
+    Run 'bosun eval --help' for evaluation CLI examples.
     Run 'bosun tui' to launch the terminal UI.
 
   STARTUP SERVICE
@@ -1511,6 +1513,18 @@ async function main() {
     process.exit(exitCode ?? 0);
   }
 
+  const evalFlagIndex = args.indexOf("--eval");
+  const evalCommandIndex = args.indexOf("eval");
+  if (evalFlagIndex >= 0 || evalCommandIndex >= 0) {
+    const commandStartIndex =
+      evalFlagIndex >= 0 && evalCommandIndex >= 0
+        ? Math.min(evalFlagIndex, evalCommandIndex)
+        : (evalCommandIndex >= 0 ? evalCommandIndex : evalFlagIndex);
+    const evalArgs = args.slice(commandStartIndex + 1);
+    const { runEvalCli } = await import("./bench/eval-framework.mjs");
+    const { exitCode } = await runEvalCli(evalArgs);
+    process.exit(exitCode ?? 0);
+  }
   // Handle --help
   if (args.includes("--help") || args.includes("-h")) {
     showHelp();
diff --git a/package.json b/package.json
index 47db7769..cd7c16c0 100644
--- a/package.json
+++ b/package.json
@@ -184,6 +184,7 @@
     "agent/skills/",
     "bench/benchmark-mode.mjs",
     "bench/benchmark-registry.mjs",
+    "bench/eval-framework.mjs",
     "bench/swebench/bosun-swebench.mjs",
     "bosun-tui.mjs",
     "bosun.config.example.json",
diff --git a/tests/cli-eval-routing.test.mjs b/tests/cli-eval-routing.test.mjs
new file mode 100644
index 00000000..09e676b5
--- /dev/null
+++ b/tests/cli-eval-routing.test.mjs
@@ -0,0 +1,26 @@
+import { readFileSync } from "node:fs";
+import { resolve } from "node:path";
+import { describe, expect, it } from "vitest";
+
+describe("cli eval routing", () => {
+  const cliSource = readFileSync(resolve(process.cwd(), "cli.mjs"), "utf8");
+
+  it("routes eval subcommands before global help handling", () => {
+    const evalRoutingIndex = cliSource.indexOf("const evalFlagIndex = args.indexOf(\"--eval\")");
+    const helpRoutingIndex = cliSource.indexOf("// Handle --help");
+
+    expect(evalRoutingIndex).toBeGreaterThan(-1);
+    expect(helpRoutingIndex).toBeGreaterThan(-1);
+    expect(evalRoutingIndex).toBeLessThan(helpRoutingIndex);
+    expect(cliSource).toContain("args.indexOf(\"eval\")");
+    expect(cliSource).toContain("const evalArgs = args.slice(commandStartIndex + 1)");
+    expect(cliSource).toContain('const { runEvalCli } = await import("./bench/eval-framework.mjs")');
+    expect(cliSource).toContain("const { exitCode } = await runEvalCli(evalArgs)");
+    expect(cliSource).toContain("process.exit(exitCode)");
+  });
+
+  it("documents eval commands in help output", () => {
+    expect(cliSource).toContain("eval <command>             Run agent evaluation and benchmarking tools");
+    expect(cliSource).toContain("Run 'bosun eval --help' for evaluation CLI examples.");
+  });
+});
diff --git a/tests/eval-framework.test.mjs b/tests/eval-framework.test.mjs
new file mode 100644
index 00000000..bf622e31
--- /dev/null
+++ b/tests/eval-framework.test.mjs
@@ -0,0 +1,258 @@
+import { afterEach, describe, expect, it } from "vitest";
+import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { resolve } from "node:path";
+
+import {
+  Benchmark,
+  CategoryMetric,
+  Evaluator,
+  NumericalMetric,
+  Task,
+  compareAuditImpactRuns,
+  compareEvaluationRuns,
+  detectRegression,
+  importBenchmarkFromFile,
+  listStoredEvaluationRuns,
+  runEvalCli,
+  summarizeHistory,
+  summarizeMatrix,
+} from "../bench/eval-framework.mjs";
+
+const tempDirs = [];
+
+afterEach(() => {
+  for (const dir of tempDirs.splice(0)) {
+    rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+function makeTempDir(prefix) {
+  const dir = mkdtempSync(resolve(tmpdir(), prefix));
+  tempDirs.push(dir);
+  return dir;
+}
+
+describe("eval framework", () => {
+  it("imports benchmark tasks from json and normalizes built-in task metadata", async () => {
+    const dir = makeTempDir("bosun-eval-import-");
+    const benchmarkPath = resolve(dir, "tasks.json");
+    writeFileSync(
+      benchmarkPath,
+      JSON.stringify({
+        name: "code-review-10",
+        tasks: [
+          {
+            id: "review-1",
+            type: "code-review",
+            input: {
+              prompt: "Review this diff",
+              repoState: { ref: "main" },
+            },
+            groundTruth: {
+              expectedFiles: ["src/app.mjs"],
+              expectedTests: ["npm test"],
+            },
+            tags: { area: "server", difficulty: "easy" },
+          },
+        ],
+      }),
+      "utf8",
+    );
+
+    const benchmark = await importBenchmarkFromFile(benchmarkPath);
+
+    expect(benchmark).toBeInstanceOf(Benchmark);
+    expect(benchmark.name).toBe("code-review-10");
+    expect(benchmark.tasks).toHaveLength(1);
+    expect(benchmark.tasks[0]).toBeInstanceOf(Task);
+    expect(benchmark.tasks[0].type).toBe("code-review");
+    expect(benchmark.tasks[0].metrics).toEqual(expect.arrayContaining(["TaskSuccess", "FalsePositiveRate"]));
+    expect(benchmark.tasks[0].tags).toEqual({ area: "server", difficulty: "easy" });
+  });
+
+  it("evaluates repeated runs, persists json results, and summarizes a matrix", async () => {
+    const dir = makeTempDir("bosun-eval-run-");
+    const benchmark = new Benchmark({
+      name: "mini-suite",
+      tasks: [
+        new Task({
+          id: "bug-1",
+          type: "bug-fix",
+          input: { prompt: "Fix the parser", repoState: { ref: "abc123" } },
+          groundTruth: { expectedFiles: ["src/parser.mjs"] },
+        }),
+      ],
+    });
+
+    const evaluator = new Evaluator({
+      resultsDir: dir,
+      metrics: [
+        new CategoryMetric("TaskSuccess", ({ outcome }) => outcome.success ? "pass" : "fail"),
+        new NumericalMetric("TokenEfficiency", ({ outcome }) => outcome.tokensInput / Math.max(outcome.filesChanged, 1)),
+        new NumericalMetric("TimeToComplete", ({ outcome }) => outcome.durationMs),
+      ],
+      runner: async ({ repeatIndex, strategy }) => ({
+        success: strategy.id === "codex-default",
+        durationMs: 1000 + (repeatIndex * 100),
+        tokensInput: 1200 + (repeatIndex * 100),
+        filesChanged: 2,
+      }),
+    });
+
+    const run = await evaluator.evaluate({
+      benchmark,
+      repeats: 3,
+      strategies: [{ id: "codex-default", label: "Codex Default" }],
+    });
+
+    expect(run.summary.totalTasks).toBe(3);
+    expect(run.summary.passRate).toBe(1);
+    expect(run.summary.avgTimeMs).toBeCloseTo(1100);
+    expect(run.summary.perTask).toHaveLength(1);
+    expect(existsSync(run.resultPath)).toBe(true);
+
+    const persisted = JSON.parse(readFileSync(run.resultPath, "utf8"));
+    expect(persisted.runId).toBe(run.runId);
+    expect(persisted.results).toHaveLength(3);
+
+    const matrix = summarizeMatrix([run]);
+    expect(matrix.rows).toHaveLength(1);
+    expect(matrix.rows[0]).toMatchObject({
+      config: "codex-default",
+      passRate: 1,
+    });
+  });
+
+  it("compares two runs with deltas and per-task regressions", () => {
+    const baseline = {
+      runId: "run-a",
+      summary: {
+        passRate: 0.5,
+        avgTokens: 5000,
+        p95Tokens: 8000,
+        avgTimeMs: 60000,
+        p95TimeMs: 90000,
+        perTask: [
+          { taskId: "task-1", passRate: 1, avgTokens: 4000, avgTimeMs: 55000 },
+          { taskId: "task-2", passRate: 0, avgTokens: 6000, avgTimeMs: 65000 },
+        ],
+      },
+      results: [
+        { taskId: "task-1", metrics: { TaskSuccess: "pass", TokenEfficiency: 4000, TimeToComplete: 55000 } },
+        { taskId: "task-2", metrics: { TaskSuccess: "fail", TokenEfficiency: 6000, TimeToComplete: 65000 } },
+      ],
+    };
+    const candidate = {
+      runId: "run-b",
+      summary: {
+        passRate: 1,
+        avgTokens: 4200,
+        p95Tokens: 7000,
+        avgTimeMs: 45000,
+        p95TimeMs: 70000,
+        perTask: [
+          { taskId: "task-1", passRate: 1, avgTokens: 3800, avgTimeMs: 43000 },
+          { taskId: "task-2", passRate: 1, avgTokens: 4600, avgTimeMs: 47000 },
+        ],
+      },
+      results: [
+        { taskId: "task-1", metrics: { TaskSuccess: "pass", TokenEfficiency: 3800, TimeToComplete: 43000 } },
+        { taskId: "task-2", metrics: { TaskSuccess: "pass", TokenEfficiency: 4600, TimeToComplete: 47000 } },
+      ],
+    };
+
+    const comparison = compareEvaluationRuns(baseline, candidate);
+
+    expect(comparison.metricDeltas.passRate.delta).toBeCloseTo(0.5);
+    expect(comparison.metricDeltas.avgTokens.delta).toBeCloseTo(-800);
+    expect(comparison.perTask.improved.map((entry) => entry.taskId)).toContain("task-2");
+    expect(comparison.perTask.regressed).toEqual([]);
+    expect(comparison.metricDeltas.passRate.significance.pValue).toBeGreaterThanOrEqual(0);
+    expect(comparison.metricDeltas.passRate.significance.pValue).toBeLessThanOrEqual(1);
+  });
+
+  it("supports audit impact, history, and ci regression helpers", () => {
+    const withoutAnnotations = {
+      runId: "without",
+      createdAt: "2026-03-25T00:00:00.000Z",
+      benchmark: "suite",
+      summary: {
+        passRate: 0.7,
+        avgTokens: 1000,
+        avgTimeMs: 5000,
+        totalCostUsd: 0.03,
+      },
+      results: [{ metrics: { FalsePositiveRate: 0.2 } }],
+    };
+    const withAnnotations = {
+      runId: "with",
+      createdAt: "2026-03-26T00:00:00.000Z",
+      benchmark: "suite",
+      summary: {
+        passRate: 0.9,
+        avgTokens: 800,
+        avgTimeMs: 4000,
+        totalCostUsd: 0.02,
+      },
+      results: [{ metrics: { FalsePositiveRate: 0.05 } }],
+    };
+
+    const impact = compareAuditImpactRuns(withAnnotations, withoutAnnotations);
+    expect(impact.rows.find((entry) => entry.metric === "Avg Tokens")?.delta).toBe(-200);
+
+    const history = summarizeHistory([withAnnotations, withoutAnnotations]);
+    expect(history.runs.map((entry) => entry.runId)).toEqual(["without", "with"]);
+
+    const regression = detectRegression(withAnnotations, withoutAnnotations, {
+      maxTokenRegression: 0.1,
+      minPassRate: 0.85,
+    });
+    expect(regression.ok).toBe(true);
+
+    const failingRegression = detectRegression(withoutAnnotations, withAnnotations, {
+      maxTokenRegression: 0.1,
+      minPassRate: 0.85,
+    });
+    expect(failingRegression.ok).toBe(false);
+  });
+
+  it("lists stored runs and supports eval cli matrix/history flows", async () => {
+    const dir = makeTempDir("bosun-eval-cli-");
+    const benchmarkPath = resolve(dir, "benchmark.json");
+    writeFileSync(
+      benchmarkPath,
+      JSON.stringify({
+        name: "mini-benchmark",
+        tasks: [
+          {
+            id: "task-1",
+            type: "code-generation",
+            input: { prompt: "Create helper" },
+          },
+        ],
+      }),
+      "utf8",
+    );
+
+    const matrixResult = await runEvalCli([
+      "matrix",
+      "--benchmark", benchmarkPath,
+      "--configs", "codex-default,copilot-sonnet",
+      "--repeats", "2",
+      "--results-dir", dir,
+    ]);
+    expect(matrixResult.exitCode).toBe(0);
+    expect(matrixResult.matrix.rows).toHaveLength(2);
+
+    const storedRuns = listStoredEvaluationRuns(dir);
+    expect(storedRuns).toHaveLength(2);
+
+    const historyResult = await runEvalCli([
+      "history",
+      "--results-dir", dir,
+    ]);
+    expect(historyResult.exitCode).toBe(0);
+    expect(historyResult.history.runs).toHaveLength(2);
+  });
+});