/*
 * bench/eval-framework.mjs — agent evaluation and benchmarking framework.
 *
 * Recovered from a whitespace-flattened `git format-patch` chunk. Original
 * patch preamble, preserved verbatim:
 *
 *   From 305b1fdeb2fcd2659816cee9cf5a5b2db97ecbed Mon Sep 17 00:00:00 2001
 *   From: jaeko44
 *   Date: Thu, 26 Mar 2026 14:44:34 +1100
 *   Subject: [PATCH 1/4] chore: auto-commit agent work (bc219832-c3a)
 *   ---
 *   bench/eval-framework.mjs | 783 ++++++++++++++++++++++++++++++++
 *   cli.mjs | 11 +
 *   package.json | 1 +
 *   tests/cli-eval-routing.test.mjs | 26 ++
 *   tests/eval-framework.test.mjs | 258 +++++++++++
 *   5 files changed, 1079 insertions(+)
 *   create mode 100644 bench/eval-framework.mjs
 *   create mode 100644 tests/cli-eval-routing.test.mjs
 *   create mode 100644 tests/eval-framework.test.mjs
 *   diff --git a/bench/eval-framework.mjs b/bench/eval-framework.mjs
 *   new file mode 100644 index 000000000..b12a3d415
 *   --- /dev/null +++ b/bench/eval-framework.mjs @@ -0,0 +1,783 @@
 */
import { mkdirSync, readFileSync, writeFileSync, readdirSync, existsSync } from "node:fs";
import { basename, resolve } from "node:path";
import { randomUUID } from "node:crypto";

/** Task types understood by the framework; anything else maps to "code-generation". */
const TASK_TYPES = Object.freeze([
  "code-generation",
  "bug-fix",
  "refactor",
  "test-writing",
  "code-review",
]);

/** Default metric names evaluated for each task type when a task lists none. */
const BUILTIN_TASK_METRICS = Object.freeze({
  "code-generation": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "ContextUtilization"],
  "bug-fix": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "TestPassRate"],
  refactor: ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "FalsePositiveRate"],
  "test-writing": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "TestPassRate"],
  "code-review": ["TaskSuccess", "TokenEfficiency", "TimeToComplete", "FalsePositiveRate"],
});

/** Names of every metric shipped with the framework. */
const BUILTIN_METRICS = Object.freeze([
  "TaskSuccess",
  "TokenEfficiency",
  "TimeToComplete",
  "TestPassRate",
  "FalsePositiveRate",
  "ContextUtilization",
]);

const DEFAULT_RESULTS_DIR = ".cache/eval-results";
const DEFAULT_BENCHMARKS_DIR = "bench/benchmarks";

/** CLI flags that consume the following argument as their value. */
const VALUE_FLAGS = Object.freeze([
  "--results-dir",
  "--benchmark",
  "--benchmarks-dir",
  "--repeats",
  "--config",
  "--configs",
  "--with",
  "--without",
  "--baseline",
  "--candidate",
  "--max-token-regression",
  "--min-pass-rate",
]);

/**
 * Arithmetic mean; falsy entries count as 0.
 * @param {unknown[]} values
 * @returns {number} 0 for a non-array or empty input.
 */
function mean(values = []) {
  if (!Array.isArray(values) || values.length === 0) return 0;
  return values.reduce((sum, value) => sum + Number(value || 0), 0) / values.length;
}

/**
 * Nearest-rank percentile of `values`.
 * @param {unknown[]} values
 * @param {number} p - Percentile in the range 0..100.
 * @returns {number} 0 for a non-array or empty input.
 */
function percentile(values = [], p = 95) {
  if (!Array.isArray(values) || values.length === 0) return 0;
  const sorted = values.map((value) => Number(value || 0)).sort((a, b) => a - b);
  const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((p / 100) * sorted.length) - 1));
  return sorted[index] ?? 0;
}

/** Wraps a value in an array; `null`/`undefined` become `[]`, arrays pass through. */
function ensureArray(value) {
  return Array.isArray(value) ? value : value == null ? [] : [value];
}

/**
 * Converts `value` to a finite number, or returns `fallback`.
 *
 * `null`, `undefined` and blank strings are treated as "missing" and yield the
 * fallback. (`Number("")` and `Number(null)` are both 0, which previously made
 * an omitted CLI flag silently override its documented default with 0.)
 *
 * @param {unknown} value
 * @param {number} [fallback=0]
 * @returns {number}
 */
function toNumber(value, fallback = 0) {
  if (value == null) return fallback;
  if (typeof value === "string" && value.trim() === "") return fallback;
  const numeric = Number(value);
  return Number.isFinite(numeric) ? numeric : fallback;
}

/** Normalises a repeat count to an integer >= 1 (missing/invalid input -> 1). */
function normalizeRepeats(value) {
  return Math.max(1, Math.floor(toNumber(value, 1)));
}

/** Lower-cases and validates a task type, defaulting to "code-generation". */
function normalizeTaskType(type) {
  const normalized = String(type || "").trim().toLowerCase();
  if (TASK_TYPES.includes(normalized)) return normalized;
  return "code-generation";
}

/** Returns a fresh copy of the default metric names for a task type. */
function defaultMetricsForTask(type) {
  return [...(BUILTIN_TASK_METRICS[normalizeTaskType(type)] || BUILTIN_METRICS)];
}

/** Reads and parses a JSON file; throws on I/O or syntax errors. */
function parseJsonFile(filePath) {
  return JSON.parse(readFileSync(resolve(filePath), "utf8"));
}

/** Trims, drops empties and removes duplicates while preserving first-seen order. */
function dedupeStrings(values = []) {
  const output = [];
  for (const value of ensureArray(values)) {
    const normalized = String(value || "").trim();
    if (!normalized || output.includes(normalized)) continue;
    output.push(normalized);
  }
  return output;
}

/** Escapes regular-expression metacharacters in `text`. */
function escapeRegex(text) {
  return String(text || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Standard normal CDF, using a five-term polynomial approximation of erf
 * evaluated in Horner form.
 * @param {number} value - z value.
 * @returns {number} Probability in [0, 1].
 */
function normalCdf(value) {
  const x = Number(value || 0);
  const sign = x < 0 ? -1 : 1;
  const abs = Math.abs(x) / Math.sqrt(2);
  const t = 1 / (1 + 0.3275911 * abs);
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const erf = 1 - (((((a5 * t) + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-(abs * abs));
  return 0.5 * (1 + sign * erf);
}

/** Two-sided p-value for a z score, clamped to [0, 1]. */
function zScoreToPValue(score) {
  const normalized = Math.abs(Number(score || 0));
  return Math.max(0, Math.min(1, 2 * (1 - normalCdf(normalized))));
}

/**
 * Unbiased sample variance (divides by n - 1).
 * @returns {number} 0 when fewer than two values are supplied.
 */
function sampleVariance(values, average) {
  if (values.length < 2) return 0;
  const squared = values.reduce((sum, value) => sum + (value - average) ** 2, 0);
  return squared / (values.length - 1);
}

/**
 * Welch-style z statistic and two-sided p-value for the difference in means
 * between two samples.
 *
 * @param {unknown[]} baseValues
 * @param {unknown[]} candidateValues
 * @returns {{score: number, pValue: number, method: string}}
 *   `method` is "insufficient-data" when either sample is empty. When the
 *   standard error is 0 the score is 0 (equal means) or +Infinity (different
 *   means) with p-value 1 or 0 respectively.
 */
function computeSignificance(baseValues = [], candidateValues = []) {
  const baseline = ensureArray(baseValues).map((value) => toNumber(value));
  const candidate = ensureArray(candidateValues).map((value) => toNumber(value));
  if (baseline.length === 0 || candidate.length === 0) {
    return { score: 0, pValue: 1, method: "insufficient-data" };
  }
  const baselineMean = mean(baseline);
  const candidateMean = mean(candidate);
  // Sample (n - 1) variance: the population formula understates the standard
  // error and therefore overstates significance for small task counts.
  const baselineVariance = sampleVariance(baseline, baselineMean);
  const candidateVariance = sampleVariance(candidate, candidateMean);
  const denominator = Math.sqrt(
    (baselineVariance / baseline.length) + (candidateVariance / candidate.length),
  );
  if (!Number.isFinite(denominator) || denominator === 0) {
    const identical = candidateMean === baselineMean;
    return {
      score: identical ? 0 : Number.POSITIVE_INFINITY,
      pValue: identical ? 1 : 0,
      method: "welch-z",
    };
  }
  const score = (candidateMean - baselineMean) / denominator;
  return { score, pValue: zScoreToPValue(score), method: "welch-z" };
}

/** Trims a metric name; non-strings are stringified first. */
function normalizeMetricName(value) {
  return String(value || "").trim();
}

/**
 * Derives a cost from token usage and the strategy's per-million-token rate.
 * @returns {number} 0 when no rate is configured or no tokens were used.
 */
function calculateCostFromOutcome(outcome = {}, strategy = {}) {
  const totalTokens = toNumber(outcome.totalTokens, toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput));
  const rate = toNumber(
    strategy.costPerMillionTokens ?? strategy.costRatePerMillion ?? strategy.tokenCostPerMillion,
    0,
  );
  if (!rate || totalTokens <= 0) return 0;
  return (totalTokens / 1_000_000) * rate;
}

/** A single benchmark task: input, ground truth, and the metrics to evaluate. */
export class Task {
  /**
   * @param {object} [raw] - Plain task description; non-objects are treated as `{}`.
   *   A missing id is replaced with a random `task-<uuid>`.
   */
  constructor(raw = {}) {
    const source = raw && typeof raw === "object" ? raw : {};
    this.id = String(source.id || "").trim() || `task-${randomUUID()}`;
    this.type = normalizeTaskType(source.type);
    this.input = source.input && typeof source.input === "object" ? { ...source.input } : { prompt: "", repoState: {} };
    this.groundTruth = source.groundTruth && typeof source.groundTruth === "object" ? { ...source.groundTruth } : {};
    const requestedMetrics = ensureArray(source.metrics);
    this.metrics = requestedMetrics.length > 0
      ? dedupeStrings(requestedMetrics.map((metric) => normalizeMetricName(metric)))
      : defaultMetricsForTask(this.type);
    this.tags = source.tags && typeof source.tags === "object" && !Array.isArray(source.tags) ? { ...source.tags } : {};
  }
}

/** A named collection of tasks. */
export class Benchmark {
  /**
   * @param {object} [raw] - Plain benchmark description; non-objects are treated as `{}`.
   *   Entries of `raw.tasks` that are not already `Task` instances are wrapped.
   */
  constructor(raw = {}) {
    const source = raw && typeof raw === "object" ? raw : {};
    this.id = String(source.id || source.name || "benchmark").trim() || "benchmark";
    this.name = String(source.name || source.id || "benchmark").trim() || "benchmark";
    this.description = String(source.description || "").trim();
    this.tasks = ensureArray(source.tasks).map((task) => (task instanceof Task ? task : new Task(task)));
    this.tags = source.tags && typeof source.tags === "object" && !Array.isArray(source.tags) ? { ...source.tags } : {};
    this.sourcePath = String(source.sourcePath || "").trim();
  }
}

/** A named evaluator function applied to one task outcome. */
export class Metric {
  /**
   * @param {string} name
   * @param {(context: object) => unknown} evaluator - Non-functions are replaced by a no-op returning null.
   * @param {{kind?: string, description?: string}} [options]
   */
  constructor(name, evaluator, options = {}) {
    this.name = normalizeMetricName(name);
    this.evaluator = typeof evaluator === "function" ? evaluator : (() => null);
    this.kind = String(options.kind || "metric");
    this.description = String(options.description || "").trim();
  }

  /** Runs the evaluator against `{ benchmark, task, strategy, repeatIndex, outcome }`. */
  evaluate(context) {
    return this.evaluator(context);
  }
}

/** Metric whose `kind` is forced to "category". */
export class CategoryMetric extends Metric {
  constructor(name, evaluator, options = {}) {
    super(name, evaluator, { ...options, kind: "category" });
  }
}

/** Metric whose `kind` is forced to "numerical". */
export class NumericalMetric extends Metric {
  constructor(name, evaluator, options = {}) {
    super(name, evaluator, { ...options, kind: "numerical" });
  }
}

/** Runner used when none is supplied: reports an all-zero, failed outcome. */
function defaultRunner() {
  return async () => ({
    success: false,
    durationMs: 0,
    tokensInput: 0,
    tokensOutput: 0,
    filesChanged: 0,
    testsPassed: 0,
    testsTotal: 0,
    falsePositives: 0,
    contextBytes: 0,
    contextBudgetBytes: 0,
    costUsd: 0,
  });
}

/**
 * Creates fresh instances of the six built-in metrics.
 * @returns {Metric[]}
 */
export function builtInMetricInstances() {
  return [
    new CategoryMetric("TaskSuccess", ({ outcome }) => (outcome.success ? "pass" : "fail")),
    // Tokens spent per changed file (at least one file is assumed).
    new NumericalMetric("TokenEfficiency", ({ outcome }) => {
      const totalTokens = toNumber(outcome.totalTokens, toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput));
      return totalTokens / Math.max(1, toNumber(outcome.filesChanged, 1));
    }),
    new NumericalMetric("TimeToComplete", ({ outcome }) => toNumber(outcome.durationMs)),
    // With no tests reported, fall back to the overall success flag.
    new NumericalMetric("TestPassRate", ({ outcome }) => {
      const total = Math.max(0, toNumber(outcome.testsTotal));
      if (total === 0) return outcome.success ? 1 : 0;
      return toNumber(outcome.testsPassed) / total;
    }),
    new NumericalMetric("FalsePositiveRate", ({ outcome }) => {
      const reviewedCount = Math.max(1, toNumber(outcome.findingsTotal, toNumber(outcome.filesChanged, 1)));
      return toNumber(outcome.falsePositives) / reviewedCount;
    }),
    new NumericalMetric("ContextUtilization", ({ outcome }) => {
      const budget = Math.max(0, toNumber(outcome.contextBudgetBytes));
      if (budget === 0) return 0;
      return toNumber(outcome.contextBytes) / budget;
    }),
  ];
}

/** Builds a name -> metric map; caller-supplied metrics override built-ins of the same name. */
function mapMetrics(metrics = []) {
  const metricMap = new Map();
  for (const metric of [...builtInMetricInstances(), ...metrics]) {
    if (!metric?.name) continue;
    metricMap.set(metric.name, metric);
  }
  return metricMap;
}

/**
 * Aggregates results per task id (across repeats and, in a multi-strategy run,
 * across strategies).
 * @returns {{taskId: string, taskType: string, repeats: number, passRate: number,
 *   avgTokens: number, avgTimeMs: number, avgCostUsd: number}[]}
 */
function buildTaskSummaryEntries(results = []) {
  const byTask = new Map();
  for (const result of ensureArray(results)) {
    const taskId = String(result?.taskId || "").trim();
    if (!taskId) continue;
    if (!byTask.has(taskId)) {
      byTask.set(taskId, {
        taskId,
        taskType: result.taskType || "code-generation",
        repeats: 0,
        passCount: 0,
        tokenValues: [],
        timeValues: [],
        costValues: [],
      });
    }
    const entry = byTask.get(taskId);
    entry.repeats += 1;
    if (result.metrics?.TaskSuccess === "pass") entry.passCount += 1;
    entry.tokenValues.push(toNumber(result.outcome?.totalTokens, toNumber(result.outcome?.tokensInput) + toNumber(result.outcome?.tokensOutput)));
    entry.timeValues.push(toNumber(result.outcome?.durationMs));
    entry.costValues.push(toNumber(result.outcome?.costUsd));
  }
  return [...byTask.values()].map((entry) => ({
    taskId: entry.taskId,
    taskType: entry.taskType,
    repeats: entry.repeats,
    passRate: entry.repeats > 0 ? entry.passCount / entry.repeats : 0,
    avgTokens: mean(entry.tokenValues),
    avgTimeMs: mean(entry.timeValues),
    avgCostUsd: mean(entry.costValues),
  }));
}

/** Computes run-level aggregates (pass rate, token/time/cost stats) plus per-task entries. */
function summarizeRun(run) {
  const results = ensureArray(run.results);
  const timeValues = [];
  const tokenValues = [];
  const costValues = [];
  let passCount = 0;
  for (const result of results) {
    const metrics = result.metrics || {};
    if (metrics.TaskSuccess === "pass") passCount += 1;
    // Prefer the evaluated metric; fall back to the raw duration when the
    // task did not request TimeToComplete.
    timeValues.push(toNumber(metrics.TimeToComplete, toNumber(result.outcome?.durationMs)));
    tokenValues.push(toNumber(result.outcome?.totalTokens, toNumber(result.outcome?.tokensInput) + toNumber(result.outcome?.tokensOutput)));
    costValues.push(toNumber(result.outcome?.costUsd));
  }
  return {
    totalTasks: results.length,
    passRate: results.length > 0 ? passCount / results.length : 0,
    avgTokens: mean(tokenValues),
    p95Tokens: percentile(tokenValues, 95),
    avgTimeMs: mean(timeValues),
    p95TimeMs: percentile(timeValues, 95),
    totalCostUsd: costValues.reduce((sum, value) => sum + value, 0),
    avgCostUsd: mean(costValues),
    perTask: buildTaskSummaryEntries(results),
  };
}

/**
 * Normalises a strategy description (object or bare id string) to a fixed shape.
 * Accepts `costPerMillionTokens`, `costRatePerMillion` or `tokenCostPerMillion`
 * for the token rate, matching what `calculateCostFromOutcome` reads.
 */
function normalizeStrategy(raw = {}, index = 0) {
  const strategy = raw && typeof raw === "object" ? { ...raw } : { id: String(raw || "") };
  const fallbackId = `strategy-${index + 1}`;
  const id = String(strategy.id || strategy.name || fallbackId).trim() || fallbackId;
  return {
    id,
    label: String(strategy.label || strategy.name || id).trim() || id,
    sdk: String(strategy.sdk || "").trim(),
    model: String(strategy.model || "").trim(),
    promptStrategy: String(strategy.promptStrategy || strategy.prompt || "").trim(),
    codebaseProfile: String(strategy.codebaseProfile || strategy.repoProfile || "").trim(),
    annotated: strategy.annotated === true,
    unannotated: strategy.unannotated === true,
    config: strategy.config && typeof strategy.config === "object" ? { ...strategy.config } : {},
    costPerMillionTokens: toNumber(
      strategy.costPerMillionTokens ?? strategy.costRatePerMillion ?? strategy.tokenCostPerMillion,
      0,
    ),
    metadata: strategy.metadata && typeof strategy.metadata === "object" ? { ...strategy.metadata } : {},
  };
}

/** Loads a stored run from an explicit file path. */
function loadRunFromFile(runPath) {
  return parseJsonFile(runPath);
}

/** Loads a stored run by id from `resultsDir`. */
function readRunById(resultsDir, runId) {
  return loadRunFromFile(resolveEvalResultPath(resultsDir, runId));
}

/** Runs every (strategy, task, repeat) combination and persists the result as JSON. */
export class Evaluator {
  /**
   * @param {object} [options]
   * @param {string} [options.resultsDir] - Output directory (default ".cache/eval-results").
   * @param {Metric[]} [options.metrics] - Extra metrics; override built-ins by name.
   * @param {Function} [options.runner] - `async ({benchmark, task, strategy, repeatIndex}) => outcome`.
   * @param {number} [options.parallelism] - Recorded on the run only; execution
   *   in `evaluate` is sequential.
   * @param {{writeRun?: Function}} [options.storageAdapter] - Optional extra sink.
   */
  constructor(options = {}) {
    const settings = options ?? {};
    this.resultsDir = resolve(settings.resultsDir || DEFAULT_RESULTS_DIR);
    this.metrics = ensureArray(settings.metrics);
    this.runner = settings.runner || defaultRunner();
    this.parallelism = Math.max(1, toNumber(settings.parallelism, 1));
    this.storageAdapter = settings.storageAdapter || null;
  }

  /**
   * Executes the benchmark and writes `<resultsDir>/<runId>.json`.
   *
   * @param {object} [params]
   * @param {Benchmark|object} [params.benchmark]
   * @param {number} [params.repeats=1] - Normalised to an integer >= 1.
   * @param {Array<object|string>} [params.strategies] - Defaults to a single "default" strategy.
   * @returns {Promise<object>} The persisted run plus `resultPath`.
   * @throws Propagates runner, metric, file-system and storage-adapter errors.
   */
  async evaluate({ benchmark, repeats = 1, strategies = [] } = {}) {
    const normalizedBenchmark = benchmark instanceof Benchmark ? benchmark : new Benchmark(benchmark || {});
    const requestedStrategies = ensureArray(strategies);
    const normalizedStrategies = requestedStrategies.length > 0
      ? requestedStrategies.map((strategy, index) => normalizeStrategy(strategy, index))
      : [normalizeStrategy({ id: "default", label: "Default" })];
    // One normalised value drives both the loop and the recorded count, so
    // NaN or fractional input can no longer make them disagree.
    const repeatCount = normalizeRepeats(repeats);
    const metricMap = mapMetrics(this.metrics);
    const runId = `eval-${Date.now()}-${randomUUID()}`;
    const results = [];
    let resultIndex = 0;

    for (const strategy of normalizedStrategies) {
      for (const task of normalizedBenchmark.tasks) {
        for (let repeatIndex = 0; repeatIndex < repeatCount; repeatIndex += 1) {
          const outcome = {
            ...(await this.runner({ benchmark: normalizedBenchmark, task, strategy, repeatIndex })),
          };
          outcome.totalTokens = toNumber(
            outcome.totalTokens,
            toNumber(outcome.tokensInput) + toNumber(outcome.tokensOutput),
          );
          // A runner-reported cost wins; otherwise derive it from the token rate.
          outcome.costUsd = toNumber(outcome.costUsd, calculateCostFromOutcome(outcome, strategy));

          const metricResults = {};
          for (const metricName of task.metrics) {
            const metric = metricMap.get(metricName);
            if (!metric) continue; // unknown metric names are skipped
            metricResults[metricName] = metric.evaluate({
              benchmark: normalizedBenchmark,
              task,
              strategy,
              repeatIndex,
              outcome,
            });
          }
          results.push({
            resultIndex,
            strategyId: strategy.id,
            strategyLabel: strategy.label,
            strategy,
            taskId: task.id,
            taskType: task.type,
            repeatIndex,
            metrics: metricResults,
            outcome,
          });
          resultIndex += 1;
        }
      }
    }

    const run = {
      runId,
      benchmarkId: normalizedBenchmark.id,
      benchmark: normalizedBenchmark.name,
      benchmarkDescription: normalizedBenchmark.description,
      repeats: repeatCount,
      strategyIds: normalizedStrategies.map((strategy) => strategy.id),
      strategies: normalizedStrategies,
      parallelism: this.parallelism,
      createdAt: new Date().toISOString(),
      results,
      summary: summarizeRun({ results }),
    };

    mkdirSync(this.resultsDir, { recursive: true });
    const resultPath = resolve(this.resultsDir, `${runId}.json`);
    writeFileSync(resultPath, `${JSON.stringify(run, null, 2)}\n`, "utf8");
    if (this.storageAdapter && typeof this.storageAdapter.writeRun === "function") {
      await this.storageAdapter.writeRun(run, resultPath);
    }
    return { ...run, resultPath };
  }
}

/**
 * Loads a benchmark definition from a JSON file.
 * @param {string} filePath
 * @returns {Promise<Benchmark>} Benchmark with `sourcePath` set to the absolute path.
 * @throws On unreadable files or invalid JSON.
 */
export async function importBenchmarkFromFile(filePath) {
  const raw = parseJsonFile(filePath);
  return new Benchmark({ ...raw, sourcePath: resolve(filePath) });
}

/** Collects the per-task values that actually exist for `key` in a run summary. */
function collectPerTaskValues(run, key) {
  return ensureArray(run?.summary?.perTask)
    .map((entry) => entry?.[key])
    .filter((value) => value != null && Number.isFinite(Number(value)))
    .map((value) => Number(value));
}

/**
 * Compares two runs: summary deltas with significance, plus per-task buckets.
 *
 * Significance is computed from per-task values. Summary keys that have no
 * per-task counterpart report `method: "insufficient-data"` rather than a
 * statistic computed from zeros. Tasks present in only one run are omitted
 * from the per-task buckets.
 *
 * @param {object} baseline
 * @param {object} candidate
 * @returns {{baselineRunId: string, candidateRunId: string, metricDeltas: object,
 *   perTask: {improved: object[], regressed: object[], unchanged: object[]}}}
 */
export function compareEvaluationRuns(baseline, candidate) {
  const metricKeys = ["passRate", "avgTokens", "p95Tokens", "avgTimeMs", "p95TimeMs", "avgCostUsd", "totalCostUsd"];
  const metricDeltas = {};
  for (const key of metricKeys) {
    const baseValue = toNumber(baseline?.summary?.[key]);
    const candidateValue = toNumber(candidate?.summary?.[key]);
    metricDeltas[key] = {
      baseline: baseValue,
      candidate: candidateValue,
      delta: candidateValue - baseValue,
      significance: computeSignificance(
        collectPerTaskValues(baseline, key),
        collectPerTaskValues(candidate, key),
      ),
    };
  }

  const baselineMap = new Map(ensureArray(baseline?.summary?.perTask).map((entry) => [entry.taskId, entry]));
  const candidateMap = new Map(ensureArray(candidate?.summary?.perTask).map((entry) => [entry.taskId, entry]));
  const improved = [];
  const regressed = [];
  const unchanged = [];

  for (const [taskId, baselineResult] of baselineMap.entries()) {
    const candidateResult = candidateMap.get(taskId);
    if (!candidateResult) continue;
    const passDelta = toNumber(candidateResult.passRate) - toNumber(baselineResult.passRate);
    const record = {
      taskId,
      baseline: baselineResult,
      candidate: candidateResult,
      passRateDelta: passDelta,
      avgTokensDelta: toNumber(candidateResult.avgTokens) - toNumber(baselineResult.avgTokens),
      avgTimeMsDelta: toNumber(candidateResult.avgTimeMs) - toNumber(baselineResult.avgTimeMs),
    };
    // Bucketing is driven by pass rate only; token/time deltas are informational.
    if (passDelta > 0) improved.push(record);
    else if (passDelta < 0) regressed.push(record);
    else unchanged.push(record);
  }

  return {
    baselineRunId: baseline?.runId || "",
    candidateRunId: candidate?.runId || "",
    metricDeltas,
    perTask: { improved, regressed, unchanged },
  };
}

/**
 * Flattens runs into matrix rows (one row per run, keyed by its first strategy id).
 * @returns {{rows: object[]}}
 */
export function summarizeMatrix(runs = []) {
  const rows = ensureArray(runs).map((run) => ({
    config: ensureArray(run.strategyIds)[0] || run.strategyId || "default",
    passRate: toNumber(run.summary?.passRate),
    avgTokens: toNumber(run.summary?.avgTokens),
    p95Tokens: toNumber(run.summary?.p95Tokens),
    avgTimeMs: toNumber(run.summary?.avgTimeMs),
    p95TimeMs: toNumber(run.summary?.p95TimeMs),
    cost: toNumber(run.summary?.totalCostUsd, toNumber(run.summary?.avgCostUsd)),
  }));
  return { rows };
}

/** Mean of one metric over the results that actually recorded it (0 if none did). */
function meanRecordedMetric(run, metricName) {
  const recorded = ensureArray(run?.results)
    .map((entry) => entry?.metrics?.[metricName])
    .filter((value) => value != null)
    .map((value) => toNumber(value));
  return mean(recorded);
}

/**
 * Contrasts an annotated run against an unannotated one.
 * @returns {{comparison: object, rows: {metric: string, withAnnotations: number,
 *   withoutAnnotations: number, delta: number}[]}}
 */
export function compareAuditImpactRuns(withAnnotations, withoutAnnotations) {
  const comparison = compareEvaluationRuns(withoutAnnotations, withAnnotations);
  const rows = [
    ["Pass Rate", withAnnotations?.summary?.passRate, withoutAnnotations?.summary?.passRate],
    ["Avg Tokens", withAnnotations?.summary?.avgTokens, withoutAnnotations?.summary?.avgTokens],
    ["Avg Time (ms)", withAnnotations?.summary?.avgTimeMs, withoutAnnotations?.summary?.avgTimeMs],
    [
      "False Positive Rate",
      meanRecordedMetric(withAnnotations, "FalsePositiveRate"),
      meanRecordedMetric(withoutAnnotations, "FalsePositiveRate"),
    ],
  ].map(([metric, withValue, withoutValue]) => ({
    metric,
    withAnnotations: toNumber(withValue),
    withoutAnnotations: toNumber(withoutValue),
    delta: toNumber(withValue) - toNumber(withoutValue),
  }));
  return { comparison, rows };
}

/**
 * Maps a run id to its JSON path inside `resultsDir`. Only the base name of
 * `runId` is used, so directory components in the id are discarded.
 */
export function resolveEvalResultPath(resultsDir, runId) {
  const stem = basename(String(runId || "").replace(/\.json$/i, ""));
  return resolve(resultsDir || DEFAULT_RESULTS_DIR, `${stem}.json`);
}

/**
 * Lists stored runs in `resultsDir`, sorted by file name.
 * @returns {{runId: string, path: string}[]} Empty when the directory is missing.
 */
export function listStoredEvaluationRuns(resultsDir = DEFAULT_RESULTS_DIR) {
  const dir = resolve(resultsDir);
  if (!existsSync(dir)) return [];
  const jsonExtension = /\.json$/i;
  return readdirSync(dir)
    .filter((name) => jsonExtension.test(name))
    .sort()
    .map((name) => ({
      runId: name.replace(jsonExtension, ""),
      path: resolve(dir, name),
    }));
}

/**
 * Checks a run against regression thresholds.
 *
 * @param {object} currentRun
 * @param {object} baselineRun
 * @param {{maxTokenRegression?: number, minPassRate?: number}} [thresholds]
 *   `maxTokenRegression` is a ratio relative to the baseline average (0.10 = +10%).
 *   A missing threshold disables that check.
 * @returns {{ok: boolean, failures: {metric: string, actual: number, threshold: number, message: string}[]}}
 */
export function detectRegression(currentRun, baselineRun, thresholds = {}) {
  const limits = thresholds ?? {};
  const maxTokenRegression = toNumber(limits.maxTokenRegression, Infinity);
  const minPassRate = limits.minPassRate == null ? -Infinity : toNumber(limits.minPassRate);
  const baselineTokens = toNumber(baselineRun?.summary?.avgTokens);
  const tokenRegression = toNumber(currentRun?.summary?.avgTokens) - baselineTokens;
  // A zero baseline cannot be divided by: any increase is an unbounded regression.
  const tokenRegressionRatio = baselineTokens === 0
    ? (tokenRegression > 0 ? Infinity : 0)
    : tokenRegression / Math.max(1e-9, baselineTokens);
  const passRate = toNumber(currentRun?.summary?.passRate);
  const failures = [];
  if (Number.isFinite(maxTokenRegression) && tokenRegressionRatio > maxTokenRegression) {
    failures.push({
      metric: "avgTokens",
      actual: tokenRegressionRatio,
      threshold: maxTokenRegression,
      message: `Average token regression ${tokenRegressionRatio.toFixed(4)} exceeds ${maxTokenRegression.toFixed(4)}`,
    });
  }
  if (passRate < minPassRate) {
    failures.push({
      metric: "passRate",
      actual: passRate,
      threshold: minPassRate,
      message: `Pass rate ${passRate.toFixed(4)} is below ${minPassRate.toFixed(4)}`,
    });
  }
  return { ok: failures.length === 0, failures };
}

/**
 * Orders runs by `createdAt` and flags each consecutive pair where the pass
 * rate dropped or the average token count rose.
 * @returns {{runs: object[], regressions: object[]}}
 */
export function summarizeHistory(runs = []) {
  const ordered = ensureArray(runs)
    .map((run) => ({
      runId: run.runId,
      createdAt: run.createdAt || "",
      benchmark: run.benchmark || run.benchmarkId || "",
      passRate: toNumber(run.summary?.passRate),
      avgTokens: toNumber(run.summary?.avgTokens),
      avgTimeMs: toNumber(run.summary?.avgTimeMs),
      totalCostUsd: toNumber(run.summary?.totalCostUsd),
    }))
    .sort((a, b) => String(a.createdAt).localeCompare(String(b.createdAt)));
  const regressions = [];
  for (let index = 1; index < ordered.length; index += 1) {
    const previous = ordered[index - 1];
    const current = ordered[index];
    if (current.passRate < previous.passRate || current.avgTokens > previous.avgTokens) {
      regressions.push({
        fromRunId: previous.runId,
        toRunId: current.runId,
        passRateDelta: current.passRate - previous.passRate,
        avgTokensDelta: current.avgTokens - previous.avgTokens,
      });
    }
  }
  return { runs: ordered, regressions };
}

/** Renders matrix rows as pipe-delimited text lines (header first). */
function renderMatrixTable(rows = []) {
  const header = ["Config", "Pass Rate", "Avg Tokens", "Avg Time", "Cost"];
  const tableRows = ensureArray(rows).map((row) => [
    row.config,
    `${(toNumber(row.passRate) * 100).toFixed(1)}%`,
    Math.round(toNumber(row.avgTokens)).toString(),
    `${(toNumber(row.avgTimeMs) / 1000).toFixed(1)}s`,
    `$${toNumber(row.cost).toFixed(4)}`,
  ]);
  return [header, ...tableRows].map((cells) => `| ${cells.join(" | ")} |`).join("\n");
}

/** Prints CLI usage to stdout. */
function printUsage() {
  console.log([
    "Bosun evaluation framework",
    "",
    "Usage:",
    "  bosun eval import <file>",
    "  bosun eval run --benchmark <name|path> [--repeats N] [--config id] [--results-dir dir]",
    "  bosun eval compare <runA> <runB> [--results-dir dir]",
    "  bosun eval matrix --benchmark <name|path> [--repeats N] [--configs a,b] [--results-dir dir]",
    "  bosun eval audit-impact --with <run> --without <run> [--results-dir dir]",
    "  bosun eval ci --baseline <run> --candidate <run> [--max-token-regression 0.10] [--min-pass-rate 0.85]",
    "  bosun eval history [--results-dir dir]",
    "",
  ].join("\n"));
}

/**
 * Reads the value of `flag` from `args`, accepting `--flag=value` and `--flag value`.
 * @returns {string} "" when the flag is absent, is the last argument, or is
 *   immediately followed by another `--flag`.
 */
function getArgValue(args, flag) {
  const prefix = `${flag}=`;
  const inline = args.find((entry) => String(entry).startsWith(prefix));
  if (inline !== undefined) return String(inline).slice(prefix.length);
  const index = args.indexOf(flag);
  if (index < 0) return "";
  const next = args[index + 1];
  if (next == null || String(next).startsWith("--")) return "";
  return String(next);
}

/** True when `flag` appears verbatim in `args`. */
function hasFlag(args, flag) {
  return args.includes(flag);
}

/**
 * Returns the positional arguments, skipping `--flags` and the values consumed
 * by value-taking flags (so `--results-dir out runA runB` yields runA, runB).
 */
function getPositionalArgs(args, valueFlags = VALUE_FLAGS) {
  const positional = [];
  for (let index = 0; index < args.length; index += 1) {
    const entry = String(args[index]);
    if (!entry.startsWith("--")) {
      positional.push(entry);
      continue;
    }
    if (entry.includes("=") || !valueFlags.includes(entry)) continue;
    const next = args[index + 1];
    if (next != null && !String(next).startsWith("--")) index += 1;
  }
  return positional;
}

/** Splits a comma-separated list, trimming entries and dropping empties. */
function parseConfigList(rawValue) {
  return String(rawValue || "")
    .split(",")
    .map((value) => value.trim())
    .filter(Boolean);
}

/**
 * Resolves a benchmark reference: an existing path wins, then
 * `<benchmarksDir>/<value>` and `<benchmarksDir>/<value>.json`.
 * @returns {string} "" for blank input; otherwise an absolute path (which may not exist).
 */
function resolveBenchmarkPath(input = "", options = {}) {
  const value = String(input || "").trim();
  if (!value) return "";
  if (existsSync(resolve(value))) return resolve(value);
  const benchmarksDir = resolve(options.benchmarksDir || DEFAULT_BENCHMARKS_DIR);
  const candidates = [
    resolve(benchmarksDir, value),
    resolve(benchmarksDir, `${value}.json`),
  ];
  return candidates.find((candidate) => existsSync(candidate)) || resolve(value);
}

/**
 * Loads a run given either a file path or a run id stored in `resultsDir`.
 * @throws {Error} When `input` is blank, or the file is missing/invalid.
 */
function resolveRunInput(input, resultsDir) {
  const value = String(input || "").trim();
  if (!value) throw new Error("Run identifier is required");
  if (existsSync(resolve(value))) return loadRunFromFile(resolve(value));
  return readRunById(resultsDir, value);
}

/**
 * Deterministic stand-in runner used by the CLI: outcomes are derived from
 * prompt length, task type and strategy id. A strategy id containing "fail"
 * produces failing outcomes.
 */
function createSyntheticRunner() {
  return async ({ task, strategy, repeatIndex }) => {
    const promptLength = String(task?.input?.prompt || "").length;
    const taskWeight = TASK_TYPES.indexOf(task?.type) + 1;
    const strategyId = String(strategy?.id || "");
    const strategyWeight = Math.max(1, (strategyId || "default").length % 7);
    const success = !strategyId.toLowerCase().includes("fail");
    const isReview = task?.type === "code-review";
    return {
      success,
      durationMs: 1000 + (repeatIndex * 125) + (taskWeight * 200) + (strategyWeight * 50),
      tokensInput: 800 + promptLength + (taskWeight * 75),
      tokensOutput: 300 + (repeatIndex * 20) + (strategyWeight * 15),
      filesChanged: Math.max(1, taskWeight - 1),
      testsPassed: success ? Math.max(1, taskWeight) : Math.max(0, taskWeight - 1),
      testsTotal: Math.max(1, taskWeight),
      falsePositives: isReview && !success ? 1 : 0,
      contextBytes: 2048 + promptLength,
      contextBudgetBytes: 8192,
    };
  };
}

/**
 * Entry point for `bosun eval <command>`.
 *
 * @param {string[]} args - Arguments following `eval`.
 * @returns {Promise<{exitCode: number}>} Exit code plus command-specific payload
 *   (`benchmark`, `run`, `comparison`, `runs`/`matrix`, `impact`, `regression`, `history`).
 * @throws Propagates file-system and JSON errors from loading benchmarks or runs.
 */
export async function runEvalCli(args = []) {
  const [command, ...rest] = args;
  if (!command || command === "--help" || command === "-h") {
    printUsage();
    return { exitCode: 0 };
  }

  const resultsDir = getArgValue(rest, "--results-dir") || DEFAULT_RESULTS_DIR;

  if (command === "import") {
    const filePath = rest[0];
    if (!filePath) {
      console.error("Usage: bosun eval import <file>");
      return { exitCode: 1 };
    }
    const benchmark = await importBenchmarkFromFile(filePath);
    console.log(`Imported benchmark ${benchmark.name}: tasks=${benchmark.tasks.length}`);
    return { exitCode: 0, benchmark };
  }

  if (command === "run") {
    const benchmarkPath = resolveBenchmarkPath(getArgValue(rest, "--benchmark"), {
      benchmarksDir: getArgValue(rest, "--benchmarks-dir") || DEFAULT_BENCHMARKS_DIR,
    });
    const repeats = normalizeRepeats(getArgValue(rest, "--repeats"));
    const configId = getArgValue(rest, "--config") || "default";
    if (!benchmarkPath) {
      console.error("Usage: bosun eval run --benchmark <name|path> [--repeats N] [--config id]");
      return { exitCode: 1 };
    }
    const benchmark = await importBenchmarkFromFile(benchmarkPath);
    const evaluator = new Evaluator({ resultsDir, runner: createSyntheticRunner() });
    const run = await evaluator.evaluate({
      benchmark,
      repeats,
      strategies: [{ id: configId, label: configId }],
    });
    console.log(JSON.stringify({ runId: run.runId, resultPath: run.resultPath, summary: run.summary }, null, 2));
    return { exitCode: 0, run };
  }

  if (command === "compare") {
    const [runAPath, runBPath] = getPositionalArgs(rest);
    if (!runAPath || !runBPath) {
      console.error("Usage: bosun eval compare <runA> <runB>");
      return { exitCode: 1 };
    }
    const baseline = resolveRunInput(runAPath, resultsDir);
    const candidate = resolveRunInput(runBPath, resultsDir);
    const comparison = compareEvaluationRuns(baseline, candidate);
    console.log(JSON.stringify(comparison, null, 2));
    return { exitCode: 0, comparison };
  }

  if (command === "matrix") {
    const benchmarkPath = resolveBenchmarkPath(getArgValue(rest, "--benchmark"), {
      benchmarksDir: getArgValue(rest, "--benchmarks-dir") || DEFAULT_BENCHMARKS_DIR,
    });
    const repeats = normalizeRepeats(getArgValue(rest, "--repeats"));
    const configs = parseConfigList(getArgValue(rest, "--configs") || "default");
    if (!benchmarkPath) {
      console.error("Usage: bosun eval matrix --benchmark <name|path> [--repeats N] [--configs a,b]");
      return { exitCode: 1 };
    }
    const benchmark = await importBenchmarkFromFile(benchmarkPath);
    const evaluator = new Evaluator({ resultsDir, runner: createSyntheticRunner() });
    const runs = [];
    // Sequential on purpose: each configuration is written as its own run file.
    for (const configId of configs) {
      runs.push(await evaluator.evaluate({
        benchmark,
        repeats,
        strategies: [{ id: configId, label: configId }],
      }));
    }
    const matrix = summarizeMatrix(runs);
    console.log(renderMatrixTable(matrix.rows));
    return { exitCode: 0, runs, matrix };
  }

  if (command === "audit-impact") {
    const withInput = getArgValue(rest, "--with");
    const withoutInput = getArgValue(rest, "--without");
    if (!withInput || !withoutInput) {
      console.error("Usage: bosun eval audit-impact --with <run> --without <run>");
      return { exitCode: 1 };
    }
    const withAnnotations = resolveRunInput(withInput, resultsDir);
    const withoutAnnotations = resolveRunInput(withoutInput, resultsDir);
    const impact = compareAuditImpactRuns(withAnnotations, withoutAnnotations);
    console.log(JSON.stringify(impact, null, 2));
    return { exitCode: 0, impact };
  }

  if (command === "ci") {
    const baselineInput = getArgValue(rest, "--baseline");
    const candidateInput = getArgValue(rest, "--candidate");
    if (!baselineInput || !candidateInput) {
      console.error("Usage: bosun eval ci --baseline <run> --candidate <run>");
      return { exitCode: 1 };
    }
    const baseline = resolveRunInput(baselineInput, resultsDir);
    const candidate = resolveRunInput(candidateInput, resultsDir);
    // Omitted flags yield "" from getArgValue, which toNumber maps to the defaults.
    const regression = detectRegression(candidate, baseline, {
      maxTokenRegression: toNumber(getArgValue(rest, "--max-token-regression"), 0.1),
      minPassRate: toNumber(getArgValue(rest, "--min-pass-rate"), 0.85),
    });
    if (!regression.ok) {
      console.error(JSON.stringify(regression, null, 2));
      return { exitCode: 1, regression };
    }
    console.log(JSON.stringify(regression, null, 2));
    return { exitCode: 0, regression };
  }

  if (command === "history") {
    const runs = listStoredEvaluationRuns(resultsDir).map((entry) => loadRunFromFile(entry.path));
    const history = summarizeHistory(runs);
    console.log(JSON.stringify(history, null, 2));
    return { exitCode: 0, history };
  }

  console.error(`Unknown eval command: ${command}`);
  printUsage();
  return { exitCode: 1 };
}

// ---------------------------------------------------------------------------
// Start of the cli.mjs diff that shared the last flattened source line with
// this module, preserved verbatim (the rest of that diff follows this block):
//
// diff --git a/cli.mjs b/cli.mjs index 32ed20823..94b892704 100755 --- a/cli.mjs +++ b/cli.mjs @@ -85,6 +85,7 @@ function showHelp() { workflow list List declarative pipeline workflows workflow run Run a declarative pipeline workflow workflow nodes Inspect custom workflow node plugin health + eval Run agent evaluation and benchmarking tools tui Launch the terminal UI audit Run codebase annotation audit tools (scan|generate|warn|manifest|index|trim|conformity|migrate) --setup Launch the web-based setup wizard (default) @@ -174,6 +175,7 @@ function showHelp() { workflow run Run a declarative fresh-context workflow Run 'bosun workflow --help' for workflow CLI examples. + Run 'bosun eval --help' for evaluation CLI examples.
Run 'bosun tui' to launch the terminal UI. STARTUP SERVICE @@ -1446,6 +1448,15 @@ async function main() { process.exit(exitCode ?? 0); } + const evalFlagIndex = args.indexOf("--eval"); + const evalCommandIndex = args.indexOf("eval"); + if (evalFlagIndex >= 0 || evalCommandIndex >= 0) { + const commandStartIndex = evalCommandIndex >= 0 ? evalCommandIndex : evalFlagIndex; + const evalArgs = args.slice(commandStartIndex + 1); + const { runEvalCli } = await import("./bench/eval-framework.mjs"); + const { exitCode } = await runEvalCli(evalArgs); + process.exit(exitCode); + } // Handle --help if (args.includes("--help") || args.includes("-h")) { showHelp(); diff --git a/package.json b/package.json index 89a84e96a..19229150b 100644 --- a/package.json +++ b/package.json @@ -182,6 +182,7 @@ "agent/skills/skill-codebase-audit.md", "bench/benchmark-mode.mjs", "bench/benchmark-registry.mjs", + "bench/eval-framework.mjs", "bench/swebench/bosun-swebench.mjs", "bosun-tui.mjs", "bosun.config.example.json", diff --git a/tests/cli-eval-routing.test.mjs b/tests/cli-eval-routing.test.mjs new file mode 100644 index 000000000..09e676b57 --- /dev/null +++ b/tests/cli-eval-routing.test.mjs @@ -0,0 +1,26 @@ +import { readFileSync } from "node:fs"; +import { resolve } from "node:path"; +import { describe, expect, it } from "vitest"; + +describe("cli eval routing", () => { + const cliSource = readFileSync(resolve(process.cwd(), "cli.mjs"), "utf8"); + + it("routes eval subcommands before global help handling", () => { + const evalRoutingIndex = cliSource.indexOf("const evalFlagIndex = args.indexOf(\"--eval\")"); + const helpRoutingIndex = cliSource.indexOf("// Handle --help"); + + expect(evalRoutingIndex).toBeGreaterThan(-1); + expect(helpRoutingIndex).toBeGreaterThan(-1); + expect(evalRoutingIndex).toBeLessThan(helpRoutingIndex); + expect(cliSource).toContain("args.indexOf(\"eval\")"); + expect(cliSource).toContain("const evalArgs = args.slice(commandStartIndex + 1)"); + 
expect(cliSource).toContain('const { runEvalCli } = await import("./bench/eval-framework.mjs")'); + expect(cliSource).toContain("const { exitCode } = await runEvalCli(evalArgs)"); + expect(cliSource).toContain("process.exit(exitCode)"); + }); + + it("documents eval commands in help output", () => { + expect(cliSource).toContain("eval Run agent evaluation and benchmarking tools"); + expect(cliSource).toContain("Run 'bosun eval --help' for evaluation CLI examples."); + }); +}); diff --git a/tests/eval-framework.test.mjs b/tests/eval-framework.test.mjs new file mode 100644 index 000000000..bf622e31e --- /dev/null +++ b/tests/eval-framework.test.mjs @@ -0,0 +1,258 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { resolve } from "node:path"; + +import { + Benchmark, + CategoryMetric, + Evaluator, + NumericalMetric, + Task, + compareAuditImpactRuns, + compareEvaluationRuns, + detectRegression, + importBenchmarkFromFile, + listStoredEvaluationRuns, + runEvalCli, + summarizeHistory, + summarizeMatrix, +} from "../bench/eval-framework.mjs"; + +const tempDirs = []; + +afterEach(() => { + for (const dir of tempDirs.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } +}); + +function makeTempDir(prefix) { + const dir = mkdtempSync(resolve(tmpdir(), prefix)); + tempDirs.push(dir); + return dir; +} + +describe("eval framework", () => { + it("imports benchmark tasks from json and normalizes built-in task metadata", async () => { + const dir = makeTempDir("bosun-eval-import-"); + const benchmarkPath = resolve(dir, "tasks.json"); + writeFileSync( + benchmarkPath, + JSON.stringify({ + name: "code-review-10", + tasks: [ + { + id: "review-1", + type: "code-review", + input: { + prompt: "Review this diff", + repoState: { ref: "main" }, + }, + groundTruth: { + expectedFiles: ["src/app.mjs"], + expectedTests: ["npm test"], + }, + 
tags: { area: "server", difficulty: "easy" }, + }, + ], + }), + "utf8", + ); + + const benchmark = await importBenchmarkFromFile(benchmarkPath); + + expect(benchmark).toBeInstanceOf(Benchmark); + expect(benchmark.name).toBe("code-review-10"); + expect(benchmark.tasks).toHaveLength(1); + expect(benchmark.tasks[0]).toBeInstanceOf(Task); + expect(benchmark.tasks[0].type).toBe("code-review"); + expect(benchmark.tasks[0].metrics).toEqual(expect.arrayContaining(["TaskSuccess", "FalsePositiveRate"])); + expect(benchmark.tasks[0].tags).toEqual({ area: "server", difficulty: "easy" }); + }); + + it("evaluates repeated runs, persists json results, and summarizes a matrix", async () => { + const dir = makeTempDir("bosun-eval-run-"); + const benchmark = new Benchmark({ + name: "mini-suite", + tasks: [ + new Task({ + id: "bug-1", + type: "bug-fix", + input: { prompt: "Fix the parser", repoState: { ref: "abc123" } }, + groundTruth: { expectedFiles: ["src/parser.mjs"] }, + }), + ], + }); + + const evaluator = new Evaluator({ + resultsDir: dir, + metrics: [ + new CategoryMetric("TaskSuccess", ({ outcome }) => outcome.success ? 
"pass" : "fail"), + new NumericalMetric("TokenEfficiency", ({ outcome }) => outcome.tokensInput / Math.max(outcome.filesChanged, 1)), + new NumericalMetric("TimeToComplete", ({ outcome }) => outcome.durationMs), + ], + runner: async ({ repeatIndex, strategy }) => ({ + success: strategy.id === "codex-default", + durationMs: 1000 + (repeatIndex * 100), + tokensInput: 1200 + (repeatIndex * 100), + filesChanged: 2, + }), + }); + + const run = await evaluator.evaluate({ + benchmark, + repeats: 3, + strategies: [{ id: "codex-default", label: "Codex Default" }], + }); + + expect(run.summary.totalTasks).toBe(3); + expect(run.summary.passRate).toBe(1); + expect(run.summary.avgTimeMs).toBeCloseTo(1100); + expect(run.summary.perTask).toHaveLength(1); + expect(existsSync(run.resultPath)).toBe(true); + + const persisted = JSON.parse(readFileSync(run.resultPath, "utf8")); + expect(persisted.runId).toBe(run.runId); + expect(persisted.results).toHaveLength(3); + + const matrix = summarizeMatrix([run]); + expect(matrix.rows).toHaveLength(1); + expect(matrix.rows[0]).toMatchObject({ + config: "codex-default", + passRate: 1, + }); + }); + + it("compares two runs with deltas and per-task regressions", () => { + const baseline = { + runId: "run-a", + summary: { + passRate: 0.5, + avgTokens: 5000, + p95Tokens: 8000, + avgTimeMs: 60000, + p95TimeMs: 90000, + perTask: [ + { taskId: "task-1", passRate: 1, avgTokens: 4000, avgTimeMs: 55000 }, + { taskId: "task-2", passRate: 0, avgTokens: 6000, avgTimeMs: 65000 }, + ], + }, + results: [ + { taskId: "task-1", metrics: { TaskSuccess: "pass", TokenEfficiency: 4000, TimeToComplete: 55000 } }, + { taskId: "task-2", metrics: { TaskSuccess: "fail", TokenEfficiency: 6000, TimeToComplete: 65000 } }, + ], + }; + const candidate = { + runId: "run-b", + summary: { + passRate: 1, + avgTokens: 4200, + p95Tokens: 7000, + avgTimeMs: 45000, + p95TimeMs: 70000, + perTask: [ + { taskId: "task-1", passRate: 1, avgTokens: 3800, avgTimeMs: 43000 }, + { taskId: 
"task-2", passRate: 1, avgTokens: 4600, avgTimeMs: 47000 }, + ], + }, + results: [ + { taskId: "task-1", metrics: { TaskSuccess: "pass", TokenEfficiency: 3800, TimeToComplete: 43000 } }, + { taskId: "task-2", metrics: { TaskSuccess: "pass", TokenEfficiency: 4600, TimeToComplete: 47000 } }, + ], + }; + + const comparison = compareEvaluationRuns(baseline, candidate); + + expect(comparison.metricDeltas.passRate.delta).toBeCloseTo(0.5); + expect(comparison.metricDeltas.avgTokens.delta).toBeCloseTo(-800); + expect(comparison.perTask.improved.map((entry) => entry.taskId)).toContain("task-2"); + expect(comparison.perTask.regressed).toEqual([]); + expect(comparison.metricDeltas.passRate.significance.pValue).toBeGreaterThanOrEqual(0); + expect(comparison.metricDeltas.passRate.significance.pValue).toBeLessThanOrEqual(1); + }); + + it("supports audit impact, history, and ci regression helpers", () => { + const withoutAnnotations = { + runId: "without", + createdAt: "2026-03-25T00:00:00.000Z", + benchmark: "suite", + summary: { + passRate: 0.7, + avgTokens: 1000, + avgTimeMs: 5000, + totalCostUsd: 0.03, + }, + results: [{ metrics: { FalsePositiveRate: 0.2 } }], + }; + const withAnnotations = { + runId: "with", + createdAt: "2026-03-26T00:00:00.000Z", + benchmark: "suite", + summary: { + passRate: 0.9, + avgTokens: 800, + avgTimeMs: 4000, + totalCostUsd: 0.02, + }, + results: [{ metrics: { FalsePositiveRate: 0.05 } }], + }; + + const impact = compareAuditImpactRuns(withAnnotations, withoutAnnotations); + expect(impact.rows.find((entry) => entry.metric === "Avg Tokens")?.delta).toBe(-200); + + const history = summarizeHistory([withAnnotations, withoutAnnotations]); + expect(history.runs.map((entry) => entry.runId)).toEqual(["without", "with"]); + + const regression = detectRegression(withAnnotations, withoutAnnotations, { + maxTokenRegression: 0.1, + minPassRate: 0.85, + }); + expect(regression.ok).toBe(true); + + const failingRegression = detectRegression(withoutAnnotations, 
withAnnotations, { + maxTokenRegression: 0.1, + minPassRate: 0.85, + }); + expect(failingRegression.ok).toBe(false); + }); + + it("lists stored runs and supports eval cli matrix/history flows", async () => { + const dir = makeTempDir("bosun-eval-cli-"); + const benchmarkPath = resolve(dir, "benchmark.json"); + writeFileSync( + benchmarkPath, + JSON.stringify({ + name: "mini-benchmark", + tasks: [ + { + id: "task-1", + type: "code-generation", + input: { prompt: "Create helper" }, + }, + ], + }), + "utf8", + ); + + const matrixResult = await runEvalCli([ + "matrix", + "--benchmark", benchmarkPath, + "--configs", "codex-default,copilot-sonnet", + "--repeats", "2", + "--results-dir", dir, + ]); + expect(matrixResult.exitCode).toBe(0); + expect(matrixResult.matrix.rows).toHaveLength(2); + + const storedRuns = listStoredEvaluationRuns(dir); + expect(storedRuns).toHaveLength(2); + + const historyResult = await runEvalCli([ + "history", + "--results-dir", dir, + ]); + expect(historyResult.exitCode).toBe(0); + expect(historyResult.history.runs).toHaveLength(2); + }); +}); From dd6dfe62fff135fbe108e833c3945fde712465e2 Mon Sep 17 00:00:00 2001 From: Jonathan Philipos Date: Mon, 30 Mar 2026 16:41:01 +1100 Subject: [PATCH 2/4] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cli.mjs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cli.mjs b/cli.mjs index 9bd50f870..a95d65be3 100755 --- a/cli.mjs +++ b/cli.mjs @@ -1516,11 +1516,14 @@ async function main() { const evalFlagIndex = args.indexOf("--eval"); const evalCommandIndex = args.indexOf("eval"); if (evalFlagIndex >= 0 || evalCommandIndex >= 0) { - const commandStartIndex = evalCommandIndex >= 0 ? evalCommandIndex : evalFlagIndex; + const commandStartIndex = + evalFlagIndex >= 0 && evalCommandIndex >= 0 + ? Math.min(evalFlagIndex, evalCommandIndex) + : (evalCommandIndex >= 0 ? 
evalCommandIndex : evalFlagIndex); const evalArgs = args.slice(commandStartIndex + 1); const { runEvalCli } = await import("./bench/eval-framework.mjs"); const { exitCode } = await runEvalCli(evalArgs); - process.exit(exitCode); + process.exit(exitCode ?? 0); } // Handle --help if (args.includes("--help") || args.includes("-h")) { From a65810560dc6ca064ee51fd3581dc8cebd34b5e0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:32:45 +0000 Subject: [PATCH 3/4] fix(eval): filter listStoredEvaluationRuns to only eval-*.json files Agent-Logs-Url: https://github.com/virtengine/bosun/sessions/8e799793-42cb-4191-8ba3-7b685a663915 Co-authored-by: jaeko44 <9289791+jaeko44@users.noreply.github.com> --- bench/eval-framework.mjs | 2 +- ui/vendor/es-module-shims.js | 1498 +++++++++++++++++++++++++++++- ui/vendor/htm.js | 5 +- ui/vendor/preact-signals-core.js | 2 +- ui/vendor/preact.js | 5 +- 5 files changed, 1487 insertions(+), 25 deletions(-) diff --git a/bench/eval-framework.mjs b/bench/eval-framework.mjs index b12a3d415..d2c314576 100644 --- a/bench/eval-framework.mjs +++ b/bench/eval-framework.mjs @@ -510,7 +510,7 @@ export function listStoredEvaluationRuns(resultsDir = DEFAULT_RESULTS_DIR) { const dir = resolve(resultsDir); if (!existsSync(dir)) return []; return readdirSync(dir) - .filter((name) => name.endsWith(".json")) + .filter((name) => name.startsWith("eval-") && name.endsWith(".json")) .sort() .map((name) => ({ runId: name.replace(/\.json$/i, ""), diff --git a/ui/vendor/es-module-shims.js b/ui/vendor/es-module-shims.js index 522f0acd2..8e5261d5d 100644 --- a/ui/vendor/es-module-shims.js +++ b/ui/vendor/es-module-shims.js @@ -1,16 +1,1482 @@ -/* esm.sh - es-module-shims@1.10.0 */ -var z0=Object.create;var Vt=Object.defineProperty;var Q0=Object.getOwnPropertyDescriptor;var G0=Object.getOwnPropertyNames;var K0=Object.getPrototypeOf,V0=Object.prototype.hasOwnProperty;var 
X0=(v,O)=>()=>(O||v((O={exports:{}}).exports,O),O.exports);var Y0=(v,O,B,M)=>{if(O&&typeof O=="object"||typeof O=="function")for(let y of G0(O))!V0.call(v,y)&&y!==B&&Vt(v,y,{get:()=>O[y],enumerable:!(M=Q0(O,y))||M.enumerable});return v};var Z0=(v,O,B)=>(B=v!=null?z0(K0(v)):{},Y0(O||!v||!v.__esModule?Vt(B,"default",{value:v,enumerable:!0}):B,v));var Xt=X0(()=>{(function(){let v=typeof document<"u",O=()=>{},B=v?document.querySelector("script[type=esms-options]"):void 0,M=B?JSON.parse(B.innerHTML):{};Object.assign(M,self.esmsInitOptions||{});let y=v?!!M.shimMode:!0,$e=te(y&&M.onimport),ve=te(y&&M.resolve),Yt=M.fetch?te(M.fetch):fetch,Zt=M.meta?te(y&&M.meta):O,bt=M.mapOverrides,W=M.nonce;if(!W&&v){let n=document.querySelector("script[nonce]");n&&(W=n.nonce||n.getAttribute("nonce"))}let Ut=te(M.onerror||O),{revokeBlobURLs:pt,noLoadEventRetriggers:Se,globalLoadEventRetrigger:e0,enforceIntegrity:t0}=M;function te(n){return typeof n=="string"?self[n]:n}let Ee=Array.isArray(M.polyfillEnable)?M.polyfillEnable:[],xe=Ee.includes("css-modules"),Ce=Ee.includes("json-modules"),ne=Ee.includes("wasm-modules"),re=Ee.includes("source-phase"),ht=M.onpolyfill?te(M.onpolyfill):()=>{console.log("%c^^ Module error above is polyfilled and can be ignored ^^","font-weight:900;color:#391")},n0=!navigator.userAgentData&&!!navigator.userAgent.match(/Edge\/\d+\.\d+/),Z=v?document.baseURI:`${location.protocol}//${location.host}${location.pathname.includes("/")?location.pathname.slice(0,location.pathname.lastIndexOf("/")+1):location.pathname}`,R=(n,s="text/javascript")=>URL.createObjectURL(new Blob([n],{type:s})),{skip:D}=M;if(Array.isArray(D)){let n=D.map(s=>new URL(s,Z).href);D=s=>n.some(a=>a[a.length-1]==="/"&&s.startsWith(a)||s===a)}else if(typeof D=="string"){let n=new RegExp(D);D=s=>n.test(s)}else D instanceof RegExp&&(D=n=>D.test(n));let r0=n=>self.dispatchEvent(Object.assign(new Event("error"),{error:n})),Fe=n=>{(self.reportError||r0)(n),Ut(n)};function ce(n){return n?` imported from 
${n}`:""}let Ae=!1;function s0(){Ae=!0}if(!y)if(document.querySelectorAll("script[type=module-shim],script[type=importmap-shim],link[rel=modulepreload-shim]").length)y=!0;else{let n=!1;for(let s of document.querySelectorAll("script[type=module],script[type=importmap]"))if(!n)s.type==="module"&&!s.ep&&(n=!0);else if(s.type==="importmap"&&n){Ae=!0;break}}let i0=/\\/g;function Le(n){try{if(n.indexOf(":")!==-1)return new URL(n).href}catch{}}function kt(n,s){return z(n,s)||Le(n)||z("./"+n,s)}function z(n,s){let a=s.indexOf("#"),d=s.indexOf("?");if(a+d>-2&&(s=s.slice(0,a===-1?d:d===-1||d>a?a:d)),n.indexOf("\\")!==-1&&(n=n.replace(i0,"/")),n[0]==="/"&&n[1]==="/")return s.slice(0,s.indexOf(":")+1)+n;if(n[0]==="."&&(n[1]==="/"||n[1]==="."&&(n[2]==="/"||n.length===2&&(n+="/"))||n.length===1&&(n+="/"))||n[0]==="/"){let c=s.slice(0,s.indexOf(":")+1);if(c==="blob:")throw new TypeError(`Failed to resolve module specifier "${n}". Invalid relative url or base scheme isn't hierarchical.`);let f;if(s[c.length+1]==="/"?c!=="file:"?(f=s.slice(c.length+2),f=f.slice(f.indexOf("/")+1)):f=s.slice(8):f=s.slice(c.length+(s[c.length]==="/")),n[0]==="/")return s.slice(0,s.length-f.length-1)+n;let m=f.slice(0,f.lastIndexOf("/")+1)+n,p=[],w=-1;for(let b=0;b "${n[c]}" does not resolve`)}}function o0(n,s,a){for(let d in n){let c=z(d,a)||d;if((!y||!bt)&&s[c]&&s[c]!==n[c])throw Error(`Rejected map integrity override "${c}" from ${s[c]} to ${n[c]}.`);s[c]=n[d]}}let q=!v&&(0,eval)("u=>import(u)"),fe,a0=v&&new Promise(n=>{let s=Object.assign(document.createElement("script"),{src:R("self._d=u=>import(u)"),ep:!0});s.setAttribute("nonce",W),s.addEventListener("load",()=>{if(!(fe=!!(q=self._d))){let a;window.addEventListener("error",d=>a=d),q=(d,c)=>new Promise((f,m)=>{let p=Object.assign(document.createElement("script"),{type:"module",src:R(`import*as m from'${d}';self._esmsi=m`)});a=void 0,p.ep=!0,W&&p.setAttribute("nonce",W),p.addEventListener("error",w),p.addEventListener("load",w);function 
w(b){document.head.removeChild(p),self._esmsi?(f(self._esmsi,Z),self._esmsi=void 0):(m(!(b instanceof Event)&&b||a&&a.error||new Error(`Error loading ${c&&c.errUrl||d} (${p.src}).`)),a=void 0)}document.head.appendChild(p)})}document.head.removeChild(s),delete self._d,n()}),document.head.appendChild(s)}),Oe=!1,Me=!1,ze=v&&HTMLScriptElement.supports,se=ze&&ze.name==="supports"&&ze("importmap"),je=fe,Ie=!1,Pe=!1,Re=[0,97,115,109,1,0,0,0],c0=Promise.resolve(a0).then(()=>{if(fe)return v?new Promise(n=>{let s=document.createElement("iframe");s.style.display="none",s.setAttribute("nonce",W);function a({data:p}){Array.isArray(p)&&p[0]==="esms"&&([,se,je,Me,Oe,Ie,Pe]=p,n(),document.head.removeChild(s),window.removeEventListener("message",a,!1))}window.addEventListener("message",a,!1);let d=`