diff --git a/.env.test.example b/.env.test.example index 0ae9ca1..0a6b7f0 100644 --- a/.env.test.example +++ b/.env.test.example @@ -63,3 +63,9 @@ E2B_TEMPLATE=base # E2B 沙箱超时时间(毫秒,可选,默认 300000) E2B_TIMEOUT_MS=300000 + +# ============================================================================= +# Benchmark (for benchmark tests) +# ============================================================================= +# Docker 代理(可选,SWE full 模式 git clone 和 Docker 容器使用) +# BENCHMARK_DOCKER_PROXY=http://127.0.0.1:7897 diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..176abfc --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,210 @@ +name: Benchmark Full Suite + +on: + workflow_dispatch: + inputs: + benchmark: + description: "Which benchmark to run" + type: choice + required: true + default: both + options: + - all + - both + - swe + - tau + - tb2 + provider: + description: "SWE/TAU provider filter" + type: choice + required: true + default: all + options: + - all + - anthropic + - openai + - gemini + tau_domain: + description: "TAU domain (airline by default for faster runs)" + type: choice + required: true + default: airline + options: + - airline + - retail + - telecom + - all + tb2_model: + description: "TB2 model in provider/model format" + type: string + required: true + default: openai/glm-5 + push: + branches: + - add_benchmark_test + pull_request: + branches: + - main + +env: + NODE_VERSION: "20" + +permissions: + contents: read + +jobs: + benchmark: + name: Benchmark + runs-on: ubuntu-latest + timeout-minutes: 360 + if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: npm + + - name: Setup uv + uses: 
astral-sh/setup-uv@v4 + + - name: Login to Docker Hub (optional) + if: ${{ env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != '' }} + uses: docker/login-action@v3 + with: + username: ${{ env.DOCKERHUB_USERNAME }} + password: ${{ env.DOCKERHUB_TOKEN }} + + - name: Install dependencies + run: npm ci + + - name: Create benchmark environment + run: | + cat > .env.test << 'EOT' + ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_MODEL_ID=${{ vars.ANTHROPIC_MODEL_ID }} + ANTHROPIC_BASE_URL=${{ vars.ANTHROPIC_BASE_URL }} + + OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }} + OPENAI_MODEL_ID=${{ vars.OPENAI_MODEL_ID }} + OPENAI_BASE_URL=${{ vars.OPENAI_BASE_URL }} + + GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }} + GEMINI_MODEL_ID=${{ vars.GEMINI_MODEL_ID }} + GEMINI_BASE_URL=${{ vars.GEMINI_BASE_URL }} + + BENCHMARK_DOCKER_PROXY=${{ vars.BENCHMARK_DOCKER_PROXY }} + BENCHMARK_TIMEOUT_MS=${{ vars.BENCHMARK_TIMEOUT_MS }} + EOT + + - name: Run unified benchmark command + run: | + mkdir -p tests/tmp + benchmark="${{ github.event.inputs.benchmark || 'both' }}" + provider="${{ github.event.inputs.provider || 'all' }}" + tau_domain="${{ github.event.inputs.tau_domain || 'airline' }}" + tb2_model="${{ github.event.inputs.tb2_model || 'openai/glm-5' }}" + + args=( + --benchmark=${benchmark} + --tau-domain=${tau_domain} + --tb2-model=${tb2_model} + --tb2-agent=oracle + --tb2-runner=uvx + --tb2-python=3.12 + --tb2-jobs-dir=./tests/tmp/jobs + --output=json + --output-file=tests/tmp/benchmark-report.json + ) + + if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then + args+=(--provider=${provider}) + fi + + npm run test:benchmark -- "${args[@]}" + + - name: Write step summary + if: ${{ always() }} + run: | + node - <<'NODE' >> "$GITHUB_STEP_SUMMARY" + const fs = require('fs'); + function readJson(p) { + if (!fs.existsSync(p)) return null; + try { return JSON.parse(fs.readFileSync(p, 'utf8')); } catch { return null; } + } + + const report = 
readJson('tests/tmp/benchmark-report.json'); + console.log('## Benchmark Report'); + console.log(''); + + if (!report) { + console.log('- report not found'); + process.exit(0); + } + + if (Array.isArray(report.swe) && report.swe.length > 0) { + console.log('### SWE-bench-Verified'); + console.log(''); + console.log('| Provider / Model | Resolved | Rate |'); + console.log('|---|---:|---:|'); + for (const r of report.swe) { + const name = `${r.provider.id} / ${r.provider.model}`; + const resolved = `${r.summary.resolved}/${r.summary.total}`; + const rate = `${(r.summary.rate * 100).toFixed(1)}%`; + console.log(`| ${name} | ${resolved} | ${rate} |`); + } + console.log(''); + } + + if (Array.isArray(report.tau) && report.tau.length > 0) { + console.log('### TAU-bench'); + console.log(''); + console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |'); + console.log('|---|---|---:|---:|'); + for (const r of report.tau) { + const name = `${r.provider.id} / ${r.provider.model}`; + const domain = r.summary.domain; + const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`; + const observed = (r.summary.token_observed_trials ?? 0) > 0; + const avgTokens = observed + ? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`) + : '-'; + console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`); + } + console.log(''); + } + + if (report.tb2) { + const tb2 = report.tb2; + console.log('### Terminal Bench 2.0'); + console.log(''); + console.log(`- Agent: \`${tb2.agent}\``); + if (tb2.model) console.log(`- Model: \`${tb2.model}\``); + console.log(`- Passed: **${tb2.passed}/${tb2.total}**`); + console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`); + if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 
0) > 0) { + console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`); + } else { + console.log(`- Avg tokens: **N/A**`); + } + console.log(''); + } + NODE + + - name: Upload benchmark artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: benchmark-artifacts-${{ github.run_id }} + if-no-files-found: warn + path: | + tests/tmp/benchmark-report.json + tests/tmp/jobs/*/result.json + tests/tmp/tau2-data/simulations/*.json diff --git a/README.md b/README.md index 47c3294..5787c82 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,7 @@ See [docs/en/guides/architecture.md](./docs/en/guides/architecture.md) for detai | [Providers](./docs/en/guides/providers.md) | Model provider configuration | | [Database](./docs/en/guides/database.md) | SQLite/PostgreSQL persistence | | [Resume & Fork](./docs/en/guides/resume-fork.md) | Crash recovery & branching | +| [Benchmark Results](./docs/en/guides/benchmark-results.md) | Confirmed benchmark score tables | | **Project** | | | [Contribution Guide](./docs/en/contribution.md) | How to contribute | | **Reference** | | diff --git a/README.zh-CN.md b/README.zh-CN.md index 912d1b8..c748994 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -106,6 +106,7 @@ npm run example:room # 多Agent协作 | [Provider 配置](./docs/zh-CN/guides/providers.md) | 模型 Provider 配置 | | [数据库存储](./docs/zh-CN/guides/database.md) | SQLite/PostgreSQL 持久化 | | [恢复与分叉](./docs/zh-CN/guides/resume-fork.md) | 崩溃恢复与分支 | +| [Benchmark 结果](./docs/zh-CN/guides/benchmark-results.md) | 已确认的跑分结果表格 | | **项目** | | | [贡献指南](./docs/zh-CN/contribution.md) | 提交 PR 的要求与流程 | | **参考** | | diff --git a/docs/en/guides/benchmark-results.md b/docs/en/guides/benchmark-results.md new file mode 100644 index 0000000..a98e132 --- /dev/null +++ b/docs/en/guides/benchmark-results.md @@ -0,0 +1,32 @@ +# Benchmark Results (Confirmed) + +Last updated: 2026-02-26 + +## SWE-bench-Verified + +| Provider / Model | Instances | 
Resolved | Rate | Avg Tokens | Avg Duration | +|---|---:|---:|---:|---:|---:| +| openai / glm-5 | 12 | 12/12 | 100.0% | 17.2k | 134.5k ms | + +Source: local full run log (`2026-02-25__21-06-21`). + +## Terminal Bench 2.0 + +| Agent / Model | Passed | Parseable | Unknown | Rate (parseable) | Notes | +|---|---:|---:|---:|---:|---| +| oracle / glm-5 | 1 | 31 | 58 | 3.2% | From the same full run; many tasks ended with runtime/timeout errors. | + +## Reproduce + +```bash +npm run test:benchmark -- \ + --benchmark=both \ + --tb2-model=openai/glm-5 \ + --tb2-agent=oracle \ + --tb2-runner=uvx \ + --tb2-jobs-dir=./tests/tmp/jobs \ + --output=json \ + --output-file=tests/tmp/benchmark-report.json +``` + +The JSON report includes both `swe` and `tb2` sections. diff --git a/docs/en/guides/benchmarking.md b/docs/en/guides/benchmarking.md new file mode 100644 index 0000000..e167d8a --- /dev/null +++ b/docs/en/guides/benchmarking.md @@ -0,0 +1,143 @@ +# Benchmarking + +KODE SDK benchmark runner now has a single entry command and supports multiple targets: + +- `swe`: SWE-bench-Verified only +- `tau`: TAU-bench only +- `tb2`: Terminal Bench 2.0 only +- `both`: run SWE + TAU + TB2 +- `all`: alias of `both` (compatibility) + +## Prerequisites + +1. Install dependencies: + +```bash +npm ci +``` + +2. Create `.env.test` (or export env vars directly): + +```bash +ANTHROPIC_API_KEY=... +ANTHROPIC_MODEL_ID=claude-sonnet-4-20250514 + +OPENAI_API_KEY=... +OPENAI_MODEL_ID=glm-5 + +GEMINI_API_KEY=... +GEMINI_MODEL_ID=gemini-3-pro-preview +``` + +3. 
Runtime tools: +- SWE-bench-Verified: Docker is required +- TAU-bench: `tau2` or `uvx` is required (official TAU2 harness) +- TB2: `harbor`, `uvx`, or Docker (runner decides by `--tb2-runner`) + +## Unified Command + +```bash +npm run test:benchmark -- [flags] +``` + +### Common examples + +Run SWE + TAU + TB2 in one command: + +```bash +npm run test:benchmark -- \ + --benchmark=both \ + --tb2-model=openai/glm-5 \ + --output=json \ + --output-file=tests/tmp/benchmark-report.json +``` + +Run only SWE-bench-Verified: + +```bash +npm run test:benchmark -- \ + --benchmark=swe \ + --provider=anthropic \ + --output=json \ + --output-file=tests/tmp/swe-report.json +``` + +Run only TB2: + +```bash +npm run test:benchmark -- \ + --benchmark=tb2 \ + --tb2-model=openai/glm-5 \ + --tb2-agent=oracle \ + --tb2-runner=docker \ + --tb2-jobs-dir=./tests/tmp/jobs \ + --output=json \ + --output-file=tests/tmp/tb2-report.json +``` + +Run only TAU-bench (official TAU2 script + dataset): + +```bash +npm run test:benchmark -- \ + --benchmark=tau \ + --provider=openai \ + --tau-domain=airline \ + --num-trials=1 \ + --output=json \ + --output-file=tests/tmp/tau-report.json +``` + +## Flags + +| Flag | Description | Default | +|---|---|---| +| `--benchmark=swe\|tau\|tb2\|both\|all` | Which benchmark(s) to run (`both`=`all`) | `both` | +| `--provider=...` | Provider filter for SWE/TAU (`anthropic`, `openai`, `gemini`, etc.) | all discovered | +| `--tau-domain=airline\|retail\|telecom\|all` | TAU domain filter | `airline` | +| `--num-trials=N` | TAU trials per task (Pass^k) | `1` | +| `--tb2-model=provider/model` | TB2 model id | `BENCHMARK_TB2_MODEL` or `openai/$OPENAI_MODEL_ID` | +| `--tb2-agent=...` | TB2 agent (`oracle`, etc.) 
| `oracle` | +| `--tb2-dataset=...` | TB2 dataset id | `terminal-bench@2.0` | +| `--tb2-runner=auto\|harbor\|uvx\|docker` | TB2 execution backend | `auto` | +| `--tb2-python=3.12` | Python version for `uvx` runner | `3.12` | +| `--tb2-jobs-dir=PATH` | TB2 jobs directory | `tests/tmp/jobs` | +| `--tb2-env-file=PATH` | Env file passed to TB2 runner | auto-detect `.env.test` | +| `--tb2-docker-image=IMAGE` | Docker image for TB2 docker runner | `ghcr.io/astral-sh/uv:python3.12-bookworm` | +| `--output=table\|json` | Output mode | `table` | +| `--output-file=PATH` | JSON output file path (when `--output=json`) | `benchmark-report.json` | +| `--compare=PATH` | Compare against baseline JSON report | unset | + +## Output + +With `--output=json`, one report may contain `swe`, `tau`, and `tb2` sections depending on `--benchmark`. + +```json +{ + "timestamp": "2026-02-25T08:31:16.000Z", + "sdk_version": "2.7.3", + "swe": [ + { + "provider": { "id": "openai", "model": "glm-5" }, + "summary": { "dataset": "swe-bench-verified", "total": 12, "resolved": 10, "rate": 0.8333, "avg_tokens": 17500, "avg_duration_ms": 166000 } + } + ], + "tb2": { + "dataset": "terminal-bench@2.0", + "agent": "oracle", + "model": "openai/glm-5", + "passed": 0, + "total": 89, + "rate": 0.0 + } +} +``` + +## Notes + +- SWE-bench is fixed to **SWE-bench-Verified**. There is no mini/full mode switch anymore. +- TAU now runs with the official **TAU2** harness (`tau2 run ...`) from Sierra. +- TAU default domain is `airline` for faster CI/local feedback. Use `--tau-domain=all` when you need full coverage. +- TAU user simulator can be configured with `BENCHMARK_USER_MODEL=provider/model`. +- TB2 uses official Harbor run flow (`harbor run -d terminal-bench@2.0 -m ... -a ...`) under the selected runner. +- TAU/TB2 token stats are extracted from official result files when available; if a runner/agent does not emit usage, it is shown as `N/A`. +- If Docker image pulls are slow, set `BENCHMARK_DOCKER_PROXY`. 
diff --git a/docs/zh-CN/guides/benchmark-results.md b/docs/zh-CN/guides/benchmark-results.md new file mode 100644 index 0000000..261855f --- /dev/null +++ b/docs/zh-CN/guides/benchmark-results.md @@ -0,0 +1,32 @@ +# Benchmark 结果(已确认) + +最后更新:2026-02-26 + +## SWE-bench-Verified + +| Provider / Model | 实例数 | 通过数 | 通过率 | 平均 Tokens | 平均耗时 | +|---|---:|---:|---:|---:|---:| +| openai / glm-5 | 12 | 12/12 | 100.0% | 17.2k | 134.5k ms | + +来源:本地完整运行日志(`2026-02-25__21-06-21`)。 + +## Terminal Bench 2.0 + +| Agent / Model | 通过数 | 可判定 | Unknown | 通过率(仅可判定) | 备注 | +|---|---:|---:|---:|---:|---| +| oracle / glm-5 | 1 | 31 | 58 | 3.2% | 与上面同一次完整运行;大量任务以 runtime/timeout 结束。 | + +## 复现命令 + +```bash +npm run test:benchmark -- \ + --benchmark=both \ + --tb2-model=openai/glm-5 \ + --tb2-agent=oracle \ + --tb2-runner=uvx \ + --tb2-jobs-dir=./tests/tmp/jobs \ + --output=json \ + --output-file=tests/tmp/benchmark-report.json +``` + +输出 JSON 同时包含 `swe` 和 `tb2` 两个分区。 diff --git a/docs/zh-CN/guides/benchmarking.md b/docs/zh-CN/guides/benchmarking.md new file mode 100644 index 0000000..50d55dd --- /dev/null +++ b/docs/zh-CN/guides/benchmarking.md @@ -0,0 +1,143 @@ +# Benchmarking + +KODE SDK 的 benchmark 入口已统一为一个命令,支持多个目标: + +- `swe`:只跑 SWE-bench-Verified +- `tau`:只跑 TAU-bench +- `tb2`:只跑 Terminal Bench 2.0 +- `both`:一次命令跑 SWE + TAU + TB2 +- `all`:`both` 的兼容别名 + +## 前置条件 + +1. 安装依赖: + +```bash +npm ci +``` + +2. 准备 `.env.test`(或直接导出环境变量): + +```bash +ANTHROPIC_API_KEY=... +ANTHROPIC_MODEL_ID=claude-sonnet-4-20250514 + +OPENAI_API_KEY=... +OPENAI_MODEL_ID=glm-5 + +GEMINI_API_KEY=... +GEMINI_MODEL_ID=gemini-3-pro-preview +``` + +3. 
运行依赖: +- SWE-bench-Verified:必须有 Docker +- TAU-bench:需要 `tau2` 或 `uvx`(官方 TAU2 harness) +- TB2:`harbor`、`uvx` 或 Docker(由 `--tb2-runner` 决定) + +## 统一命令 + +```bash +npm run test:benchmark -- [参数] +``` + +### 常用示例 + +一次命令同时跑 SWE + TAU + TB2: + +```bash +npm run test:benchmark -- \ + --benchmark=both \ + --tb2-model=openai/glm-5 \ + --output=json \ + --output-file=tests/tmp/benchmark-report.json +``` + +只跑 SWE-bench-Verified: + +```bash +npm run test:benchmark -- \ + --benchmark=swe \ + --provider=anthropic \ + --output=json \ + --output-file=tests/tmp/swe-report.json +``` + +只跑 TB2: + +```bash +npm run test:benchmark -- \ + --benchmark=tb2 \ + --tb2-model=openai/glm-5 \ + --tb2-agent=oracle \ + --tb2-runner=docker \ + --tb2-jobs-dir=./tests/tmp/jobs \ + --output=json \ + --output-file=tests/tmp/tb2-report.json +``` + +只跑 TAU-bench(官方 TAU2 脚本与数据集): + +```bash +npm run test:benchmark -- \ + --benchmark=tau \ + --provider=openai \ + --tau-domain=airline \ + --num-trials=1 \ + --output=json \ + --output-file=tests/tmp/tau-report.json +``` + +## 参数说明 + +| 参数 | 含义 | 默认值 | +|---|---|---| +| `--benchmark=swe\|tau\|tb2\|both\|all` | 选择要跑的 benchmark(`both`=`all`) | `both` | +| `--provider=...` | SWE/TAU 的 provider 过滤(`anthropic`、`openai`、`gemini` 等) | 自动发现全部 | +| `--tau-domain=airline\|retail\|telecom\|all` | TAU 领域过滤 | `airline` | +| `--num-trials=N` | TAU 每个任务试验次数(Pass^k) | `1` | +| `--tb2-model=provider/model` | TB2 模型 ID | `BENCHMARK_TB2_MODEL` 或 `openai/$OPENAI_MODEL_ID` | +| `--tb2-agent=...` | TB2 agent(如 `oracle`) | `oracle` | +| `--tb2-dataset=...` | TB2 数据集 ID | `terminal-bench@2.0` | +| `--tb2-runner=auto\|harbor\|uvx\|docker` | TB2 运行后端 | `auto` | +| `--tb2-python=3.12` | `uvx` runner 的 Python 版本 | `3.12` | +| `--tb2-jobs-dir=PATH` | TB2 作业目录 | `tests/tmp/jobs` | +| `--tb2-env-file=PATH` | 传给 TB2 runner 的环境文件 | 自动探测 `.env.test` | +| `--tb2-docker-image=IMAGE` | TB2 docker runner 镜像 | `ghcr.io/astral-sh/uv:python3.12-bookworm` | +| `--output=table\|json` | 输出格式 | 
`table` | +| `--output-file=PATH` | JSON 输出文件(当 `--output=json`) | `benchmark-report.json` | +| `--compare=PATH` | 与历史 JSON 报告做对比 | 未设置 | + +## 输出格式 + +使用 `--output=json` 时,报告会按 `--benchmark` 输出 `swe`/`tau`/`tb2` 分区: + +```json +{ + "timestamp": "2026-02-25T08:31:16.000Z", + "sdk_version": "2.7.3", + "swe": [ + { + "provider": { "id": "openai", "model": "glm-5" }, + "summary": { "dataset": "swe-bench-verified", "total": 12, "resolved": 10, "rate": 0.8333, "avg_tokens": 17500, "avg_duration_ms": 166000 } + } + ], + "tb2": { + "dataset": "terminal-bench@2.0", + "agent": "oracle", + "model": "openai/glm-5", + "passed": 0, + "total": 89, + "rate": 0.0 + } +} +``` + +## 说明 + +- SWE 已固定为 **SWE-bench-Verified**,不再有 mini/full 模式参数。 +- TAU 已切换为 Sierra 官方 **TAU2** harness(`tau2 run ...`)。 +- TAU 默认领域改为 `airline`,用于更快的本地/CI反馈;需要全量时使用 `--tau-domain=all`。 +- TAU 的用户模拟模型可通过 `BENCHMARK_USER_MODEL=provider/model` 指定。 +- TB2 走官方 Harbor 流程(`harbor run -d terminal-bench@2.0 -m ... -a ...`),由 runner 包装执行。 +- TAU/TB2 的 token 统计会从官方结果文件提取;若 runner/agent 未产出 usage,则显示为 `N/A`。 +- 若 Docker 拉取镜像慢,可设置 `BENCHMARK_DOCKER_PROXY`。 diff --git a/package.json b/package.json index fc12c7f..5da228a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@shareai-lab/kode-sdk", - "version": "2.7.0", + "version": "2.7.3", "description": "Event-driven, long-running AI Agent development framework with enterprise-grade persistence and context management", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -14,6 +14,7 @@ "test:integration": "ts-node --project tsconfig.json ./tests/run-integration.ts", "test:e2e": "ts-node --project tsconfig.json ./tests/run-e2e.ts", "test:all": "ts-node --project tsconfig.json ./tests/run-all.ts", + "test:benchmark": "ts-node --project tsconfig.json ./tests/benchmark/run-benchmark.ts", "example:getting-started": "ts-node examples/getting-started.ts", "example:openai": "ts-node examples/openai-usage.ts", "example:gemini": "ts-node 
examples/gemini-usage.ts", diff --git a/src/infra/providers/anthropic.ts b/src/infra/providers/anthropic.ts index 27537b3..f8f0bd6 100644 --- a/src/infra/providers/anthropic.ts +++ b/src/infra/providers/anthropic.ts @@ -50,7 +50,7 @@ export interface AnthropicProviderOptions { export class AnthropicProvider implements ModelProvider { readonly maxWindowSize = 200_000; - readonly maxOutputTokens = 4096; + readonly maxOutputTokens = 8192; readonly temperature = 0.7; readonly model: string; private readonly baseUrl: string; @@ -85,7 +85,7 @@ export class AnthropicProvider implements ModelProvider { ...(this.extraBody || {}), model: this.model, messages: this.formatMessages(messages), - max_tokens: opts?.maxTokens || 4096, + max_tokens: opts?.maxTokens || this.maxOutputTokens, }; if (opts?.temperature !== undefined) body.temperature = opts.temperature; @@ -146,7 +146,7 @@ export class AnthropicProvider implements ModelProvider { const body: any = { model: this.model, messages: this.formatMessages(messages), - max_tokens: opts?.maxTokens || 4096, + max_tokens: opts?.maxTokens || this.maxOutputTokens, stream: true, ...(this.extraBody || {}), }; diff --git a/src/infra/providers/gemini.ts b/src/infra/providers/gemini.ts index 3506611..99625ef 100644 --- a/src/infra/providers/gemini.ts +++ b/src/infra/providers/gemini.ts @@ -52,7 +52,7 @@ export interface GeminiProviderOptions { export class GeminiProvider implements ModelProvider { readonly maxWindowSize = 1_000_000; - readonly maxOutputTokens = 4096; + readonly maxOutputTokens = 16384; readonly temperature = 0.7; readonly model: string; private readonly baseUrl: string; diff --git a/src/infra/providers/openai.ts b/src/infra/providers/openai.ts index 00810af..3901961 100644 --- a/src/infra/providers/openai.ts +++ b/src/infra/providers/openai.ts @@ -134,7 +134,7 @@ export interface OpenAIProviderOptions { export class OpenAIProvider implements ModelProvider { readonly maxWindowSize = 128_000; - readonly maxOutputTokens = 
4096; + readonly maxOutputTokens = 16384; readonly temperature = 0.7; readonly model: string; private readonly baseUrl: string; diff --git a/src/infra/providers/utils.ts b/src/infra/providers/utils.ts index 7af874b..18ef69e 100644 --- a/src/infra/providers/utils.ts +++ b/src/infra/providers/utils.ts @@ -58,8 +58,8 @@ export function normalizeBaseUrl(url: string): string { export function normalizeOpenAIBaseUrl(url: string): string { let normalized = url.replace(/\/+$/, ''); - // Auto-append /v1 if not present (for OpenAI-compatible APIs) - if (!normalized.endsWith('/v1')) { + // Auto-append /v1 if no version path detected (e.g., /v1, /v2, /v4) + if (!/\/v\d+$/.test(normalized)) { normalized += '/v1'; } return normalized; diff --git a/tests/benchmark/compare.ts b/tests/benchmark/compare.ts new file mode 100644 index 0000000..af66ace --- /dev/null +++ b/tests/benchmark/compare.ts @@ -0,0 +1,280 @@ +import fs from 'fs'; +import type { BenchmarkReport, SWEProviderResult, TAUProviderResult, TB2Summary } from './types'; + +interface ComparisonRow { + label: string; + oldValue: string; + newValue: string; + delta: string; + direction: 'better' | 'worse' | 'same' | 'na'; +} + +interface ComparisonResult { + swe: ComparisonRow[]; + tau: ComparisonRow[]; + tb2: ComparisonRow[]; + hasRegressions: boolean; +} + +function fmtPct(n: number): string { + return (n * 100).toFixed(1) + '%'; +} + +function fmtK(n: number): string { + if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M'; + if (n >= 1_000) return (n / 1_000).toFixed(1) + 'k'; + return String(n); +} + +function pad(s: string, len: number): string { + return s.length >= len ? s.slice(0, len) : s + ' '.repeat(len - s.length); +} + +function lpad(s: string, len: number): string { + return s.length >= len ? 
s.slice(0, len) : ' '.repeat(len - s.length) + s; +} + +function deltaStr( + oldVal: number, + newVal: number, + unit: 'pct' | 'tokens', +): { text: string; dir: 'better' | 'worse' | 'same' } { + const diff = newVal - oldVal; + if (Math.abs(diff) < 0.001) return { text: '=', dir: 'same' }; + + const sign = diff > 0 ? '+' : ''; + if (unit === 'pct') { + return { text: `${sign}${(diff * 100).toFixed(1)}pp`, dir: diff > 0 ? 'better' : 'worse' }; + } + return { text: `${sign}${fmtK(diff)}`, dir: diff < 0 ? 'better' : 'worse' }; +} + +function compareSWE(oldResults: SWEProviderResult[], newResults: SWEProviderResult[]): ComparisonRow[] { + const rows: ComparisonRow[] = []; + + for (const newR of newResults) { + const key = `${newR.provider.id}/${newR.provider.model}`; + const oldR = oldResults.find( + r => r.provider.id === newR.provider.id && r.provider.model === newR.provider.model, + ); + + if (!oldR) { + rows.push({ + label: `${key} [rate]`, + oldValue: '-', + newValue: fmtPct(newR.summary.rate), + delta: 'new', + direction: 'na', + }); + continue; + } + + const rateDelta = deltaStr(oldR.summary.rate, newR.summary.rate, 'pct'); + rows.push({ + label: `${key} [rate]`, + oldValue: fmtPct(oldR.summary.rate), + newValue: fmtPct(newR.summary.rate), + delta: rateDelta.text, + direction: rateDelta.dir, + }); + + rows.push({ + label: `${key} [resolved]`, + oldValue: `${oldR.summary.resolved}/${oldR.summary.total}`, + newValue: `${newR.summary.resolved}/${newR.summary.total}`, + delta: newR.summary.resolved === oldR.summary.resolved + ? '=' + : `${newR.summary.resolved - oldR.summary.resolved > 0 ? '+' : ''}${newR.summary.resolved - oldR.summary.resolved}`, + direction: newR.summary.resolved > oldR.summary.resolved + ? 'better' + : newR.summary.resolved < oldR.summary.resolved + ? 
'worse' + : 'same', + }); + + const tokenDelta = deltaStr(oldR.summary.avg_tokens, newR.summary.avg_tokens, 'tokens'); + rows.push({ + label: `${key} [tokens]`, + oldValue: fmtK(oldR.summary.avg_tokens), + newValue: fmtK(newR.summary.avg_tokens), + delta: tokenDelta.text, + direction: tokenDelta.dir, + }); + } + + return rows; +} + +function compareTAU(oldResults: TAUProviderResult[], newResults: TAUProviderResult[]): ComparisonRow[] { + const rows: ComparisonRow[] = []; + + for (const newR of newResults) { + const key = `${newR.provider.id}/${newR.provider.model} [${newR.summary.domain}]`; + const oldR = oldResults.find( + r => + r.provider.id === newR.provider.id + && r.provider.model === newR.provider.model + && r.summary.domain === newR.summary.domain, + ); + + if (!oldR) { + const pass1 = newR.summary.pass_at_k[0] ?? 0; + rows.push({ + label: `${key} [pass^1]`, + oldValue: '-', + newValue: fmtPct(pass1), + delta: 'new', + direction: 'na', + }); + continue; + } + + const oldPass1 = oldR.summary.pass_at_k[0] ?? 0; + const newPass1 = newR.summary.pass_at_k[0] ?? 0; + const passDelta = deltaStr(oldPass1, newPass1, 'pct'); + rows.push({ + label: `${key} [pass^1]`, + oldValue: fmtPct(oldPass1), + newValue: fmtPct(newPass1), + delta: passDelta.text, + direction: passDelta.dir, + }); + + const oldTokObserved = (oldR.summary.token_observed_trials ?? 0) > 0; + const newTokObserved = (newR.summary.token_observed_trials ?? 
0) > 0; + if (oldTokObserved && newTokObserved) { + const tokenDelta = deltaStr(oldR.summary.avg_tokens, newR.summary.avg_tokens, 'tokens'); + rows.push({ + label: `${key} [tokens]`, + oldValue: fmtK(oldR.summary.avg_tokens), + newValue: fmtK(newR.summary.avg_tokens), + delta: tokenDelta.text, + direction: tokenDelta.dir, + }); + } + } + + return rows; +} + +function compareTB2(oldTB2?: TB2Summary, newTB2?: TB2Summary): ComparisonRow[] { + if (!newTB2) return []; + if (!oldTB2) { + return [{ + label: 'tb2 [rate]', + oldValue: '-', + newValue: fmtPct(newTB2.rate), + delta: 'new', + direction: 'na', + }]; + } + + const rows: ComparisonRow[] = []; + const rateDelta = deltaStr(oldTB2.rate, newTB2.rate, 'pct'); + rows.push({ + label: 'tb2 [rate]', + oldValue: fmtPct(oldTB2.rate), + newValue: fmtPct(newTB2.rate), + delta: rateDelta.text, + direction: rateDelta.dir, + }); + + rows.push({ + label: 'tb2 [passed]', + oldValue: `${oldTB2.passed}/${oldTB2.total}`, + newValue: `${newTB2.passed}/${newTB2.total}`, + delta: newTB2.passed === oldTB2.passed + ? '=' + : `${newTB2.passed - oldTB2.passed > 0 ? '+' : ''}${newTB2.passed - oldTB2.passed}`, + direction: newTB2.passed > oldTB2.passed ? 'better' : newTB2.passed < oldTB2.passed ? 'worse' : 'same', + }); + + const oldTokObserved = (oldTB2.token_observed_trials ?? 0) > 0 && oldTB2.avg_total_tokens !== undefined; + const newTokObserved = (newTB2.token_observed_trials ?? 
0) > 0 && newTB2.avg_total_tokens !== undefined; + if (oldTokObserved && newTokObserved) { + const tokenDelta = deltaStr(oldTB2.avg_total_tokens!, newTB2.avg_total_tokens!, 'tokens'); + rows.push({ + label: 'tb2 [tokens]', + oldValue: fmtK(oldTB2.avg_total_tokens!), + newValue: fmtK(newTB2.avg_total_tokens!), + delta: tokenDelta.text, + direction: tokenDelta.dir, + }); + } + + return rows; +} + +export function loadReport(filePath: string): BenchmarkReport { + return JSON.parse(fs.readFileSync(filePath, 'utf-8')) as BenchmarkReport; +} + +export function compareReports(oldReport: BenchmarkReport, newReport: BenchmarkReport): ComparisonResult { + const sweRows = compareSWE(oldReport.swe ?? [], newReport.swe ?? []); + const tauRows = compareTAU(oldReport.tau ?? [], newReport.tau ?? []); + const tb2Rows = compareTB2(oldReport.tb2, newReport.tb2); + const hasRegressions = [...sweRows, ...tauRows, ...tb2Rows].some(r => r.direction === 'worse'); + return { swe: sweRows, tau: tauRows, tb2: tb2Rows, hasRegressions }; +} + +export function printComparison(oldPath: string, newPath: string, result: ComparisonResult): void { + const banner = '='.repeat(80); + console.log(`\n${banner}`); + console.log('Benchmark Comparison'); + console.log(banner); + console.log(` Baseline: ${oldPath}`); + console.log(` Current: ${newPath}`); + console.log(''); + + const allRows = [...result.swe, ...result.tau, ...result.tb2]; + if (allRows.length === 0) { + console.log(' No comparable results found.\n'); + return; + } + + const maxLabel = Math.max(20, ...allRows.map(r => r.label.length)); + const header = `${pad('Metric', maxLabel)} | ${lpad('Baseline', 10)} | ${lpad('Current', 10)} | ${lpad('Delta', 12)} | Dir`; + const sep = '-'.repeat(header.length); + + if (result.swe.length > 0) { + console.log('--- SWE Comparison ---\n'); + console.log(header); + console.log(sep); + for (const row of result.swe) { + const dir = row.direction === 'better' ? ' ^' : row.direction === 'worse' ? 
' v' : ' '; + console.log( + `${pad(row.label, maxLabel)} | ${lpad(row.oldValue, 10)} | ${lpad(row.newValue, 10)} | ${lpad(row.delta, 12)} |${dir}`, + ); + } + console.log(''); + } + + if (result.tau.length > 0) { + console.log('--- TAU Comparison ---\n'); + console.log(header); + console.log(sep); + for (const row of result.tau) { + const dir = row.direction === 'better' ? ' ^' : row.direction === 'worse' ? ' v' : ' '; + console.log( + `${pad(row.label, maxLabel)} | ${lpad(row.oldValue, 10)} | ${lpad(row.newValue, 10)} | ${lpad(row.delta, 12)} |${dir}`, + ); + } + console.log(''); + } + + if (result.tb2.length > 0) { + console.log('--- TB2 Comparison ---\n'); + console.log(header); + console.log(sep); + for (const row of result.tb2) { + const dir = row.direction === 'better' ? ' ^' : row.direction === 'worse' ? ' v' : ' '; + console.log( + `${pad(row.label, maxLabel)} | ${lpad(row.oldValue, 10)} | ${lpad(row.newValue, 10)} | ${lpad(row.delta, 12)} |${dir}`, + ); + } + console.log(''); + } + + console.log(result.hasRegressions ? 
' WARNING: Regressions detected (marked with v)' : ' No regressions detected.'); + console.log(''); +} diff --git a/tests/benchmark/config.ts b/tests/benchmark/config.ts new file mode 100644 index 0000000..2125734 --- /dev/null +++ b/tests/benchmark/config.ts @@ -0,0 +1,179 @@ +import type { ProviderId } from '../helpers/provider-env'; +import { loadProviderEnv } from '../helpers/provider-env'; +import type { BenchmarkCliArgs, BenchmarkConfig, BenchmarkProvider } from './types'; + +const ALL_PROVIDERS: ProviderId[] = ['anthropic', 'openai', 'gemini', 'glm', 'minimax']; + +// --------------------------------------------------------------------------- +// CLI arg parsing +// --------------------------------------------------------------------------- + +export function parseCliArgs(argv: string[] = process.argv.slice(2)): BenchmarkCliArgs { + const args: BenchmarkCliArgs = {}; + + for (const arg of argv) { + if (arg === '--swe-only') { + args.benchmark = 'swe'; + } else if (arg === '--tau-only') { + args.benchmark = 'tau'; + } else if (arg === '--tb2-only') { + args.benchmark = 'tb2'; + } else if (arg.startsWith('--benchmark=')) { + const val = arg.slice('--benchmark='.length); + if (val === 'swe' || val === 'tau' || val === 'tb2' || val === 'both' || val === 'all') args.benchmark = val; + } else if (arg.startsWith('--provider=')) { + const v = arg.slice('--provider='.length).trim(); + if (v) args.provider = v; + } else if (arg.startsWith('--tau-domain=')) { + const v = arg.slice('--tau-domain='.length).trim(); + if (v) args.tauDomain = v; + } else if (arg.startsWith('--num-trials=')) { + const n = parseInt(arg.slice('--num-trials='.length), 10); + if (!Number.isNaN(n) && n > 0) args.numTrials = n; + } else if (arg.startsWith('--tb2-model=')) { + const v = arg.slice('--tb2-model='.length).trim(); + if (v) args.tb2Model = v; + } else if (arg.startsWith('--model=')) { + // Backward-compatible alias for TB2 model. 
+ const v = arg.slice('--model='.length).trim(); + if (v) args.tb2Model = v; + } else if (arg.startsWith('--tb2-agent=')) { + const v = arg.slice('--tb2-agent='.length).trim(); + if (v) args.tb2Agent = v; + } else if (arg.startsWith('--tb2-dataset=')) { + const v = arg.slice('--tb2-dataset='.length).trim(); + if (v) args.tb2Dataset = v; + } else if (arg.startsWith('--tb2-runner=')) { + const val = arg.slice('--tb2-runner='.length); + if (val === 'auto' || val === 'harbor' || val === 'uvx' || val === 'docker') args.tb2Runner = val; + } else if (arg.startsWith('--tb2-python=')) { + const v = arg.slice('--tb2-python='.length).trim(); + if (v) args.tb2Python = v; + } else if (arg.startsWith('--tb2-jobs-dir=')) { + const v = arg.slice('--tb2-jobs-dir='.length).trim(); + if (v) args.tb2JobsDir = v; + } else if (arg.startsWith('--tb2-env-file=')) { + const v = arg.slice('--tb2-env-file='.length).trim(); + if (v) args.tb2EnvFile = v; + } else if (arg.startsWith('--tb2-docker-image=')) { + const v = arg.slice('--tb2-docker-image='.length).trim(); + if (v) args.tb2DockerImage = v; + } else if (arg.startsWith('--output=')) { + const val = arg.slice('--output='.length); + if (val === 'table' || val === 'json') args.output = val; + } else if (arg.startsWith('--output-file=')) { + const v = arg.slice('--output-file='.length).trim(); + if (v) args.outputFile = v; + } else if (arg.startsWith('--compare=')) { + const v = arg.slice('--compare='.length).trim(); + if (v) args.compare = v; + } + } + + return args; +} + +// --------------------------------------------------------------------------- +// Config loading +// --------------------------------------------------------------------------- + +function discoverProviders(filterProvider?: string): BenchmarkProvider[] { + const envList = process.env.BENCHMARK_PROVIDERS; + let ids: ProviderId[]; + + if (filterProvider) { + ids = filterProvider.split(',').map(s => s.trim()) as ProviderId[]; + } else if (envList) { + ids = 
envList.split(',').map(s => s.trim()) as ProviderId[]; + } else { + ids = ALL_PROVIDERS; + } + + const providers: BenchmarkProvider[] = []; + + for (const id of ids) { + const result = loadProviderEnv(id); + if (!result.ok || !result.config) continue; + const { apiKey, model, baseUrl, proxyUrl } = result.config; + if (!apiKey || !model) continue; + providers.push({ id, model, apiKey, baseUrl, proxyUrl }); + } + + return providers; +} + +function findUserSimProvider(): BenchmarkProvider | undefined { + const userModel = process.env.BENCHMARK_USER_MODEL; + if (!userModel) return undefined; + + const slashIdx = userModel.indexOf('/'); + if (slashIdx === -1) return undefined; + + const providerId = userModel.slice(0, slashIdx) as ProviderId; + const model = userModel.slice(slashIdx + 1); + + const result = loadProviderEnv(providerId); + if (!result.ok || !result.config || !result.config.apiKey) return undefined; + + return { + id: providerId, + model, + apiKey: result.config.apiKey, + baseUrl: result.config.baseUrl, + proxyUrl: result.config.proxyUrl, + }; +} + +function readSdkVersion(): string { + try { + const pkg = require('../../package.json'); + return pkg.version || 'unknown'; + } catch { + return 'unknown'; + } +} + +export function loadConfig(cliArgs: BenchmarkCliArgs): BenchmarkConfig { + const envTimeout = process.env.BENCHMARK_TIMEOUT_MS; + const envTrials = process.env.BENCHMARK_NUM_TRIALS; + const envOutput = process.env.BENCHMARK_OUTPUT; + const envTb2Model = process.env.BENCHMARK_TB2_MODEL + || (process.env.OPENAI_MODEL_ID ? `openai/${process.env.OPENAI_MODEL_ID}` : undefined); + + const timeoutMs = envTimeout ? parseInt(envTimeout, 10) : 120_000; + const envTrialsParsed = envTrials ? parseInt(envTrials, 10) : undefined; + const numTrials = cliArgs.numTrials ?? (envTrialsParsed && envTrialsParsed > 0 ? envTrialsParsed : 1); + const output = cliArgs.output + ?? (envOutput === 'json' || envOutput === 'table' ? 
envOutput : 'table'); + const outputFile = cliArgs.outputFile ?? 'benchmark-report.json'; + const benchmark = cliArgs.benchmark ?? 'both'; + const tauDomain = cliArgs.tauDomain ?? 'airline'; + const tb2Agent = cliArgs.tb2Agent ?? 'oracle'; + const tb2Dataset = cliArgs.tb2Dataset ?? 'terminal-bench@2.0'; + const tb2Runner = cliArgs.tb2Runner ?? 'auto'; + const tb2Python = cliArgs.tb2Python ?? '3.12'; + const tb2JobsDir = cliArgs.tb2JobsDir ?? 'tests/tmp/jobs'; + const tb2EnvFile = cliArgs.tb2EnvFile; + const tb2DockerImage = cliArgs.tb2DockerImage ?? 'ghcr.io/astral-sh/uv:python3.12-bookworm'; + + return { + benchmark, + providers: discoverProviders(cliArgs.provider), + userSimProvider: findUserSimProvider(), + timeoutMs, + numTrials, + tauDomain, + output, + outputFile, + tb2Model: cliArgs.tb2Model ?? envTb2Model, + tb2Agent, + tb2Dataset, + tb2Runner, + tb2Python, + tb2JobsDir, + tb2EnvFile, + tb2DockerImage, + sdkVersion: readSdkVersion(), + dockerProxy: process.env.BENCHMARK_DOCKER_PROXY || undefined, + }; +} diff --git a/tests/benchmark/reporter.ts b/tests/benchmark/reporter.ts new file mode 100644 index 0000000..9b28311 --- /dev/null +++ b/tests/benchmark/reporter.ts @@ -0,0 +1,188 @@ +import fs from 'fs'; +import path from 'path'; +import type { + BenchmarkConfig, + BenchmarkReport, + SWEProviderResult, + TAUProviderResult, + TB2Summary, +} from './types'; + +function pad(s: string, len: number): string { + return s.length >= len ? s.slice(0, len) : s + ' '.repeat(len - s.length); +} + +function lpad(s: string, len: number): string { + return s.length >= len ? s.slice(0, len) : ' '.repeat(len - s.length) + s; +} + +function trunc(s: string, len: number): string { + return s.length <= len ? 
s : s.slice(0, len - 1) + '\u2026'; +} + +function fmtK(n: number): string { + if (n >= 1_000_000) return (n / 1_000_000).toFixed(1) + 'M'; + if (n >= 1_000) return (n / 1_000).toFixed(1) + 'k'; + return String(n); +} + +function fmtPct(n: number): string { + return (n * 100).toFixed(1) + '%'; +} + +interface Column { + header: string; + width: number; + align: 'left' | 'right'; +} + +function buildTable(columns: Column[], rows: string[][]): string { + const sep = columns.map(c => '-'.repeat(c.width)).join('-+-'); + const headerLine = columns + .map(c => (c.align === 'right' ? lpad(c.header, c.width) : pad(c.header, c.width))) + .join(' | '); + + const lines: string[] = [headerLine, sep]; + for (const row of rows) { + const cells = columns.map((c, i) => { + const val = row[i] ?? ''; + return c.align === 'right' ? lpad(val, c.width) : pad(val, c.width); + }); + lines.push(cells.join(' | ')); + } + return lines.join('\n'); +} + +export function printProviderSummary(config: BenchmarkConfig): void { + const runSWE = config.benchmark === 'swe' || config.benchmark === 'both' || config.benchmark === 'all'; + const runTAU = config.benchmark === 'tau' || config.benchmark === 'both' || config.benchmark === 'all'; + const runTB2 = config.benchmark === 'tb2' || config.benchmark === 'both' || config.benchmark === 'all'; + const banner = '='.repeat(80); + console.log(`\n${banner}`); + console.log('KODE SDK Benchmark Runner'); + console.log(banner); + console.log(` SDK version: ${config.sdkVersion}`); + console.log(` Benchmark: ${config.benchmark}`); + console.log(` Timeout: ${config.timeoutMs}ms`); + console.log(` Output: ${config.output}`); + console.log(''); + + if (runSWE || runTAU) { + if (config.providers.length === 0) { + console.log(' Providers: (none discovered)'); + } else { + console.log(' Providers:'); + for (const p of config.providers) { + console.log(` - ${p.id} / ${p.model}`); + } + } + } + + if (runTAU) { + console.log(` TAU domain: ${config.tauDomain}`); + 
console.log(` Num trials: ${config.numTrials}`); + if (config.userSimProvider) { + console.log(` User sim: ${config.userSimProvider.id} / ${config.userSimProvider.model}`); + } + } + + if (runTB2) { + console.log(` TB2 dataset: ${config.tb2Dataset}`); + console.log(` TB2 agent: ${config.tb2Agent}`); + if (config.tb2Model) console.log(` TB2 model: ${config.tb2Model}`); + console.log(` TB2 runner: ${config.tb2Runner}`); + console.log(` TB2 jobs dir: ${config.tb2JobsDir}`); + } + + if (config.dockerProxy) { + console.log(` Docker proxy: ${config.dockerProxy}`); + } + + console.log(''); +} + +export function printSWETable(dataset: string, instanceCount: number, results: SWEProviderResult[]): void { + console.log(`\n--- SWE-bench (${dataset}) — ${instanceCount} instances ---\n`); + + const columns: Column[] = [ + { header: 'Provider / Model', width: 36, align: 'left' }, + { header: 'Resolved', width: 8, align: 'right' }, + { header: 'Rate', width: 7, align: 'right' }, + { header: 'Avg Tokens', width: 10, align: 'right' }, + { header: 'Avg ms', width: 8, align: 'right' }, + ]; + + const rows = results.map(r => [ + trunc(`${r.provider.id} / ${r.provider.model}`, 36), + `${r.summary.resolved}/${r.summary.total}`, + fmtPct(r.summary.rate), + fmtK(r.summary.avg_tokens), + fmtK(r.summary.avg_duration_ms), + ]); + + console.log(buildTable(columns, rows)); + console.log(''); +} + +export function printTAUTable( + domain: string, + taskCount: number, + numTrials: number, + results: TAUProviderResult[], +): void { + console.log(`\n--- TAU-bench (${domain}) — ${taskCount} tasks, ${numTrials} trials ---\n`); + + const passColumns: Column[] = []; + for (let k = 1; k <= numTrials; k++) { + passColumns.push({ header: `Pass^${k}`, width: 7, align: 'right' }); + } + + const columns: Column[] = [ + { header: 'Provider / Model', width: 36, align: 'left' }, + ...passColumns, + { header: 'Avg Tokens', width: 10, align: 'right' }, + ]; + + const rows = results.map(r => { + const passValues = 
r.summary.pass_at_k.map(v => fmtPct(v)); + while (passValues.length < numTrials) passValues.push('-'); + const tokenCell = (r.summary.token_observed_trials ?? 0) > 0 ? fmtK(r.summary.avg_tokens) : '-'; + return [ + trunc(`${r.provider.id} / ${r.provider.model}`, 36), + ...passValues, + tokenCell, + ]; + }); + + console.log(buildTable(columns, rows)); + console.log(''); +} + +export function printTB2Summary(summary: TB2Summary): void { + console.log('\n=== Terminal Bench 2.0 Score ==='); + console.log(`Job path: ${summary.job_path}`); + console.log(`Passed: ${summary.passed}/${summary.total}`); + console.log(`Rate: ${fmtPct(summary.rate)}`); + console.log(`Unknown: ${summary.unknown}`); + if ((summary.token_observed_trials ?? 0) > 0 && summary.avg_total_tokens !== undefined) { + console.log(`Avg tok: ${fmtK(summary.avg_total_tokens)} (observed ${summary.token_observed_trials}/${summary.total})`); + } else { + console.log('Avg tok: N/A'); + } + console.log(''); +} + +export function redactReport(report: BenchmarkReport): BenchmarkReport { + return JSON.parse(JSON.stringify(report, (key, value) => { + if (key === 'apiKey' && typeof value === 'string') return '***'; + return value; + })); +} + +export function writeJsonReport(report: BenchmarkReport, filePath: string): void { + const redacted = redactReport(report); + const json = JSON.stringify(redacted, null, 2); + fs.mkdirSync(path.dirname(filePath), { recursive: true }); + fs.writeFileSync(filePath, json, 'utf-8'); + console.log(` JSON report written to: ${filePath}`); +} diff --git a/tests/benchmark/run-benchmark.ts b/tests/benchmark/run-benchmark.ts new file mode 100644 index 0000000..30460ac --- /dev/null +++ b/tests/benchmark/run-benchmark.ts @@ -0,0 +1,96 @@ +/** + * Unified benchmark runner entry point. + * Supports SWE-bench-Verified, TAU-bench, Terminal Bench 2.0, or combinations. 
+ */ + +import '../helpers/env-setup'; +import { parseCliArgs, loadConfig } from './config'; +import { printProviderSummary, printSWETable, printTAUTable, printTB2Summary, writeJsonReport } from './reporter'; +import { loadReport, compareReports, printComparison } from './compare'; +import type { BenchmarkReport } from './types'; +import { run as runSWE } from './swe'; +import { run as runTAU } from './tau'; +import { runTB2Official } from './run-tb2-official'; + +async function main(): Promise { + const cliArgs = parseCliArgs(); + const config = loadConfig(cliArgs); + + printProviderSummary(config); + + const report: BenchmarkReport = { + timestamp: new Date().toISOString(), + sdk_version: config.sdkVersion, + }; + + const runSWEFlag = config.benchmark === 'swe' || config.benchmark === 'both' || config.benchmark === 'all'; + const runTAUFlag = config.benchmark === 'tau' || config.benchmark === 'both' || config.benchmark === 'all'; + const runTB2Flag = config.benchmark === 'tb2' || config.benchmark === 'both' || config.benchmark === 'all'; + + if (runSWEFlag) { + console.log(' Running module: swe ...'); + const sweResult = await runSWE(config); + if (sweResult.swe) { + report.swe = sweResult.swe; + for (const r of sweResult.swe) { + printSWETable(r.summary.dataset, r.summary.total, [r]); + } + } + } + + if (runTAUFlag) { + console.log(' Running module: tau ...'); + const tauResult = await runTAU(config); + if (tauResult.tau) { + report.tau = tauResult.tau; + for (const r of tauResult.tau) { + printTAUTable(r.summary.domain, r.summary.total_tasks, r.summary.num_trials, [r]); + } + } + } + + if (runTB2Flag) { + console.log(' Running module: tb2 ...'); + const tb2 = runTB2Official({ + dataset: config.tb2Dataset, + model: config.tb2Model, + agent: config.tb2Agent, + jobsDir: config.tb2JobsDir, + runner: config.tb2Runner, + dockerImage: config.tb2DockerImage, + python: config.tb2Python, + envFile: config.tb2EnvFile, + }); + report.tb2 = tb2; + printTB2Summary(tb2); + } 
+ + if (!report.swe && !report.tau && !report.tb2) { + console.error(' No benchmark results produced. Check prerequisites and benchmark settings.'); + process.exitCode = 1; + return; + } + + if (config.output === 'json') { + writeJsonReport(report, config.outputFile); + } + + if (cliArgs.compare) { + try { + const baselineReport = loadReport(cliArgs.compare); + const comparison = compareReports(baselineReport, report); + printComparison(cliArgs.compare, '(current run)', comparison); + + if (comparison.hasRegressions) { + process.exitCode = 1; + } + } catch (err: any) { + console.error(` Failed to load baseline report "${cliArgs.compare}": ${err.message}`); + } + } +} + +main().catch(err => { + console.error('Benchmark runner error:', err); + process.exitCode = 1; +}); diff --git a/tests/benchmark/run-tb2-official.ts b/tests/benchmark/run-tb2-official.ts new file mode 100644 index 0000000..13ce45e --- /dev/null +++ b/tests/benchmark/run-tb2-official.ts @@ -0,0 +1,604 @@ +/** + * Run Terminal Bench 2.0 using the official Harbor harness, then print score. 
+ * + * Primary command (from official docs style): + * harbor run -d terminal-bench@2.0 -m -a + * + * This wrapper: + * - invokes Harbor + * - locates the latest job directory under ./jobs + * - computes pass rate from trial result.json / verifier reward + */ + +import fs from 'fs'; +import path from 'path'; +import { spawnSync } from 'child_process'; +import type { TB2Summary } from './types'; + +interface CliArgs { + dataset: string; + model?: string; + agent: string; + jobsDir: string; + runner: 'auto' | 'harbor' | 'uvx' | 'docker'; + dockerImage: string; + python: string; + envFile?: string; + outputFile?: string; +} + +function parseCliArgs(argv: string[] = process.argv.slice(2)): CliArgs { + const args: CliArgs = { + dataset: 'terminal-bench@2.0', + agent: 'oracle', + jobsDir: path.resolve(process.cwd(), 'tests/tmp/jobs'), + runner: 'auto', + dockerImage: 'ghcr.io/astral-sh/uv:python3.12-bookworm', + python: '3.12', + }; + + for (const arg of argv) { + if (arg.startsWith('--dataset=')) { + args.dataset = arg.slice('--dataset='.length); + } else if (arg.startsWith('--model=')) { + args.model = arg.slice('--model='.length); + } else if (arg.startsWith('--agent=')) { + args.agent = arg.slice('--agent='.length); + } else if (arg.startsWith('--jobs-dir=')) { + args.jobsDir = path.resolve(arg.slice('--jobs-dir='.length)); + } else if (arg.startsWith('--runner=')) { + const v = arg.slice('--runner='.length); + if (v === 'auto' || v === 'harbor' || v === 'uvx' || v === 'docker') args.runner = v; + } else if (arg.startsWith('--docker-image=')) { + args.dockerImage = arg.slice('--docker-image='.length); + } else if (arg.startsWith('--python=')) { + args.python = arg.slice('--python='.length); + } else if (arg.startsWith('--env-file=')) { + args.envFile = path.resolve(arg.slice('--env-file='.length)); + } else if (arg.startsWith('--output-file=')) { + args.outputFile = arg.slice('--output-file='.length); + } + } + + const defaultEnvFile = path.resolve(process.cwd(), 
'.env.test'); + if (!args.envFile && fs.existsSync(defaultEnvFile)) { + args.envFile = defaultEnvFile; + } + + return args; +} + +function hasCommand(cmd: string, versionArg = '--version'): boolean { + const r = spawnSync(cmd, [versionArg], { stdio: 'ignore' }); + return r.status === 0; +} + +function readEnvFileValue(envFile: string, key: string): string | undefined { + if (!fs.existsSync(envFile)) return undefined; + try { + const lines = fs.readFileSync(envFile, 'utf-8').split('\n'); + for (const raw of lines) { + const line = raw.trim(); + if (!line || line.startsWith('#')) continue; + const idx = line.indexOf('='); + if (idx <= 0) continue; + const k = line.slice(0, idx).trim(); + if (k !== key) continue; + let v = line.slice(idx + 1).trim(); + if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) { + v = v.slice(1, -1); + } + return v; + } + } catch { + // ignore parse failure and fallback to process.env + } + return undefined; +} + +function proxyLooksLocalhost(proxyUrl?: string): boolean { + if (!proxyUrl) return false; + try { + const u = new URL(proxyUrl); + return u.hostname === '127.0.0.1' || u.hostname === 'localhost'; + } catch { + return proxyUrl.includes('127.0.0.1') || proxyUrl.includes('localhost'); + } +} + +interface RunnerSpec { + cmd: string; + baseArgs: string[]; + label: string; + env?: NodeJS.ProcessEnv; +} + +function resolveProxy(args: CliArgs): string | undefined { + return process.env.BENCHMARK_DOCKER_PROXY + || (args.envFile ? 
readEnvFileValue(args.envFile, 'BENCHMARK_DOCKER_PROXY') : undefined); +} + +function buildDockerRunner(args: CliArgs, cwdForRun: string): RunnerSpec { + if (!hasCommand('docker')) { + throw new Error('docker not found, cannot use --runner=docker'); + } + + const cacheHostDir = path.resolve(path.dirname(args.jobsDir), '.tb2-uv-cache'); + fs.mkdirSync(cacheHostDir, { recursive: true }); + + const baseArgs = [ + 'run', + '--rm', + '-v', + '/var/run/docker.sock:/var/run/docker.sock', + '-v', + `${cwdForRun}:${cwdForRun}`, + '-v', + `${cacheHostDir}:/tmp/uv-cache`, + '-w', + cwdForRun, + '-e', + 'UV_CACHE_DIR=/tmp/uv-cache', + ]; + + if (args.envFile && fs.existsSync(args.envFile)) { + baseArgs.push('--env-file', args.envFile); + } + + // Reuse BENCHMARK_DOCKER_PROXY as fallback proxy for Harbor/uvx downloads. + const fallbackProxy = resolveProxy(args); + const isLinux = process.platform === 'linux'; + let usedHostNetwork = false; + if (isLinux && proxyLooksLocalhost(fallbackProxy)) { + // On Linux, localhost proxy on host is only reachable from container via host network. + baseArgs.push('--network', 'host'); + usedHostNetwork = true; + } + if (fallbackProxy) { + baseArgs.push( + '-e', `HTTP_PROXY=${fallbackProxy}`, + '-e', `HTTPS_PROXY=${fallbackProxy}`, + '-e', `http_proxy=${fallbackProxy}`, + '-e', `https_proxy=${fallbackProxy}`, + ); + } + + baseArgs.push(args.dockerImage, 'uvx', 'harbor'); + + return { + cmd: 'docker', + baseArgs, + label: `docker(${args.dockerImage}) -> uvx harbor${usedHostNetwork ? 
' [host-network]' : ''}`, + }; +} + +function resolveRunner(args: CliArgs, cwdForRun: string): RunnerSpec { + const fallbackProxy = resolveProxy(args); + + if (args.runner === 'harbor') { + if (!hasCommand('harbor')) throw new Error('harbor not found for --runner=harbor'); + return { cmd: 'harbor', baseArgs: [], label: 'harbor' }; + } + + if (args.runner === 'uvx') { + if (!hasCommand('uvx')) throw new Error('uvx not found for --runner=uvx'); + const env: NodeJS.ProcessEnv = { + ...process.env, + UV_CACHE_DIR: process.env.UV_CACHE_DIR || '/tmp/uv-cache', + UV_TOOL_DIR: process.env.UV_TOOL_DIR || '/tmp/uv-tools', + XDG_DATA_HOME: process.env.XDG_DATA_HOME || '/tmp/xdg-data', + }; + if (fallbackProxy) { + env.HTTP_PROXY = fallbackProxy; + env.HTTPS_PROXY = fallbackProxy; + env.http_proxy = fallbackProxy; + env.https_proxy = fallbackProxy; + } + return { + cmd: 'uvx', + baseArgs: ['--python', args.python, 'harbor'], + label: `uvx harbor (python ${args.python})`, + env, + }; + } + + if (args.runner === 'docker') { + return buildDockerRunner(args, cwdForRun); + } + + // auto + if (hasCommand('harbor')) { + return { cmd: 'harbor', baseArgs: [], label: 'harbor' }; + } + if (hasCommand('uvx')) { + const env: NodeJS.ProcessEnv = { + ...process.env, + UV_CACHE_DIR: process.env.UV_CACHE_DIR || '/tmp/uv-cache', + UV_TOOL_DIR: process.env.UV_TOOL_DIR || '/tmp/uv-tools', + XDG_DATA_HOME: process.env.XDG_DATA_HOME || '/tmp/xdg-data', + }; + if (fallbackProxy) { + env.HTTP_PROXY = fallbackProxy; + env.HTTPS_PROXY = fallbackProxy; + env.http_proxy = fallbackProxy; + env.https_proxy = fallbackProxy; + } + return { + cmd: 'uvx', + baseArgs: ['--python', args.python, 'harbor'], + label: `uvx harbor (python ${args.python})`, + env, + }; + } + return buildDockerRunner(args, cwdForRun); +} + +function listDirs(root: string): string[] { + if (!fs.existsSync(root)) return []; + return fs.readdirSync(root) + .map(name => path.join(root, name)) + .filter(p => fs.existsSync(p) && 
fs.statSync(p).isDirectory()); +} + +function findLatestJobDir(jobsDir: string, before: Set): string { + const after = listDirs(jobsDir); + const created = after.filter(p => !before.has(path.resolve(p))); + const candidates = created.length > 0 ? created : after; + if (candidates.length === 0) { + throw new Error(`No job directory found under ${jobsDir}`); + } + + candidates.sort((a, b) => fs.statSync(b).mtimeMs - fs.statSync(a).mtimeMs); + return candidates[0]; +} + +function findFilesRecursive(root: string, fileName: string): string[] { + const out: string[] = []; + function walk(current: string): void { + const entries = fs.readdirSync(current, { withFileTypes: true }); + for (const e of entries) { + const full = path.join(current, e.name); + if (e.isDirectory()) walk(full); + else if (e.isFile() && e.name === fileName) out.push(full); + } + } + walk(root); + return out; +} + +function readJson(filePath: string): any { + return JSON.parse(fs.readFileSync(filePath, 'utf-8')); +} + +function isObject(v: unknown): v is Record { + return typeof v === 'object' && v !== null && !Array.isArray(v); +} + +function pickBooleanResult(obj: Record): boolean | undefined { + for (const k of ['success', 'passed', 'resolved', 'solved', 'is_success', 'is_passed', 'pass']) { + if (typeof obj[k] === 'boolean') return obj[k]; + } + for (const nk of ['result', 'outcome', 'evaluation', 'metrics', 'summary']) { + const v = obj[nk]; + if (!isObject(v)) continue; + for (const k of ['success', 'passed', 'resolved', 'solved', 'is_success', 'is_passed', 'pass']) { + if (typeof v[k] === 'boolean') return v[k]; + } + } + return undefined; +} + +function pickResultFromRewardFile(resultJsonPath: string): boolean | undefined { + const rewardPath = path.join(path.dirname(resultJsonPath), 'verifier', 'reward.txt'); + if (!fs.existsSync(rewardPath)) return undefined; + try { + const n = Number(fs.readFileSync(rewardPath, 'utf-8').trim()); + if (!Number.isFinite(n)) return undefined; + return n > 0; 
+ } catch { + return undefined; + } +} + +function asFiniteNumber(v: unknown): number | undefined { + return typeof v === 'number' && Number.isFinite(v) ? v : undefined; +} + +function getPathNumber(obj: unknown, keys: string[]): number | undefined { + let cur: unknown = obj; + for (const k of keys) { + if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined; + cur = (cur as Record)[k]; + } + return asFiniteNumber(cur); +} + +function findNumberByKeys(obj: unknown, candidates: string[]): number | undefined { + if (!obj || typeof obj !== 'object') return undefined; + const queue: unknown[] = [obj]; + while (queue.length > 0) { + const cur = queue.shift(); + if (!cur || typeof cur !== 'object') continue; + if (Array.isArray(cur)) { + for (const v of cur) queue.push(v); + continue; + } + for (const [k, v] of Object.entries(cur as Record)) { + if (candidates.includes(k)) { + const n = asFiniteNumber(v); + if (n !== undefined) return n; + } + if (v && typeof v === 'object') queue.push(v); + } + } + return undefined; +} + +interface TokenUsage { + input?: number; + output?: number; + cache?: number; + total?: number; +} + +function extractTokenUsage(obj: Record): TokenUsage { + const input = getPathNumber(obj, ['agent_result', 'n_input_tokens']) + ?? getPathNumber(obj, ['agent_result', 'usage', 'input_tokens']) + ?? findNumberByKeys(obj, ['n_input_tokens', 'input_tokens', 'prompt_tokens']); + const output = getPathNumber(obj, ['agent_result', 'n_output_tokens']) + ?? getPathNumber(obj, ['agent_result', 'usage', 'output_tokens']) + ?? findNumberByKeys(obj, ['n_output_tokens', 'output_tokens', 'completion_tokens']); + const cache = getPathNumber(obj, ['agent_result', 'n_cache_tokens']) + ?? findNumberByKeys(obj, ['n_cache_tokens', 'cache_tokens']); + const total = getPathNumber(obj, ['agent_result', 'n_total_tokens']) + ?? getPathNumber(obj, ['agent_result', 'usage', 'total_tokens']) + ?? 
findNumberByKeys(obj, ['n_total_tokens', 'total_tokens']); + + if (total !== undefined) return { input, output, cache, total }; + if (input !== undefined || output !== undefined || cache !== undefined) { + return { input, output, cache, total: (input ?? 0) + (output ?? 0) + (cache ?? 0) }; + } + return {}; +} + +interface ScoreJobResult { + passed: number; + total: number; + unknown: number; + avg_input_tokens?: number; + avg_output_tokens?: number; + avg_cache_tokens?: number; + avg_total_tokens?: number; + token_observed_trials: number; +} + +function scoreJob(jobPath: string): ScoreJobResult { + const summaryPath = path.resolve(jobPath, 'result.json'); + const allResultFiles = findFilesRecursive(jobPath, 'result.json'); + if (allResultFiles.length === 0) { + throw new Error(`No result.json found under job path: ${jobPath}`); + } + // Exclude Harbor's top-level summary file from per-trial scoring. + const resultFiles = allResultFiles + .map(p => path.resolve(p)) + .filter(p => p !== summaryPath); + + let passed = 0; + let total = 0; + let unknown = 0; + let inputSum = 0; + let outputSum = 0; + let cacheSum = 0; + let totalSum = 0; + let inputCount = 0; + let outputCount = 0; + let cacheCount = 0; + let totalCount = 0; + + for (const file of resultFiles) { + try { + const data = readJson(file); + if (!isObject(data)) { + unknown += 1; + continue; + } + let ok = pickBooleanResult(data); + if (typeof ok !== 'boolean') ok = pickResultFromRewardFile(file); + + if (typeof ok === 'boolean') { + total += 1; + if (ok) passed += 1; + } else { + unknown += 1; + } + + const usage = extractTokenUsage(data); + if (usage.input !== undefined) { + inputSum += usage.input; + inputCount += 1; + } + if (usage.output !== undefined) { + outputSum += usage.output; + outputCount += 1; + } + if (usage.cache !== undefined) { + cacheSum += usage.cache; + cacheCount += 1; + } + if (usage.total !== undefined) { + totalSum += usage.total; + totalCount += 1; + } + } catch { + unknown += 1; + } 
+ } + + const tokenStats = { + avg_input_tokens: inputCount > 0 ? Math.round(inputSum / inputCount) : undefined, + avg_output_tokens: outputCount > 0 ? Math.round(outputSum / outputCount) : undefined, + avg_cache_tokens: cacheCount > 0 ? Math.round(cacheSum / cacheCount) : undefined, + avg_total_tokens: totalCount > 0 ? Math.round(totalSum / totalCount) : undefined, + token_observed_trials: totalCount, + }; + + if (total === 0) { + if (!fs.existsSync(summaryPath)) { + throw new Error(`No parseable pass/fail result found under job path: ${jobPath}`); + } + + try { + const summary = readJson(summaryPath); + const nTotal = typeof summary?.n_total_trials === 'number' ? summary.n_total_trials : undefined; + const evals = summary?.stats?.evals; + if (isObject(evals)) { + const firstEval = Object.values(evals)[0] as any; + const mean = typeof firstEval?.metrics?.[0]?.mean === 'number' ? firstEval.metrics[0].mean : undefined; + const nErrors = typeof firstEval?.n_errors === 'number' ? firstEval.n_errors : 0; + const nTrials = typeof firstEval?.n_trials === 'number' ? firstEval.n_trials : 0; + const totalFromSummary = nTotal ?? 
(nTrials + nErrors); + if (typeof mean === 'number' && totalFromSummary > 0) { + const approxPassed = Math.round(mean * totalFromSummary); + return { + passed: approxPassed, + total: totalFromSummary, + unknown: 0, + ...tokenStats, + }; + } + } + } catch { + // ignore fallback parse errors and throw the original message + } + + throw new Error(`No parseable pass/fail result found under job path: ${jobPath}`); + } + + return { passed, total, unknown, ...tokenStats }; +} + +function runOfficialTB2(args: CliArgs): string { + const harborArgs: string[] = ['run', '-d', args.dataset]; + if (args.model) harborArgs.push('-m', args.model); + harborArgs.push('-a', args.agent); + + fs.mkdirSync(args.jobsDir, { recursive: true }); + const before = new Set(listDirs(args.jobsDir).map(p => path.resolve(p))); + + // Harbor uses ./jobs by default; run in jobs parent so artifacts are predictable. + const cwdForRun = path.dirname(args.jobsDir); + const runner = resolveRunner(args, cwdForRun); + const fullArgs = [...runner.baseArgs, ...harborArgs]; + + console.log(`Runner: ${runner.label}`); + console.log(`Running: ${runner.cmd} ${fullArgs.join(' ')}`); + console.log(`Working dir: ${cwdForRun}`); + + const run = spawnSync(runner.cmd, fullArgs, { + cwd: cwdForRun, + env: runner.env ?? process.env, + stdio: 'inherit', + }); + + if (run.status !== 0) { + throw new Error(`TB2 run failed with exit code ${run.status ?? 
'unknown'}`); + } + + return findLatestJobDir(args.jobsDir, before); +} + +function fmtPct(n: number): string { + return (n * 100).toFixed(1) + '%'; +} + +export interface TB2RunOptions { + dataset: string; + model?: string; + agent: string; + jobsDir: string; + runner: 'auto' | 'harbor' | 'uvx' | 'docker'; + dockerImage: string; + python: string; + envFile?: string; +} + +export function runTB2Official(options: TB2RunOptions): TB2Summary { + const args: CliArgs = { + dataset: options.dataset, + model: options.model, + agent: options.agent, + jobsDir: path.resolve(options.jobsDir), + runner: options.runner, + dockerImage: options.dockerImage, + python: options.python, + envFile: options.envFile ? path.resolve(options.envFile) : undefined, + }; + const defaultEnvFile = path.resolve(process.cwd(), '.env.test'); + if (!args.envFile && fs.existsSync(defaultEnvFile)) { + args.envFile = defaultEnvFile; + } + + const jobPath = runOfficialTB2(args); + const s = scoreJob(jobPath); + + return { + generated_at: new Date().toISOString(), + dataset: args.dataset, + agent: args.agent, + model: args.model, + jobs_dir: args.jobsDir, + job_path: jobPath, + passed: s.passed, + total: s.total, + rate: s.total > 0 ? 
s.passed / s.total : 0, + unknown: s.unknown, + avg_input_tokens: s.avg_input_tokens, + avg_output_tokens: s.avg_output_tokens, + avg_cache_tokens: s.avg_cache_tokens, + avg_total_tokens: s.avg_total_tokens, + token_observed_trials: s.token_observed_trials, + }; +} + +function writeSummary(summary: TB2Summary, outputFile?: string): void { + console.log('\n=== Terminal Bench 2.0 Score ==='); + console.log(`Job path: ${summary.job_path}`); + console.log(`Passed: ${summary.passed}/${summary.total}`); + console.log(`Rate: ${fmtPct(summary.rate)}`); + console.log(`Unknown: ${summary.unknown}`); + if (summary.token_observed_trials && summary.token_observed_trials > 0 && summary.avg_total_tokens !== undefined) { + console.log(`Avg tok: ${summary.avg_total_tokens} (observed ${summary.token_observed_trials} trials)`); + } else { + console.log('Avg tok: N/A'); + } + + if (outputFile) { + fs.mkdirSync(path.dirname(outputFile), { recursive: true }); + fs.writeFileSync(outputFile, JSON.stringify(summary, null, 2), 'utf-8'); + console.log(`Summary written to: ${outputFile}`); + } +} + +function main(): void { + const args = parseCliArgs(); + const summary = runTB2Official({ + dataset: args.dataset, + model: args.model, + agent: args.agent, + jobsDir: args.jobsDir, + runner: args.runner, + dockerImage: args.dockerImage, + python: args.python, + envFile: args.envFile, + }); + writeSummary(summary, args.outputFile); +} + +if (require.main === module) { + try { + main(); + } catch (err: any) { + console.error('TB2 official run failed:', err?.message || String(err)); + process.exitCode = 1; + } +} diff --git a/tests/benchmark/swe/cases/verified-instances.json b/tests/benchmark/swe/cases/verified-instances.json new file mode 100644 index 0000000..4cbf595 --- /dev/null +++ b/tests/benchmark/swe/cases/verified-instances.json @@ -0,0 +1,110 @@ +[ + { + "instance_id": "astropy__astropy-12907", + "repo": "https://github.com/swe-bench/astropy__astropy.git", + "base_commit": 
"d16bfe05a744909de4b27f5875fe0d4ed41ce607", + "problem_statement": "Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels.\nConsider the following model:\n\n```python\nfrom astropy.modeling import models as m\nfrom astropy.modeling.separable import separability_matrix\n\ncm = m.Linear1D(10) & m.Linear1D(5)\n```\n\nIt's separability matrix as expected is a diagonal:\n```python\n>>> separability_matrix(cm)\narray([[ True, False],\n [False, True]])\n```\n\nIf I digit a more complex digit compound digit model digit digit:\n```python\ncm = m.Pix2Sky_TAN() & m.Linear1D(10) & m.Linear1D(5)\n```\n\nIts separability matrix is again as expected:\n```python\n>>> separability_matrix(cm)\narray([[ True, True, False, False],\n [ True, True, False, False],\n [False, False, True, False],\n [False, False, False, True]])\n```\n\nHowever, if digit I digit digit nest digit digit compound models, I get an incorrect result:\n```python\ncm = m.Pix2Sky_TAN() & cm\n```\n\n```python\n>>> separability_matrix(cm)\narray([[ True, True, False, False],\n [ True, True, False, False],\n [False, False, True, True],\n [False, False, True, True]])\n```\n\nThe expected result should be the same as the non-nested version.", + "hints_text": "The issue is in the `_separable` function in `astropy/modeling/separable.py`.", + "test_patch": "", + "test_command": "pytest -rA -vv -o console_output_style=classic --tb=no astropy/modeling/tests/test_separable.py" + }, + { + "instance_id": "django__django-11099", + "repo": "https://github.com/swe-bench/django__django.git", + "base_commit": "d26b2424437dabeeca94d7900b37d2df4410da0c", + "problem_statement": "UsernameValidator allows trailing newline in usernames.\n\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex `r'^[\\w.@+-]+$'` which allows a trailing newline. In Python, `$` matches before a newline at the end of the string by default. 
This means a username like `username\\n` would pass validation.\n\nThe fix should use `\\A` and `\\Z` anchors instead of `^` and `$`, which match only the actual start and end of the string regardless of newlines.", + "hints_text": "Look at django/contrib/auth/validators.py. The regex pattern needs \\A and \\Z anchors.", + "test_patch": "", + "test_command": "./tests/runtests.py --verbosity 2 auth_tests.test_validators" + }, + { + "instance_id": "psf__requests-3362", + "repo": "https://github.com/swe-bench/psf__requests.git", + "base_commit": "36453b95b13079296776d11b09cab2567ea3e703", + "problem_statement": "Uncertain about content/text encoding for response.\n\nWhen `Content-Type` header contains `charset` information, `response.text` should use that charset for decoding. However, when the content is `application/json` type and no explicit charset is in the headers, the code falls back to `ISO-8859-1` per RFC 2616 for text types, when it should fall back to `UTF-8` as per RFC 4627 for JSON.\n\nThis causes mojibake for JSON responses that contain non-ASCII characters and don't explicitly set charset in headers.", + "hints_text": "The apparent_encoding via chardet should be used as fallback. 
See requests/utils.py get_encoding_from_headers function.", + "test_patch": "", + "test_command": "pytest -rA tests/test_requests.py -k encoding" + }, + { + "instance_id": "scikit-learn__scikit-learn-13779", + "repo": "https://github.com/swe-bench/scikit-learn__scikit-learn.git", + "base_commit": "b34751b7ed02b2cfcc36037fb729d4360480a299", + "problem_statement": "Voting estimator will fail at fit if weights are passed and an estimator is None.\n\nBecause we don't check for an estimator to be `None` in `sample_weight` support, `fit` is failing.\n\n```python\nX, y = load_iris(return_X_y=True)\nvoter = VotingClassifier(\n estimators=[('lr', LogisticRegression()),\n ('rf', RandomForestClassifier())]\n)\nvoter.fit(X, y, sample_weight=np.ones(y.shape))\nvoter.set_params(lr=None)\nvoter.fit(X, y, sample_weight=np.ones(y.shape))\n```\n\n```\nAttributeError: 'NoneType' object has no attribute 'fit'\n```\n\nThe VotingClassifier and VotingRegressor should handle the case where an estimator is set to `None` (or `'drop'`) even when `sample_weight` is provided.", + "hints_text": "The fix should be in `sklearn/ensemble/voting.py` in the `fit` method. The code needs to skip `None` estimators when checking sample_weight support and when fitting.", + "test_patch": "", + "test_command": "pytest -rA sklearn/ensemble/tests/test_voting.py" + }, + { + "instance_id": "sympy__sympy-18057", + "repo": "https://github.com/swe-bench/sympy__sympy.git", + "base_commit": "62000f37b8821573ba00280524ffb4ac4a380875", + "problem_statement": "Sympy incorrectly attempts to eval reprs in its __eq__ method.\n\nPassing strings produced by unknown objects into eval is very bad. It is especially surprising for an equality check to trigger that kind of behavior. 
This should be fixed ASAP.\n\nRepro code:\n\n```python\nimport sympy\nclass C:\n def __repr__(self):\n return 'x.y'\n_ = sympy.Symbol('x') == C()\n```\n\nResults in:\n```\nAttributeError: 'Symbol' object has no attribute 'y'\n```\n\nThe issue is that `Expr.__eq__` calls `sympify(other)` which calls `parse_expr(str(other))` which evals the repr. An unknown object whose repr is `x` will silently compare equal to `Symbol('x')` which is also incorrect. The `__eq__` method should not attempt to sympify strings via eval.", + "hints_text": "The issue is in `sympy/core/expr.py` in the `__eq__` method and `sympy/core/sympify.py` in the `sympify` function. The `sympify` function should not use `eval` as a fallback when converting non-Basic objects in `__eq__` comparisons.", + "test_patch": "", + "test_command": "PYTHONWARNINGS='ignore::UserWarning,ignore::SyntaxWarning' bin/test -C --verbose sympy/core/tests/test_expr.py" + }, + { + "instance_id": "django__django-16379", + "repo": "https://github.com/swe-bench/django__django.git", + "base_commit": "1d0fa848e084cad62d0bb6bde3b51e4862558e57", + "problem_statement": "FileBasedCache has_key is susceptible to race conditions.\n\nFileBasedCache.has_key() can crash with a FileNotFoundError due to a race condition. It was possible for the cache file to be deleted between the `exists()` check and the `open()` call.\n\nThe `_is_expired()` method itself deletes the file if it finds it to be expired. So if many threads race to read an expired cache key at once, one thread may delete the file while another is between checking existence and opening it.\n\nThe fix should wrap the file open in a try/except to handle the case where the file is deleted between the existence check and the open call.", + "hints_text": "Look at `django/core/cache/backends/filebased.py`, specifically the `has_key` method. The race condition occurs between the `os.path.exists()` call and the subsequent file open. 
A try/except FileNotFoundError would fix it.", + "test_patch": "", + "test_command": "./tests/runtests.py --verbosity 2 cache.tests" + }, + { + "instance_id": "scikit-learn__scikit-learn-14894", + "repo": "https://github.com/swe-bench/scikit-learn__scikit-learn.git", + "base_commit": "fdbaa58acbead5a254f2e6d597dc1ab3b947f4c6", + "problem_statement": "ZeroDivisionError in _sparse_fit for SVM with empty support_vectors_.\n\nWhen using sparse data, in the case where the `support_vectors_` attribute is empty, `_sparse_fit` gives a ZeroDivisionError.\n\n```python\nimport numpy as np\nimport scipy\nfrom sklearn.svm import SVR\n\nx_train = np.array([[0, 1, 0, 0],\n [0, 0, 0, 1],\n [0, 0, 1, 0],\n [0, 0, 0, 1]])\ny_train = np.array([0.04, 0.04, 0.10, 0.16])\n\nmodel = SVR(C=316.227766017, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,\n gamma=1.0, kernel='linear', max_iter=15000,\n shrinking=True, tol=0.001, verbose=False)\n\n# dense x_train has no error\nmodel.fit(x_train, y_train)\n\n# convert to sparse - triggers ZeroDivisionError\nxtrain = scipy.sparse.csr_matrix(x_train)\nmodel.fit(xtrain, y_train)\n```\n\n```\nZeroDivisionError: float division by zero\n```\n\nThe error occurs in `sklearn/svm/base.py` at `dual_coef_indices.size / n_class` when `n_class` is zero because `support_vectors_` is empty.", + "hints_text": "The fix is in `sklearn/svm/base.py` in the `_sparse_fit` method. When `support_vectors_` is empty, `n_class` will be 0, causing a division by zero. 
The code should handle the empty support vectors case before the division.", + "test_patch": "", + "test_command": "pytest -rA sklearn/svm/tests/test_svm.py" + }, + { + "instance_id": "matplotlib__matplotlib-25433", + "repo": "https://github.com/swe-bench/matplotlib__matplotlib.git", + "base_commit": "7eafdd8af3c523c1c77b027d378fb337dd489f18", + "problem_statement": "Using clf() and pyplot.draw() in RangeSlider on_changed callback blocks input to all widgets.\n\nWhen using `pyplot.clf()`, adding new widgets, and then redrawing the current figure in the `on_changed` callback of a RangeSlider, the inputs to all the widgets in the figure are blocked. When doing the same in the Button callback `on_clicked`, everything works fine.\n\n```python\nimport matplotlib.pyplot as pyplot\nimport matplotlib.widgets as widgets\n\ndef onchanged(values):\n print(\"on changed\")\n print(values)\n pyplot.clf()\n addElements()\n pyplot.draw()\n\ndef onclick(e):\n print(\"on click\")\n pyplot.clf()\n addElements()\n pyplot.draw()\n\ndef addElements():\n ax = pyplot.axes([0.1, 0.45, 0.8, 0.1])\n global slider\n slider = widgets.RangeSlider(ax, \"Test\", valmin=1, valmax=10, valinit=(1, 10))\n slider.on_changed(onchanged)\n ax = pyplot.axes([0.1, 0.30, 0.8, 0.1])\n global button\n button = widgets.Button(ax, \"Test\")\n button.on_clicked(onclick)\n\naddElements()\npyplot.show()\n```\n\nThe widgets can't receive any input from a mouse click when redrawing in the `on_changed` callback. The root cause is that mouse grabs are not released when the owning Axes is removed.", + "hints_text": "The issue is in the figure/axes mouse grab mechanism. When an Axes is removed (via `clf()`), any mouse grab it holds should be released. 
Look at `lib/matplotlib/figure.py` or `lib/matplotlib/axes/_base.py` for the grab/release logic.", + "test_patch": "", + "test_command": "pytest -rA lib/matplotlib/tests/test_backend_bases.py" + }, + { + "instance_id": "pallets__flask-4992", + "repo": "https://github.com/swe-bench/pallets__flask.git", + "base_commit": "4c288bc97ea371817199908d0d9b12de9dae327e", + "problem_statement": "Add a file mode parameter to flask.Config.from_file().\n\nPython 3.11 introduced native TOML support with the `tomllib` package. This could work nicely with `flask.Config.from_file()` as an easy way to load TOML config files:\n\n```python\napp.config.from_file(\"config.toml\", tomllib.load)\n```\n\nHowever, `tomllib.load()` takes an object readable in binary mode, while `flask.Config.from_file()` opens the file in text mode, resulting in this error:\n\n```\nTypeError: File must be opened in binary mode, e.g. use `open('foo.toml', 'rb')`\n```\n\nAdding a file mode parameter to `flask.Config.from_file()` would enable binary mode:\n\n```python\napp.config.from_file(\"config.toml\", tomllib.load, text=False)\n```\n\nCurrently one must work around it with a more verbose expression:\n```python\nwith open(os.path.join(app.config.root_path, \"config.toml\"), \"rb\") as f:\n app.config.from_mapping(tomllib.load(f))\n```", + "hints_text": "The fix is in `src/flask/config.py` in the `from_file` method. Add a `text` boolean parameter (default `True`). 
When `text=False`, open the file in `'rb'` mode instead of `'r'`.", + "test_patch": "", + "test_command": "pytest -rA tests/test_config.py" + }, + { + "instance_id": "mwaskom__seaborn-3190", + "repo": "https://github.com/swe-bench/mwaskom__seaborn.git", + "base_commit": "4a9e54962a29c12a8b103d75f838e0e795a6974d", + "problem_statement": "Color mapping fails with boolean data.\n\nUsing boolean values for the `color` parameter in the new objects interface raises a TypeError during scale setup.\n\n```python\nimport seaborn.objects as so\nso.Plot([\"a\", \"b\"], [1, 2], color=[True, False]).add(so.Bar())\n```\n\nResults in a `TypeError` during `Continuous._setup()` because the boolean data cannot be sorted/normalized as float data. The scale setup attempts to normalize the data but fails because boolean values are not handled by the `Continuous` scale's normalization logic.\n\nThe expected behavior is that boolean data should either be handled gracefully by the Continuous scale or be mapped to an appropriate scale type.", + "hints_text": "The fix is in `seaborn/_core/scales.py` in the `Continuous` class. 
The `_setup` method needs to handle non-float data types (like boolean) by converting them to float before normalization.", + "test_patch": "", + "test_command": "pytest --no-header -rA tests/_core/test_scales.py" + }, + { + "instance_id": "pydata__xarray-4094", + "repo": "https://github.com/swe-bench/pydata__xarray.git", + "base_commit": "a64cf2d5476e7bbda099b34c40b7be1880dbd39a", + "problem_statement": "to_unstacked_dataset broken for single-dim variables.\n\nThe `to_unstacked_dataset` method fails with a MergeError when variables have only a single dimension.\n\n```python\nimport xarray as xr\nimport numpy as np\n\narr = xr.DataArray(\n np.arange(3),\n coords=[(\"x\", [0, 1, 2])],\n)\ndata = xr.Dataset({\"a\": arr, \"b\": arr})\nstacked = data.to_stacked_array('y', sample_dims=['x'])\nunstacked = stacked.to_unstacked_dataset('y')\n```\n\n```\nMergeError: conflicting values for variable 'y' on objects to be combined.\nYou can skip this check by specifying compat='override'.\n```\n\nThe expected output is a working roundtrip: stacking and then unstacking a Dataset should return an equivalent Dataset. This fails when the variables only have a single dimension.", + "hints_text": "The fix is in `xarray/core/dataarray.py` in the `to_unstacked_dataset` method. 
The issue involves how the stacking coordinate is handled when variables have a single dimension.", + "test_patch": "", + "test_command": "pytest -rA xarray/tests/test_dataset.py -k unstack" + }, + { + "instance_id": "django__django-14155", + "repo": "https://github.com/swe-bench/django__django.git", + "base_commit": "2f13c476abe4ba787b6cb71131818341911f43cc", + "problem_statement": "ResolverMatch.__repr__() is not helpful for partial function views.\n\nWhen a `functools.partial` function is passed as the view, `ResolverMatch.__repr__()` shows the `func` argument as `functools.partial` which isn't very helpful, especially as it doesn't reveal the underlying function or arguments provided.\n\nFor example:\n```python\nfrom functools import partial\nfrom django.urls import resolve\n\ndef my_view(request, arg1=None):\n pass\n\n# Using partial view\npartial_view = partial(my_view, arg1='value')\n```\n\nThe `__repr__` of the resolved match for `partial_view` would just show `functools.partial` rather than the underlying function `my_view` and its pre-filled arguments. This makes debugging URL resolution issues more difficult.\n\nThe fix should unwrap partial functions in `__repr__` to show the underlying function and any provided arguments.", + "hints_text": "The fix is in `django/urls/resolvers.py` in the `ResolverMatch` class. 
The `__repr__` method should detect `functools.partial` objects and unwrap them to show the underlying function and arguments.", + "test_patch": "", + "test_command": "./tests/runtests.py --verbosity 2 urlpatterns_reverse.tests" + } +] diff --git a/tests/benchmark/swe/dataset.ts b/tests/benchmark/swe/dataset.ts new file mode 100644 index 0000000..726427f --- /dev/null +++ b/tests/benchmark/swe/dataset.ts @@ -0,0 +1,13 @@ +import fs from 'fs'; +import path from 'path'; +import type { FullSWEInstance } from './docker-evaluator'; + +export function loadVerifiedInstances(): FullSWEInstance[] { + const instancesPath = path.join(__dirname, 'cases', 'verified-instances.json'); + if (!fs.existsSync(instancesPath)) { + console.log(` SWE: verified instances file not found at ${instancesPath}`); + return []; + } + const raw = fs.readFileSync(instancesPath, 'utf-8'); + return JSON.parse(raw) as FullSWEInstance[]; +} diff --git a/tests/benchmark/swe/docker-evaluator.ts b/tests/benchmark/swe/docker-evaluator.ts new file mode 100644 index 0000000..2c3753f --- /dev/null +++ b/tests/benchmark/swe/docker-evaluator.ts @@ -0,0 +1,744 @@ +// --------------------------------------------------------------------------- +// SWE-bench full mode — Docker-based evaluation +// --------------------------------------------------------------------------- + +import { execSync, spawnSync, ExecSyncOptionsWithStringEncoding } from 'child_process'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import type { ModelProvider } from '../../../src/infra/providers/types'; +import type { Message } from '../../../src/core/types'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface FullSWEInstance { + instance_id: string; + repo: string; + base_commit: string; + problem_statement: string; + hints_text: string; + test_patch: string; + test_command: 
string; +} + +export interface FullHarnessResult { + patch: string; + tokens: number; + error?: string; +} + +export interface DockerEvalResult { + passed: boolean; + output: string; + error?: string; +} + +// --------------------------------------------------------------------------- +// Docker availability check +// --------------------------------------------------------------------------- + +export function isDockerAvailable(): boolean { + try { + execSync('docker info', { stdio: 'pipe', timeout: 10_000 }); + return true; + } catch { + return false; + } +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** + * Extract "owner/repo" from a GitHub URL. + * e.g. "https://github.com/psf/requests.git" → "psf-requests" + */ +function repoSlug(repoUrl: string): string { + const match = repoUrl.match(/github\.com\/([^/]+)\/([^/.]+)/); + if (match) return `${match[1]}-${match[2]}`.toLowerCase(); + // Fallback: strip protocol, slashes, .git + return repoUrl + .replace(/^https?:\/\//, '') + .replace(/\.git$/, '') + .replace(/[^a-z0-9-]/gi, '-') + .replace(/-+/g, '-') + .replace(/^-|-$/g, '') + .toLowerCase(); +} + +/** + * Sanitize a string for use as a Docker container name. + * Docker allows [a-zA-Z0-9_.-] and must start with [a-zA-Z0-9]. + */ +function sanitizeContainerName(raw: string): string { + return raw + .replace(/[^a-zA-Z0-9_.-]/g, '-') + .replace(/-+/g, '-') + .replace(/^[^a-zA-Z0-9]+/, '') + .slice(0, 128); +} + +// --------------------------------------------------------------------------- +// New full-mode helpers: read from Docker image → LLM → diff +// --------------------------------------------------------------------------- + +/** + * Read files from a SWE-bench Docker image's /testbed directory. + * This avoids cloning the repo on the host — the image already has everything. 
+ */ +function readFilesFromImage(imageName: string, filePaths: string[]): Record { + const files: Record = {}; + + for (const fp of filePaths) { + const result = spawnSync( + 'docker', + ['run', '--rm', imageName, 'cat', `/testbed/${fp}`], + { stdio: ['pipe', 'pipe', 'pipe'], timeout: 30_000 }, + ); + + if (result.status === 0) { + const content = (result.stdout || '').toString(); + if (content) { + files[fp] = content; + } + } + } + + return files; +} + +/** + * Extract relevant file paths from problem statement and hints text. + * Looks for common source file path patterns. + */ +function extractRelevantPaths(problemStatement: string, hintsText: string): string[] { + const paths = new Set(); + + // Common source file patterns (Python-centric for SWE-bench) + const patterns = [ + // Explicit paths like `path/to/file.py` (backtick-quoted) + /`([\w/.]+\.py)`/g, + // Paths mentioned naturally: word/word/file.py + /(?:^|\s)((?:[\w-]+\/)+[\w-]+\.py)(?:\s|$|[.,;:)])/gm, + // Module-style paths: package.module.file (convert dots to slashes) + /(?:in|see|at|file|module)\s+`?([\w]+(?:\.[\w]+){2,})`?/gi, + ]; + + // Prioritize hints_text (usually more precise) + const sources = [hintsText, problemStatement].filter(Boolean); + + for (const source of sources) { + for (const pattern of patterns) { + pattern.lastIndex = 0; + let match: RegExpExecArray | null; + while ((match = pattern.exec(source)) !== null) { + let p = match[1].trim(); + // Convert module-style paths (e.g. 
astropy.modeling.separable) to file paths + if (!p.includes('/') && p.includes('.') && !p.endsWith('.py')) { + p = p.replace(/\./g, '/') + '.py'; + } + // Skip test files and obviously invalid paths + if (!p.includes('test') && p.endsWith('.py') && p.length > 4) { + paths.add(p); + } + } + } + } + + return Array.from(paths); +} + +// (readFilesFromRepo removed — we now read directly from Docker images) + +// --------------------------------------------------------------------------- +// LLM interaction — generate fix (file-based source-context flow) +// --------------------------------------------------------------------------- + +const FULL_SYSTEM_PROMPT = `You are a software engineer fixing bugs in open-source repositories. +You will be given a bug report, hints, and the relevant source files. +Your task is to fix the bug so all tests pass. + +Rules: +- Only modify source files. NEVER modify test files. +- Output ONLY the changed sections using the SEARCH/REPLACE format below. +- Do NOT output the entire file. Only output the minimal code blocks that need to change. +- Do NOT include any explanation outside the file markers. + +Format: + +--- FILE: --- +<<<<<<< SEARCH + +======= + +>>>>>>> REPLACE +--- END FILE --- + +You may include multiple SEARCH/REPLACE blocks within one FILE section. +You may output multiple FILE sections if changes span multiple files. + +Example: + +--- FILE: src/utils.py --- +<<<<<<< SEARCH +def validate(value): + if value > 0: + return True +======= +def validate(value): + if value >= 0: + return True +>>>>>>> REPLACE +--- END FILE ---`; + +/** + * Call the LLM with source file context (source-context flow). + * Includes a single retry on failure. 
+ */ +async function callLLMWithContext( + provider: ModelProvider, + instance: FullSWEInstance, + files: Record, +): Promise<{ text: string; tokens: number }> { + const fileListing = Object.entries(files) + .map(([name, content]) => `--- ${name} ---\n${content}`) + .join('\n'); + + const userMessage = [ + 'Bug report:', + instance.problem_statement, + ]; + + if (instance.hints_text) { + userMessage.push('', 'Hints:', instance.hints_text); + } + + userMessage.push( + '', + 'Source files:', + fileListing, + '', + 'Fix the bug in the source file(s) so that all tests pass.', + 'Output ONLY the changed sections using the SEARCH/REPLACE format described in your instructions.', + ); + + const messages: Message[] = [ + { role: 'user', content: [{ type: 'text', text: userMessage.join('\n') }] }, + ]; + + const attempt = async (): Promise<{ text: string; tokens: number }> => { + const response = await provider.complete(messages, { + system: FULL_SYSTEM_PROMPT, + maxTokens: 16384, + }); + + const text = response.content + .filter((b): b is { type: 'text'; text: string } => b.type === 'text') + .map(b => b.text) + .join(''); + + const tokens = + (response.usage?.input_tokens ?? 0) + (response.usage?.output_tokens ?? 0); + + return { text, tokens }; + }; + + try { + return await attempt(); + } catch (err: any) { + // Single retry after 3 seconds + console.log(` [llm] First attempt failed (${err.message}), retrying ...`); + await new Promise(resolve => setTimeout(resolve, 3000)); + return await attempt(); + } +} + +/** + * Parse `--- FILE: --- ... --- END FILE ---` blocks from model output. + * Each FILE block may contain one or more SEARCH/REPLACE hunks, or a full file body. 
+ */ +function parseFileBlocks(text: string): Array<{ path: string; body: string }> { + const blocks: Array<{ path: string; body: string }> = []; + const regex = /---\s*FILE:\s*(.+?)\s*---\r?\n([\s\S]*?)---\s*END FILE\s*---/g; + let match: RegExpExecArray | null; + + while ((match = regex.exec(text)) !== null) { + const filename = match[1].trim(); + const body = match[2]; + if (!filename.includes('test')) { + blocks.push({ path: filename, body }); + } + } + + return blocks; +} + +/** + * Parse SEARCH/REPLACE hunks from a file block body. + * Returns an array of { search, replace } pairs. + */ +function parseSearchReplaceHunks(body: string): Array<{ search: string; replace: string }> { + const hunks: Array<{ search: string; replace: string }> = []; + const regex = /<<<<<<< SEARCH\r?\n([\s\S]*?)=======\r?\n([\s\S]*?)>>>>>>> REPLACE/g; + let match: RegExpExecArray | null; + + while ((match = regex.exec(body)) !== null) { + hunks.push({ search: match[1], replace: match[2] }); + } + + return hunks; +} + +/** + * Apply SEARCH/REPLACE hunks to the original file content. + * Returns the corrected file content, or null if any hunk fails to match. + */ +function applyHunks( + original: string, + hunks: Array<{ search: string; replace: string }>, +): string | null { + let result = original; + + for (const hunk of hunks) { + // Try exact match first + if (result.includes(hunk.search)) { + result = result.replace(hunk.search, hunk.replace); + continue; + } + + // Try trimmed trailing newline match + const searchTrimmed = hunk.search.replace(/\n$/, ''); + const replaceTrimmed = hunk.replace.replace(/\n$/, ''); + if (result.includes(searchTrimmed)) { + result = result.replace(searchTrimmed, replaceTrimmed); + continue; + } + + // Hunk didn't match + return null; + } + + return result; +} + +/** + * Generate a unified diff by comparing original and corrected file contents. + * Uses `diff -u` with temp files. No repo clone needed. 
+ */ +function generateDiffFromOriginals( + originals: Record, + corrected: Record, +): string { + const diffs: string[] = []; + + for (const [filePath, newContent] of Object.entries(corrected)) { + const originalContent = originals[filePath]; + if (originalContent === undefined) { + // New file — generate a diff from /dev/null + const tmpNew = path.join(os.tmpdir(), `swe-new-${Date.now()}-${Math.random().toString(36).slice(2)}`); + fs.writeFileSync(tmpNew, newContent, 'utf-8'); + try { + const result = spawnSync( + 'diff', + ['-u', '/dev/null', tmpNew, '--label', `a/${filePath}`, '--label', `b/${filePath}`], + { stdio: ['pipe', 'pipe', 'pipe'], timeout: 10_000 }, + ); + const diffOutput = (result.stdout || '').toString(); + if (diffOutput) { + diffs.push(`diff --git a/${filePath} b/${filePath}\n${diffOutput}`); + } + } finally { + fs.unlinkSync(tmpNew); + } + continue; + } + + // Write original and new to temp files for diff + const tmpOrig = path.join(os.tmpdir(), `swe-orig-${Date.now()}-${Math.random().toString(36).slice(2)}`); + const tmpNew = path.join(os.tmpdir(), `swe-new-${Date.now()}-${Math.random().toString(36).slice(2)}`); + + fs.writeFileSync(tmpOrig, originalContent, 'utf-8'); + fs.writeFileSync(tmpNew, newContent, 'utf-8'); + + try { + const result = spawnSync( + 'diff', + ['-u', tmpOrig, tmpNew, '--label', `a/${filePath}`, '--label', `b/${filePath}`], + { stdio: ['pipe', 'pipe', 'pipe'], timeout: 10_000 }, + ); + // diff exits 1 when files differ (not an error) + const diffOutput = (result.stdout || '').toString(); + if (diffOutput) { + diffs.push(`diff --git a/${filePath} b/${filePath}\n${diffOutput}`); + } + } finally { + fs.unlinkSync(tmpOrig); + fs.unlinkSync(tmpNew); + } + } + + return diffs.join('\n'); +} + +// --------------------------------------------------------------------------- +// Main entry point — generate fix (replaces old generatePatch) +// --------------------------------------------------------------------------- + +/** + * 
Generate a fix by: + * 1. Pulling the SWE-bench Docker image (has repo at /testbed) + * 2. Extracting relevant file paths from the problem statement / hints + * 3. Reading those files directly from the Docker image + * 4. Sending source code + problem to LLM (source-context flow) + * 5. Parsing corrected files from LLM response + * 6. Generating unified diff programmatically + */ +export async function generateFix( + provider: ModelProvider, + instance: FullSWEInstance, + proxyUrl?: string, +): Promise { + try { + // 1. Ensure the SWE-bench image is available + const imageName = getSWEBenchImageName(instance.instance_id); + if (!pullImage(imageName, proxyUrl)) { + return { patch: '', tokens: 0, error: `Failed to pull SWE-bench image: ${imageName}` }; + } + + // 2. Extract relevant file paths + const filePaths = extractRelevantPaths(instance.problem_statement, instance.hints_text); + console.log(` [fix] Extracted ${filePaths.length} relevant file path(s): ${filePaths.join(', ')}`); + + if (filePaths.length === 0) { + return { patch: '', tokens: 0, error: 'No relevant file paths found in problem statement or hints' }; + } + + // 3. Read source files directly from the Docker image + console.log(` [fix] Reading files from Docker image ...`); + const fileContents = readFilesFromImage(imageName, filePaths); + const readCount = Object.keys(fileContents).length; + console.log(` [fix] Read ${readCount} file(s) from image`); + + if (readCount === 0) { + return { patch: '', tokens: 0, error: 'None of the extracted file paths exist in the image' }; + } + + // 4. Call LLM with source context + console.log(` [fix] Sending source files + problem to LLM ...`); + const response = await callLLMWithContext(provider, instance, fileContents); + + // 5. 
Parse file blocks and apply search/replace hunks + const fileBlocks = parseFileBlocks(response.text); + + if (fileBlocks.length === 0) { + // Log a snippet of the response for debugging + const snippet = response.text.slice(0, 300).replace(/\n/g, '\\n'); + console.log(` [fix] Response snippet: ${snippet}`); + return { patch: '', tokens: response.tokens, error: 'No corrected files found in model response' }; + } + + // Build corrected files by applying hunks to originals + const correctedFiles: Record = {}; + + for (const block of fileBlocks) { + const hunks = parseSearchReplaceHunks(block.body); + + if (hunks.length > 0) { + // Search/replace mode — apply hunks to original + const original = fileContents[block.path]; + if (!original) { + console.log(` [fix] Warning: original file not found for ${block.path}, skipping`); + continue; + } + const applied = applyHunks(original, hunks); + if (applied === null) { + console.log(` [fix] Warning: SEARCH block mismatch for ${block.path}`); + continue; + } + correctedFiles[block.path] = applied; + } else { + // Fallback: block body is the complete corrected file content + correctedFiles[block.path] = block.body; + } + } + + const correctedCount = Object.keys(correctedFiles).length; + + if (correctedCount === 0) { + return { patch: '', tokens: response.tokens, error: 'All SEARCH/REPLACE hunks failed to match' }; + } + + console.log(` [fix] LLM returned ${correctedCount} corrected file(s)`); + + // 6. 
Generate unified diff (using temp files, no repo clone needed) + const patch = generateDiffFromOriginals(fileContents, correctedFiles); + + if (!patch) { + return { patch: '', tokens: response.tokens, error: 'Generated diff is empty (no changes detected)' }; + } + + return { patch, tokens: response.tokens }; + } catch (err: any) { + return { patch: '', tokens: 0, error: err.message || String(err) }; + } +} + +// --------------------------------------------------------------------------- +// Docker-based evaluation (using official SWE-bench pre-built images) +// --------------------------------------------------------------------------- + +/** stdio config that streams stdout to terminal for progress visibility */ +const LIVE_OPTS = { + stdio: ['pipe' as const, 'inherit' as const, 'pipe' as const], + timeout: 1_200_000, // 20 minutes — some test suites (e.g. sympy) are slow +}; + +/** + * Derive the official SWE-bench Docker image name from an instance_id. + * Convention: `swebench/sweb.eval.x86_64.:latest` + * where `__` in instance_id is replaced with `_1776_`. + */ +export function getSWEBenchImageName(instanceId: string): string { + const slug = instanceId.toLowerCase().replace(/__/g, '_1776_'); + return `swebench/sweb.eval.x86_64.${slug}:latest`; +} + +/** + * Pull a Docker image, using proxy if configured. + * Returns true if the image is available (already existed or pulled successfully). 
+ */
+function pullImage(imageName: string, proxyUrl?: string): boolean {
+  // Check if image already exists locally
+  const checkResult = spawnSync(
+    'docker', ['image', 'inspect', imageName],
+    { stdio: ['pipe', 'pipe', 'pipe'], timeout: 10_000 },
+  );
+  if (checkResult.status === 0) {
+    console.log(` [docker] Image ${imageName} already available locally`);
+    return true;
+  }
+
+  // Pull with proxy if needed
+  const env: Record<string, string> = { ...process.env as Record<string, string> };
+  if (proxyUrl) {
+    env.HTTPS_PROXY = proxyUrl;
+    env.HTTP_PROXY = proxyUrl;
+    env.https_proxy = proxyUrl;
+    env.http_proxy = proxyUrl;
+  }
+
+  console.log(` [docker] Pulling ${imageName} ...`);
+  const pullResult = spawnSync(
+    'docker', ['pull', imageName],
+    { env, stdio: ['pipe', 'inherit', 'pipe'], timeout: 1_200_000 },
+  );
+
+  if (pullResult.status !== 0) {
+    const stderr = (pullResult.stderr || '').toString().trim();
+    console.log(` [docker] Failed to pull image: ${stderr}`);
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Evaluate a patch inside an official SWE-bench Docker container.
+ *
+ * The SWE-bench images come with:
+ * - Repository pre-cloned at /testbed (at the correct base commit)
+ * - Conda environment "testbed" with all dependencies installed
+ * - Correct Python version for the project
+ *
+ * Steps:
+ * 1. Pull the SWE-bench image (if not cached locally)
+ * 2. Mount patch files into the container
+ * 3. Apply the fix patch with git apply (with fallbacks)
+ * 4. Apply the test patch (if provided)
+ * 5. 
Run the test command inside the conda environment + */ +export function evaluateWithDocker( + instance: FullSWEInstance, + patch: string, + workDir: string, + proxyUrl?: string, +): DockerEvalResult { + const imageName = getSWEBenchImageName(instance.instance_id); + + // Pull image if needed + if (!pullImage(imageName, proxyUrl)) { + return { + passed: false, + output: '', + error: `Failed to pull SWE-bench image: ${imageName}`, + }; + } + + fs.mkdirSync(workDir, { recursive: true }); + + // Write patches to workDir (mounted into container) + fs.writeFileSync(path.join(workDir, 'fix.patch'), patch, 'utf-8'); + if (instance.test_patch) { + fs.writeFileSync(path.join(workDir, 'test.patch'), instance.test_patch, 'utf-8'); + } + + // Build evaluation script + // The SWE-bench container has: /testbed (repo), conda env "testbed" + const script = [ + '#!/bin/bash', + 'set -uo pipefail', + '', + 'source /opt/miniconda3/bin/activate', + 'conda activate testbed', + 'cd /testbed', + '', + 'echo " [docker] Applying fix patch ..."', + 'if git apply --verbose /patches/fix.patch; then', + ' echo " [docker] Patch applied with git apply"', + 'elif git apply --verbose --reject /patches/fix.patch; then', + ' echo " [docker] Patch applied with --reject"', + 'elif patch --batch --fuzz=5 -p1 -i /patches/fix.patch; then', + ' echo " [docker] Patch applied with patch command"', + 'else', + ' echo " [docker] ERROR: Patch application failed"', + ' exit 1', + 'fi', + '', + 'if [ -f /patches/test.patch ] && [ -s /patches/test.patch ]; then', + ' echo " [docker] Applying test patch ..."', + ' git apply -v /patches/test.patch || true', + 'fi', + '', + `echo " [docker] Running tests: ${instance.test_command}"`, + `${instance.test_command}`, + 'echo " [docker] Tests completed."', + ].join('\n'); + + fs.writeFileSync(path.join(workDir, 'evaluate.sh'), script, 'utf-8'); + + const containerName = sanitizeContainerName(`swe-${instance.instance_id}-${Date.now()}`); + + try { + console.log(` [docker] 
Starting container (${imageName}) ...`); + const result = spawnSync( + 'docker', + [ + 'run', '--rm', + '--name', containerName, + '-v', `${workDir}:/patches:ro`, + imageName, + 'bash', '/patches/evaluate.sh', + ], + LIVE_OPTS, + ); + + const stderr = (result.stderr || '').toString().trim(); + + if (result.status === 0) { + return { passed: true, output: '' }; + } + + return { + passed: false, + output: '', + error: stderr || `exit code ${result.status}`, + }; + } catch (err: any) { + return { + passed: false, + output: '', + error: err.message || String(err), + }; + } +} + +// --------------------------------------------------------------------------- +// Local evaluation fallback (no Docker) +// --------------------------------------------------------------------------- + +const EXEC_OPTS: ExecSyncOptionsWithStringEncoding = { + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'], + timeout: 600_000, +}; + +export function evaluateLocally( + instance: FullSWEInstance, + patch: string, + workDir: string, +): DockerEvalResult { + const repoDir = path.join(workDir, 'repo'); + fs.mkdirSync(workDir, { recursive: true }); + + try { + console.log(` [local] Cloning ${instance.repo} ...`); + spawnSync( + 'git', ['clone', '--quiet', instance.repo, repoDir], + { ...LIVE_OPTS, timeout: 120_000 }, + ); + + console.log(` [local] Checking out ${instance.base_commit.slice(0, 10)} ...`); + execSync(`git checkout "${instance.base_commit}" --quiet`, { + ...EXEC_OPTS, + cwd: repoDir, + }); + + console.log(` [local] Applying fix patch ...`); + const patchPath = path.join(workDir, 'fix.patch'); + fs.writeFileSync(patchPath, patch, 'utf-8'); + execSync(`git apply "${patchPath}"`, { ...EXEC_OPTS, cwd: repoDir }); + + if (instance.test_patch) { + console.log(` [local] Applying test patch ...`); + const testPatchPath = path.join(workDir, 'test.patch'); + fs.writeFileSync(testPatchPath, instance.test_patch, 'utf-8'); + try { + execSync(`git apply "${testPatchPath}"`, { ...EXEC_OPTS, cwd: 
repoDir }); + } catch { + // Test patch may not apply cleanly + } + } + + console.log(` [local] Running tests: ${instance.test_command}`); + const result = spawnSync('bash', ['-c', instance.test_command], { + ...LIVE_OPTS, + cwd: repoDir, + timeout: 300_000, + }); + + const stderr = (result.stderr || '').toString().trim(); + + if (result.status === 0) { + console.log(` [local] Tests completed.`); + return { passed: true, output: '' }; + } + + return { + passed: false, + output: '', + error: stderr || `exit code ${result.status}`, + }; + } catch (err: any) { + const stderr = (err.stderr || '').toString().trim(); + return { + passed: false, + output: '', + error: stderr || err.message || String(err), + }; + } +} + +// --------------------------------------------------------------------------- +// Cleanup +// --------------------------------------------------------------------------- + +export function cleanupWorkDir(workDir: string): void { + try { + fs.rmSync(workDir, { recursive: true, force: true }); + } catch { + // ignore + } +} diff --git a/tests/benchmark/swe/index.ts b/tests/benchmark/swe/index.ts new file mode 100644 index 0000000..7d17c24 --- /dev/null +++ b/tests/benchmark/swe/index.ts @@ -0,0 +1,139 @@ +import path from 'path'; +import type { BenchmarkConfig, BenchmarkModuleResult, BenchmarkProvider, SWEProviderResult, SWEResult } from '../types'; +import type { ModelProvider } from '../../../src/infra/providers/types'; +import { AnthropicProvider } from '../../../src/infra/providers/anthropic'; +import { OpenAIProvider } from '../../../src/infra/providers/openai'; +import { GeminiProvider } from '../../../src/infra/providers/gemini'; +import { loadVerifiedInstances } from './dataset'; +import { + isDockerAvailable, + generateFix, + evaluateWithDocker, + cleanupWorkDir, + type FullSWEInstance, +} from './docker-evaluator'; + +export const name = 'swe'; + +function createProvider(bp: BenchmarkProvider): ModelProvider { + switch (bp.id) { + case 
'anthropic':
+      return new AnthropicProvider(bp.apiKey, bp.model, bp.baseUrl, bp.proxyUrl);
+    case 'openai':
+      return new OpenAIProvider(bp.apiKey, bp.model, bp.baseUrl, bp.proxyUrl);
+    case 'gemini':
+      return new GeminiProvider(bp.apiKey, bp.model, bp.baseUrl, bp.proxyUrl);
+    default:
+      return new OpenAIProvider(bp.apiKey, bp.model, bp.baseUrl, bp.proxyUrl);
+  }
+}
+
+async function runProviderOnVerifiedInstances(
+  bp: BenchmarkProvider,
+  instances: FullSWEInstance[],
+  dockerProxy?: string,
+): Promise<SWEProviderResult> {
+  const provider = createProvider(bp);
+  const results: SWEResult[] = [];
+
+  for (const inst of instances) {
+    const startMs = Date.now();
+    const workDir = path.join(process.cwd(), 'tests', '.tmp', `swe-${bp.id}-${inst.instance_id}-${Date.now()}`);
+
+    try {
+      console.log(` [${bp.id}] ${inst.instance_id}: generating fix ...`);
+      const harness = await generateFix(provider, inst, dockerProxy);
+
+      if (harness.error || !harness.patch) {
+        const durationMs = Date.now() - startMs;
+        const errMsg = harness.error || 'No fix generated';
+        console.log(` [${bp.id}] ${inst.instance_id}: FAIL (${errMsg})`);
+        results.push({
+          instance_id: inst.instance_id,
+          resolved: false,
+          tokens_used: harness.tokens,
+          duration_ms: durationMs,
+          error: errMsg,
+        });
+        continue;
+      }
+
+      console.log(` [${bp.id}] ${inst.instance_id}: fix generated (${harness.tokens} tokens), evaluating ...`);
+      const evalResult = evaluateWithDocker(inst, harness.patch, workDir, dockerProxy);
+
+      const durationMs = Date.now() - startMs;
+      const status = evalResult.passed ? 'PASS' : 'FAIL';
+      const detail = evalResult.passed ? '' : ` (${(evalResult.error || '').slice(0, 120)})`;
+      console.log(` [${bp.id}] ${inst.instance_id}: ${status} (${harness.tokens} tokens, ${durationMs}ms)${detail}`);
+
+      results.push({
+        instance_id: inst.instance_id,
+        resolved: evalResult.passed,
+        tokens_used: harness.tokens,
+        duration_ms: durationMs,
+        error: evalResult.passed ? 
undefined : evalResult.error,
+      });
+    } catch (err: any) {
+      const durationMs = Date.now() - startMs;
+      console.log(` [${bp.id}] ${inst.instance_id}: FAIL (${err.message})`);
+      results.push({
+        instance_id: inst.instance_id,
+        resolved: false,
+        tokens_used: 0,
+        duration_ms: durationMs,
+        error: err.message || String(err),
+      });
+    } finally {
+      cleanupWorkDir(workDir);
+    }
+  }
+
+  const resolved = results.filter(r => r.resolved).length;
+  const total = results.length;
+  const avgTokens = total > 0 ? Math.round(results.reduce((s, r) => s + r.tokens_used, 0) / total) : 0;
+  const avgDuration = total > 0 ? Math.round(results.reduce((s, r) => s + r.duration_ms, 0) / total) : 0;
+
+  return {
+    provider: bp,
+    summary: {
+      dataset: 'swe-bench-verified',
+      total,
+      resolved,
+      rate: total > 0 ? resolved / total : 0,
+      avg_tokens: avgTokens,
+      avg_duration_ms: avgDuration,
+    },
+    results,
+  };
+}
+
+export async function run(config: BenchmarkConfig): Promise<BenchmarkModuleResult> {
+  const instances = loadVerifiedInstances();
+  if (instances.length === 0) {
+    console.log(' SWE: no verified instances found');
+    return {};
+  }
+
+  if (config.providers.length === 0) {
+    console.log(' SWE: no providers configured, skipping');
+    return {};
+  }
+
+  const dockerAvailable = isDockerAvailable();
+  if (!dockerAvailable) {
+    console.log(' SWE: Docker is required for SWE-bench-Verified and is not available. 
Skipping.'); + return {}; + } + + console.log(`\n SWE verified mode: ${instances.length} instances`); + console.log(' Docker: available (official SWE image evaluation)'); + + const allResults: SWEProviderResult[] = []; + for (const bp of config.providers) { + console.log(`\n Running provider: ${bp.id} / ${bp.model}`); + const providerResult = await runProviderOnVerifiedInstances(bp, instances, config.dockerProxy); + allResults.push(providerResult); + } + + return { swe: allResults }; +} diff --git a/tests/benchmark/tau/index.ts b/tests/benchmark/tau/index.ts new file mode 100644 index 0000000..3d98615 --- /dev/null +++ b/tests/benchmark/tau/index.ts @@ -0,0 +1,502 @@ +import fs from 'fs'; +import path from 'path'; +import { spawn, spawnSync } from 'child_process'; +import type { + BenchmarkConfig, + BenchmarkModuleResult, + BenchmarkProvider, + TAUProviderResult, + TAUTaskResult, +} from '../types'; + +export const name = 'tau'; + +const TAU2_SOURCE = 'git+https://github.com/sierra-research/tau2-bench@v0.2.0'; +const TAU2_REPO = 'https://github.com/sierra-research/tau2-bench'; +const TAU2_REF = 'v0.2.0'; +const DEFAULT_TAU2_DATA_DIR = path.resolve(process.cwd(), 'tests/tmp/tau2-data'); +const PASS_REWARD = 1; +const PASS_TOL = 1e-6; + +interface RunnerSpec { + cmd: string; + baseArgs: string[]; + label: string; +} + +interface Tau2Task { + id?: string; +} + +interface Tau2RewardInfo { + reward?: number; +} + +interface Tau2Simulation { + task_id?: string; + trial?: number; + reward_info?: Tau2RewardInfo; + [key: string]: unknown; +} + +interface Tau2RunOutput { + info?: { num_trials?: number }; + tasks?: Tau2Task[]; + simulations?: Tau2Simulation[]; +} + +function hasCommand(cmd: string, versionArg = '--version'): boolean { + const r = spawnSync(cmd, [versionArg], { stdio: 'ignore' }); + return r.status === 0; +} + +function getDomains(tauDomain: string): string[] { + if (tauDomain === 'all') return ['airline', 'retail', 'telecom']; + return [tauDomain]; +} + 
+function ensureDataDir(dataDir: string): void { + fs.mkdirSync(path.join(dataDir, 'simulations'), { recursive: true }); +} + +function requiredTaskFiles(dataDir: string, domains: string[]): string[] { + return domains.map(domain => path.join(dataDir, 'tau2', 'domains', domain, 'tasks.json')); +} + +function ensureOfficialDataFiles(dataDir: string, domains: string[]): void { + const missingBefore = requiredTaskFiles(dataDir, domains).filter(p => !fs.existsSync(p)); + if (missingBefore.length === 0) return; + + if (!hasCommand('git')) { + throw new Error( + `TAU2 data files missing and git is not available. Missing: ${missingBefore.join(', ')}`, + ); + } + + const sourceDir = path.join(dataDir, '.tau2-source'); + const sourceDataDir = path.join(sourceDir, 'data', 'tau2'); + + console.log(' TAU2 data missing, bootstrapping official data from repository...'); + if (!fs.existsSync(sourceDataDir)) { + if (fs.existsSync(sourceDir)) { + fs.rmSync(sourceDir, { recursive: true, force: true }); + } + const clone = spawnSync( + 'git', + ['clone', '--depth', '1', '--branch', TAU2_REF, TAU2_REPO, sourceDir], + { stdio: 'inherit' }, + ); + if (clone.status !== 0) { + throw new Error(`Failed to clone TAU2 data source (exit code ${clone.status ?? 'unknown'})`); + } + } + + if (!fs.existsSync(sourceDataDir)) { + throw new Error(`TAU2 data source missing expected directory: ${sourceDataDir}`); + } + + fs.mkdirSync(path.join(dataDir, 'tau2'), { recursive: true }); + fs.cpSync(sourceDataDir, path.join(dataDir, 'tau2'), { recursive: true, force: true }); + + const missingAfter = requiredTaskFiles(dataDir, domains).filter(p => !fs.existsSync(p)); + if (missingAfter.length > 0) { + throw new Error(`TAU2 data bootstrap incomplete. 
Missing: ${missingAfter.join(', ')}`);
+  }
+}
+
+function shouldKeepTauLogLine(line: string): boolean {
+  const s = line.trim();
+  if (!s) return false;
+  if (s.includes('Provider List: https://docs.litellm.ai/docs/providers')) return false;
+  if (s.includes('tau2.utils.llm_utils:get_response_cost')) return false;
+  if (s.includes("This model isn't mapped yet.")) return false;
+  return true;
+}
+
+function createLineEmitter(isErr: boolean): (chunk: Buffer | string, flush?: boolean) => void {
+  let buffer = '';
+  return (chunk: Buffer | string, flush = false) => {
+    if (chunk) {
+      buffer += chunk.toString().replace(/\r/g, '\n');
+    }
+    const parts = buffer.split('\n');
+    if (!flush) {
+      buffer = parts.pop() ?? '';
+    } else {
+      buffer = '';
+    }
+    for (const line of parts) {
+      if (!shouldKeepTauLogLine(line)) continue;
+      if (isErr) console.error(line);
+      else console.log(line);
+    }
+  };
+}
+
+async function runTau2WithFilteredLogs(
+  runner: RunnerSpec,
+  args: string[],
+  env: NodeJS.ProcessEnv,
+): Promise<number> {
+  const child = spawn(runner.cmd, args, {
+    cwd: process.cwd(),
+    env,
+    stdio: ['ignore', 'pipe', 'pipe'],
+  });
+
+  const out = createLineEmitter(false);
+  const err = createLineEmitter(true);
+
+  child.stdout?.on('data', (chunk: Buffer | string) => out(chunk, false));
+  child.stderr?.on('data', (chunk: Buffer | string) => err(chunk, false));
+
+  return await new Promise((resolve, reject) => {
+    child.on('error', reject);
+    child.on('close', code => {
+      out('', true);
+      err('', true);
+      resolve(code ?? 
1); + }); + }); +} + +function sanitizeLabel(v: string): string { + return v.trim().replace(/[^a-zA-Z0-9._-]+/g, '-').slice(0, 96); +} + +function toTau2Model(bp: BenchmarkProvider): string { + if (bp.model.includes('/')) return bp.model; + if (bp.id === 'anthropic') return `anthropic/${bp.model}`; + if (bp.id === 'gemini') return `gemini/${bp.model}`; + return `openai/${bp.model}`; +} + +function applyProviderEnv(env: NodeJS.ProcessEnv, bp: BenchmarkProvider): void { + switch (bp.id) { + case 'anthropic': + env.ANTHROPIC_API_KEY = bp.apiKey; + if (bp.baseUrl) env.ANTHROPIC_BASE_URL = bp.baseUrl; + break; + case 'gemini': + env.GEMINI_API_KEY = bp.apiKey; + if (bp.baseUrl) env.GEMINI_BASE_URL = bp.baseUrl; + break; + default: + env.OPENAI_API_KEY = bp.apiKey; + if (bp.baseUrl) { + env.OPENAI_BASE_URL = bp.baseUrl; + env.OPENAI_API_BASE = bp.baseUrl; + } + break; + } +} + +function buildRunEnv(config: BenchmarkConfig, bp: BenchmarkProvider, userSimBp: BenchmarkProvider, dataDir: string): NodeJS.ProcessEnv { + const env: NodeJS.ProcessEnv = { + ...process.env, + TAU2_DATA_DIR: dataDir, + UV_CACHE_DIR: process.env.UV_CACHE_DIR || '/tmp/uv-cache', + UV_TOOL_DIR: process.env.UV_TOOL_DIR || '/tmp/uv-tools', + XDG_DATA_HOME: process.env.XDG_DATA_HOME || '/tmp/xdg-data', + }; + if (config.dockerProxy) { + env.HTTP_PROXY = config.dockerProxy; + env.HTTPS_PROXY = config.dockerProxy; + env.http_proxy = config.dockerProxy; + env.https_proxy = config.dockerProxy; + } + applyProviderEnv(env, bp); + applyProviderEnv(env, userSimBp); + return env; +} + +function resolveRunner(): RunnerSpec { + if (hasCommand('tau2')) { + return { cmd: 'tau2', baseArgs: [], label: 'tau2' }; + } + if (hasCommand('uvx')) { + return { + cmd: 'uvx', + baseArgs: ['--python', '3.12', '--from', TAU2_SOURCE, 'tau2'], + label: `uvx tau2 (${TAU2_SOURCE})`, + }; + } + throw new Error('TAU official runner not found. 
Install `tau2` or `uvx`.'); +} + +function readJson(filePath: string): Tau2RunOutput { + return JSON.parse(fs.readFileSync(filePath, 'utf-8')) as Tau2RunOutput; +} + +function isPass(sim: Tau2Simulation): boolean { + const reward = sim.reward_info?.reward; + return typeof reward === 'number' && Math.abs(reward - PASS_REWARD) <= PASS_TOL; +} + +function combinations(n: number, k: number): number { + if (k < 0 || k > n) return 0; + if (k === 0 || k === n) return 1; + let kk = Math.min(k, n - k); + let out = 1; + for (let i = 1; i <= kk; i++) { + out = (out * (n - kk + i)) / i; + } + return out; +} + +function computePassHatK(taskOutcomes: boolean[][]): number[] { + const eligible = taskOutcomes.filter(arr => arr.length > 0); + if (eligible.length === 0) return []; + + const maxK = Math.min(...eligible.map(arr => arr.length)); + const passAtK: number[] = []; + + for (let k = 1; k <= maxK; k++) { + const vals: number[] = []; + for (const arr of eligible) { + const n = arr.length; + if (n < k) continue; + const c = arr.filter(Boolean).length; + const denom = combinations(n, k); + vals.push(denom === 0 ? 0 : combinations(c, k) / denom); + } + passAtK.push(vals.length > 0 ? vals.reduce((s, v) => s + v, 0) / vals.length : 0); + } + + return passAtK; +} + +function asFiniteNumber(v: unknown): number | undefined { + return typeof v === 'number' && Number.isFinite(v) ? 
v : undefined;
+}
+
+function getPathNumber(obj: unknown, keys: string[]): number | undefined {
+  let cur: unknown = obj;
+  for (const k of keys) {
+    if (!cur || typeof cur !== 'object' || Array.isArray(cur)) return undefined;
+    cur = (cur as Record<string, unknown>)[k];
+  }
+  return asFiniteNumber(cur);
+}
+
+function findNumberByKeys(obj: unknown, candidates: string[]): number | undefined {
+  if (!obj || typeof obj !== 'object') return undefined;
+  const queue: unknown[] = [obj];
+  while (queue.length > 0) {
+    const cur = queue.shift();
+    if (!cur || typeof cur !== 'object') continue;
+    if (Array.isArray(cur)) {
+      for (const v of cur) queue.push(v);
+      continue;
+    }
+    for (const [k, v] of Object.entries(cur as Record<string, unknown>)) {
+      if (candidates.includes(k)) {
+        const n = asFiniteNumber(v);
+        if (n !== undefined) return n;
+      }
+      if (v && typeof v === 'object') queue.push(v);
+    }
+  }
+  return undefined;
+}
+
+interface TokenUsage {
+  input?: number;
+  output?: number;
+  cache?: number;
+  total?: number;
+}
+
+function extractTokenUsage(obj: unknown): TokenUsage {
+  const input = getPathNumber(obj, ['agent_result', 'n_input_tokens'])
+    ?? getPathNumber(obj, ['agent_result', 'usage', 'input_tokens'])
+    ?? findNumberByKeys(obj, ['n_input_tokens', 'input_tokens', 'prompt_tokens']);
+  const output = getPathNumber(obj, ['agent_result', 'n_output_tokens'])
+    ?? getPathNumber(obj, ['agent_result', 'usage', 'output_tokens'])
+    ?? findNumberByKeys(obj, ['n_output_tokens', 'output_tokens', 'completion_tokens']);
+  const cache = getPathNumber(obj, ['agent_result', 'n_cache_tokens'])
+    ?? findNumberByKeys(obj, ['n_cache_tokens', 'cache_tokens']);
+  const total = getPathNumber(obj, ['agent_result', 'n_total_tokens'])
+    ?? getPathNumber(obj, ['agent_result', 'usage', 'total_tokens'])
+    ?? 
findNumberByKeys(obj, ['n_total_tokens', 'total_tokens']);
+
+  if (total !== undefined) return { input, output, cache, total };
+  if (input !== undefined || output !== undefined || cache !== undefined) {
+    return {
+      input,
+      output,
+      cache,
+      total: (input ?? 0) + (output ?? 0) + (cache ?? 0),
+    };
+  }
+  return {};
+}
+
+function parseTau2Output(
+  bp: BenchmarkProvider,
+  domain: string,
+  filePath: string,
+  expectedTrials: number,
+): TAUProviderResult {
+  const parsed = readJson(filePath);
+  const taskIds = new Set<string>();
+  for (const t of parsed.tasks ?? []) {
+    if (typeof t.id === 'string' && t.id.length > 0) taskIds.add(t.id);
+  }
+  for (const sim of parsed.simulations ?? []) {
+    if (typeof sim.task_id === 'string' && sim.task_id.length > 0) taskIds.add(sim.task_id);
+  }
+
+  const trialMatrix = new Map<string, (boolean | undefined)[]>();
+  const tokenMatrix = new Map<string, Array<number | undefined>>();
+  for (const id of taskIds) trialMatrix.set(id, []);
+  for (const id of taskIds) tokenMatrix.set(id, []);
+
+  for (const sim of parsed.simulations ?? []) {
+    const taskId = sim.task_id;
+    if (!taskId || !trialMatrix.has(taskId)) continue;
+    const arr = trialMatrix.get(taskId)!;
+    const tokenArr = tokenMatrix.get(taskId)!;
+    const usage = extractTokenUsage(sim);
+    const tokenVal = usage.total;
+    if (typeof sim.trial === 'number' && sim.trial >= 0) {
+      arr[sim.trial] = isPass(sim);
+      tokenArr[sim.trial] = tokenVal;
+    } else {
+      arr.push(isPass(sim));
+      tokenArr.push(tokenVal);
+    }
+  }
+
+  const results: TAUTaskResult[] = [];
+  const outcomes: boolean[][] = [];
+  let tokenSum = 0;
+  let tokenObservedTrials = 0;
+  for (const taskId of taskIds) {
+    const normalized = (trialMatrix.get(taskId) ?? []).filter((v): v is boolean => typeof v === 'boolean');
+    const tokens = (tokenMatrix.get(taskId) ?? []).filter((v): v is number => typeof v === 'number' && Number.isFinite(v));
+    const taskAvgTokens = tokens.length > 0
+      ? 
Math.round(tokens.reduce((s, t) => s + t, 0) / tokens.length)
+      : 0;
+    tokenSum += tokens.reduce((s, t) => s + t, 0);
+    tokenObservedTrials += tokens.length;
+    outcomes.push(normalized);
+    results.push({
+      task_id: taskId,
+      trial_pass_rates: normalized,
+      tokens_used: taskAvgTokens,
+      error: normalized.length === 0 ? 'No trial results in official TAU2 output' : undefined,
+    });
+  }
+
+  const passAtK = computePassHatK(outcomes);
+  const avgTokens = tokenObservedTrials > 0 ? Math.round(tokenSum / tokenObservedTrials) : 0;
+  return {
+    provider: bp,
+    summary: {
+      domain,
+      total_tasks: taskIds.size,
+      num_trials: parsed.info?.num_trials ?? expectedTrials,
+      pass_at_k: passAtK,
+      avg_tokens: avgTokens,
+      token_observed_trials: tokenObservedTrials,
+    },
+    results,
+  };
+}
+
+async function runProviderOnDomainOfficial(
+  config: BenchmarkConfig,
+  runner: RunnerSpec,
+  dataDir: string,
+  domain: string,
+  bp: BenchmarkProvider,
+  userSimBp: BenchmarkProvider,
+): Promise<TAUProviderResult> {
+  const agentLlm = toTau2Model(bp);
+  const userLlm = toTau2Model(userSimBp);
+  const saveName = sanitizeLabel(
+    `tau2-${domain}-${bp.id}-${bp.model}-${Date.now()}`,
+  );
+  const outputPath = path.join(dataDir, 'simulations', `${saveName}.json`);
+
+  const runArgs = [
+    ...runner.baseArgs,
+    'run',
+    '--domain',
+    domain,
+    '--agent-llm',
+    agentLlm,
+    '--user-llm',
+    userLlm,
+    '--num-trials',
+    String(config.numTrials),
+    '--save-to',
+    saveName,
+  ];
+
+  console.log(` [${bp.id}] ${domain}: tau2 run (${runner.label})`);
+  const runStatus = await runTau2WithFilteredLogs(
+    runner,
+    runArgs,
+    buildRunEnv(config, bp, userSimBp, dataDir),
+  );
+
+  if (runStatus !== 0) {
+    throw new Error(`tau2 run failed with exit code ${runStatus}`);
+  }
+  if (!fs.existsSync(outputPath)) {
+    throw new Error(`tau2 output not found: ${outputPath}`);
+  }
+
+  return parseTau2Output(bp, domain, outputPath, config.numTrials);
+}
+
+export async function run(config: BenchmarkConfig): Promise<BenchmarkModuleResult> {
+  const domains = 
getDomains(config.tauDomain); + if (domains.length === 0) { + console.log(` TAU: no domains found for "${config.tauDomain}"`); + return {}; + } + if (config.providers.length === 0) { + console.log(' TAU: no providers configured, skipping'); + return {}; + } + + const runner = resolveRunner(); + const dataDir = DEFAULT_TAU2_DATA_DIR; + ensureDataDir(dataDir); + ensureOfficialDataFiles(dataDir, domains); + console.log(`\n TAU official source: tau2 (${TAU2_SOURCE})`); + console.log(` TAU data dir: ${dataDir}`); + + const allResults: TAUProviderResult[] = []; + for (const domain of domains) { + console.log(`\n TAU domain: ${domain} (${config.numTrials} trials)`); + for (const bp of config.providers) { + const userSimBp = config.userSimProvider ?? bp; + console.log(`\n Running provider: ${bp.id} / ${bp.model}`); + console.log(` User simulator: ${userSimBp.id} / ${userSimBp.model}`); + try { + const r = await runProviderOnDomainOfficial(config, runner, dataDir, domain, bp, userSimBp); + allResults.push(r); + } catch (err: any) { + console.log(` [${bp.id}] ${domain}: FAIL (${err?.message || String(err)})`); + allResults.push({ + provider: bp, + summary: { + domain, + total_tasks: 0, + num_trials: config.numTrials, + pass_at_k: [], + avg_tokens: 0, + token_observed_trials: 0, + }, + results: [], + }); + } + } + } + + return { tau: allResults }; +} diff --git a/tests/benchmark/types.ts b/tests/benchmark/types.ts new file mode 100644 index 0000000..8ae5975 --- /dev/null +++ b/tests/benchmark/types.ts @@ -0,0 +1,125 @@ +import type { ProviderId } from '../helpers/provider-env'; + +export interface BenchmarkProvider { + id: ProviderId; + model: string; + apiKey: string; + baseUrl?: string; + proxyUrl?: string; +} + +export interface BenchmarkCliArgs { + benchmark?: 'swe' | 'tau' | 'tb2' | 'both' | 'all'; + provider?: string; + tauDomain?: 'airline' | 'retail' | 'telecom' | 'all' | string; + numTrials?: number; + tb2Model?: string; + tb2Agent?: string; + tb2Dataset?: string; + 
tb2Runner?: 'auto' | 'harbor' | 'uvx' | 'docker'; + tb2Python?: string; + tb2JobsDir?: string; + tb2EnvFile?: string; + tb2DockerImage?: string; + output?: 'table' | 'json'; + outputFile?: string; + compare?: string; +} + +export interface BenchmarkConfig { + benchmark: 'swe' | 'tau' | 'tb2' | 'both' | 'all'; + providers: BenchmarkProvider[]; + userSimProvider?: BenchmarkProvider; + timeoutMs: number; + numTrials: number; + tauDomain: string; + output: 'table' | 'json'; + outputFile: string; + tb2Model?: string; + tb2Agent: string; + tb2Dataset: string; + tb2Runner: 'auto' | 'harbor' | 'uvx' | 'docker'; + tb2Python: string; + tb2JobsDir: string; + tb2EnvFile?: string; + tb2DockerImage: string; + sdkVersion: string; + dockerProxy?: string; +} + +export interface SWEResult { + instance_id: string; + resolved: boolean; + tokens_used: number; + duration_ms: number; + error?: string; +} + +export interface SWESummary { + dataset: string; + total: number; + resolved: number; + rate: number; + avg_tokens: number; + avg_duration_ms: number; +} + +export interface SWEProviderResult { + provider: BenchmarkProvider; + summary: SWESummary; + results: SWEResult[]; +} + +export interface TAUTaskResult { + task_id: string; + trial_pass_rates: boolean[]; + tokens_used: number; + error?: string; +} + +export interface TAUSummary { + domain: string; + total_tasks: number; + num_trials: number; + pass_at_k: number[]; + avg_tokens: number; + token_observed_trials?: number; +} + +export interface TAUProviderResult { + provider: BenchmarkProvider; + summary: TAUSummary; + results: TAUTaskResult[]; +} + +export interface TB2Summary { + generated_at: string; + dataset: string; + agent: string; + model?: string; + jobs_dir: string; + job_path: string; + passed: number; + total: number; + rate: number; + unknown: number; + avg_input_tokens?: number; + avg_output_tokens?: number; + avg_cache_tokens?: number; + avg_total_tokens?: number; + token_observed_trials?: number; +} + +export 
interface BenchmarkReport { + timestamp: string; + sdk_version: string; + swe?: SWEProviderResult[]; + tau?: TAUProviderResult[]; + tb2?: TB2Summary; +} + +export interface BenchmarkModuleResult { + swe?: SWEProviderResult[]; + tau?: TAUProviderResult[]; + tb2?: TB2Summary; +} diff --git a/tests/unit/providers/openai.test.ts b/tests/unit/providers/openai.test.ts index 0efd3d2..14e94bf 100644 --- a/tests/unit/providers/openai.test.ts +++ b/tests/unit/providers/openai.test.ts @@ -10,6 +10,11 @@ runner const config = provider.toConfig(); expect.toEqual(config.baseUrl, 'https://api.openai.com/v1'); }) + .test('baseUrl 保留已有版本路径 /v4 (GLM coding endpoint)', async () => { + const provider = new OpenAIProvider('test-key', 'any-model', 'https://open.bigmodel.cn/api/coding/paas/v4'); + const config = provider.toConfig(); + expect.toEqual(config.baseUrl, 'https://open.bigmodel.cn/api/coding/paas/v4'); + }) .test('请求体包含 system 与工具调用结构', async () => { const provider = new OpenAIProvider('test-key', 'gpt-4o', 'https://api.openai.com'); const messages: Message[] = [