5 changes: 5 additions & 0 deletions benchmarks/locomo/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.bin/
.vaults/
data/*.json
results/locomo_debug_*
results/locomo_extractive_smoke_*
123 changes: 123 additions & 0 deletions benchmarks/locomo/README.md
@@ -0,0 +1,123 @@
# LoCoMo Benchmark Suite for ClawVault

This directory contains an end-to-end benchmark runner that evaluates ClawVault memory retrieval on the LoCoMo QA benchmark.

## What it does

`run.ts` performs the following pipeline:

1. Loads each LoCoMo conversation sample.
2. Ingests the conversation into a fresh ClawVault vault using standard `transcripts` primitives.
3. Builds BM25 + vector indexes (`qmd update` + `qmd embed`) through ClawVault APIs.
4. For each QA item:
- Retrieves context with both `search` and `vsearch`
- Fuses retrieval ranks with reciprocal-rank fusion (RRF)
- Answers using either:
- an OpenAI-compatible `/chat/completions` endpoint, or
- a deterministic extractive fallback mode
5. Scores predictions with category-aware QA metrics aligned with the official LoCoMo evaluator logic.
6. Writes:
- detailed JSON output
- markdown summary table including published baseline comparisons
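
The RRF step in the pipeline above can be sketched as follows. This is a minimal illustration rather than the exact `retrieval.ts` implementation; the document IDs are made up, and `k = 60` mirrors the `--rrfK` default:

```typescript
// Reciprocal-rank fusion: each ranked list contributes 1 / (k + rank)
// per document, so items ranked highly in multiple lists float to the top.
// k dampens the dominance of the very top ranks.
function rrfFuse(rankings: string[][], k = 60): string[] {
  const scores = new Map<string, number>();
  for (const ranking of rankings) {
    ranking.forEach((id, rank) => {
      // rank is 0-based, so rank + 1 is the 1-based position in this list.
      scores.set(id, (scores.get(id) ?? 0) + 1 / (k + rank + 1));
    });
  }
  return [...scores.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([id]) => id);
}

// Example: one BM25 ranking and one vector-search ranking.
const fused = rrfFuse([
  ['doc-a', 'doc-b', 'doc-c'],
  ['doc-b', 'doc-d', 'doc-a']
]);
console.log(fused); // → [ 'doc-b', 'doc-a', 'doc-d', 'doc-c' ]
```

Because fusion only uses ranks, the BM25 and vector scores never need to be on comparable scales.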

## Files

- `download.ts` — dataset downloader
- `run.ts` — benchmark runner
- `dataset.ts` — LoCoMo dataset parsing
- `retrieval.ts` — search + vsearch fusion
- `llm.ts` — OpenAI-compatible client with retry/rate-limit handling
- `scoring.ts` — LoCoMo QA metric implementation
- `report.ts` — JSON + markdown output formatting
- `types.ts` — shared benchmark types

## Prerequisites

- Node.js 18+
- `qmd` on `PATH` (required by ClawVault search/vsearch)
- dependencies installed (`npm install`)

If your `qmd` wrapper is present but non-functional (a known Bun global wrapper issue), `run.ts` automatically installs a local shim at `benchmarks/locomo/.bin/qmd` and prepends it to `PATH` for the benchmark process.

### Optional (for OpenAI-compatible runs)

- `LOCOMO_OPENAI_API_KEY` (or `OPENAI_API_KEY`)
- `LOCOMO_OPENAI_BASE_URL` (defaults to `https://api.openai.com/v1`)
- `LOCOMO_MODEL` (defaults to `gpt-4o-mini`)

## Usage

### 1) Download dataset

```bash
npx tsx benchmarks/locomo/download.ts
```

Default output path: `benchmarks/locomo/data/locomo10.json`

### 2) Run full benchmark (OpenAI-compatible)

```bash
LOCOMO_OPENAI_API_KEY=... npx tsx benchmarks/locomo/run.ts \
  --mode openai \
  --model gpt-4o-mini
```

### 3) Run full benchmark without external API (deterministic fallback)

```bash
npx tsx benchmarks/locomo/run.ts --mode extractive
```

### 4) Quick smoke run

```bash
npx tsx benchmarks/locomo/run.ts --mode extractive --maxQuestions 100
```

## Important flags

- `--dataset <path>` dataset path (default: `benchmarks/locomo/data/locomo10.json`)
- `--outputDir <path>` output directory (default: `benchmarks/locomo/results`)
- `--runName <name>` deterministic run name override
- `--maxQuestions <n>` limit evaluated questions
- `--retrievalLimit <n>` candidates per retrieval stream before fusion
- `--contextLimit <n>` fused contexts passed to answerer
- `--rrfK <n>` RRF constant (default: `60`)
- `--disableVsearch` disable vector retrieval (BM25-only runs)
- `--keepVaults` keep generated per-sample vault folders
- `--downloadIfMissing` auto-download dataset if missing (default: true)

OpenAI-compatible flags:

- `--apiBaseUrl <url>`
- `--apiKey <key>`
- `--model <model>`
- `--maxRetries <n>`
- `--retryBaseDelayMs <ms>`
- `--timeoutMs <ms>`
- `--seed <n>` (passed when provider supports seeded completion)

## Outputs

Each run writes:

- `benchmarks/locomo/results/<runName>.json` (full per-question data)
- `benchmarks/locomo/results/<runName>.md` (summary + comparison table)

JSON includes:

- run config snapshot
- aggregate score + per-category breakdown
- retrieval recall
- per-question prediction/score/evidence/context traces

## Baseline comparison notes

The markdown leaderboard includes:

- Letta filesystem (74.0)
- Mem0 claimed score(s)
- LoCoMo paper RAG baselines (Table 3; answer-prediction overall F1)

These are published numbers from different setups/judges/models; treat them as directional comparisons, not apples-to-apples certification.
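
For context on the F1 figures above: the paper's answer-prediction metric is token-level F1 between the predicted and gold answers. A minimal sketch of that computation (whitespace tokenization only; the official evaluator additionally normalizes punctuation and articles before matching, so this is not the exact `scoring.ts` logic):

```typescript
// Token-level F1 between a predicted and a gold answer string.
// Sketch only: lowercases and splits on whitespace.
function tokenF1(prediction: string, gold: string): number {
  const predTokens = prediction.toLowerCase().split(/\s+/).filter(Boolean);
  const goldTokens = gold.toLowerCase().split(/\s+/).filter(Boolean);
  if (predTokens.length === 0 || goldTokens.length === 0) {
    return predTokens.length === goldTokens.length ? 1 : 0;
  }
  // Count overlapping tokens with multiplicity.
  const goldCounts = new Map<string, number>();
  for (const t of goldTokens) goldCounts.set(t, (goldCounts.get(t) ?? 0) + 1);
  let overlap = 0;
  for (const t of predTokens) {
    const remaining = goldCounts.get(t) ?? 0;
    if (remaining > 0) {
      overlap += 1;
      goldCounts.set(t, remaining - 1);
    }
  }
  if (overlap === 0) return 0;
  const precision = overlap / predTokens.length;
  const recall = overlap / goldTokens.length;
  return (2 * precision * recall) / (precision + recall);
}

console.log(tokenF1('she moved to Paris', 'moved to paris').toFixed(3)); // → 0.857
```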
44 changes: 44 additions & 0 deletions benchmarks/locomo/constants.ts
@@ -0,0 +1,44 @@
export const LOCOMO_DATASET_URL =
  'https://raw.githubusercontent.com/snap-research/locomo/main/data/locomo10.json';

export const DEFAULT_DATASET_PATH = 'benchmarks/locomo/data/locomo10.json';
export const DEFAULT_OUTPUT_DIR = 'benchmarks/locomo/results';
export const DEFAULT_VAULTS_DIR = 'benchmarks/locomo/.vaults';

export interface PublishedBaseline {
  name: string;
  scorePct: number;
  source: string;
  notes?: string;
}

export const PUBLISHED_BASELINES: PublishedBaseline[] = [
  {
    name: 'Letta filesystem',
    scorePct: 74.0,
    source: 'https://www.letta.com/blog/benchmarking-ai-agent-memory'
  },
  {
    name: 'Mem0^g (claimed)',
    scorePct: 68.5,
    source: 'https://mem0.ai/blog/ai-agent-memory-benchmark/'
  },
  {
    name: 'LoCoMo RAG (Dialog, top-25, GPT-3.5)',
    scorePct: 41.0,
    source: 'LoCoMo paper Table 3',
    notes: 'Answer prediction F1 overall'
  },
  {
    name: 'LoCoMo RAG (Observation, top-5, GPT-3.5)',
    scorePct: 43.3,
    source: 'LoCoMo paper Table 3',
    notes: 'Answer prediction F1 overall'
  },
  {
    name: 'LoCoMo RAG (Summary, top-10, GPT-3.5)',
    scorePct: 32.0,
    source: 'LoCoMo paper Table 3',
    notes: 'Answer prediction F1 overall'
  }
];
70 changes: 70 additions & 0 deletions benchmarks/locomo/dataset.ts
@@ -0,0 +1,70 @@
import * as fs from 'node:fs/promises';
import type { LocomoConversation, LocomoSample, LocomoSession, LocomoTurn } from './types.ts';

const SESSION_KEY_RE = /^session_(\d+)$/;

function toTurn(raw: unknown): LocomoTurn {
  const obj = (raw ?? {}) as Record<string, unknown>;
  return {
    speaker: String(obj.speaker ?? ''),
    diaId: String(obj.dia_id ?? ''),
    text: String(obj.text ?? ''),
    imgUrl: Array.isArray(obj.img_url) ? obj.img_url.map(String) : undefined,
    blipCaption: typeof obj.blip_caption === 'string' ? obj.blip_caption : undefined,
    query: typeof obj.query === 'string' ? obj.query : undefined
  };
}

export function parseSessions(conversation: LocomoConversation): LocomoSession[] {
  const entries = Object.keys(conversation)
    .map((key) => {
      const match = key.match(SESSION_KEY_RE);
      if (!match) {
        return null;
      }

      const index = Number(match[1]);
      const sessionValue = conversation[key];
      if (!Array.isArray(sessionValue)) {
        return null;
      }

      const dateTimeKey = `session_${index}_date_time`;
      const dateTimeRaw = conversation[dateTimeKey];
      return {
        sessionId: `session_${index}`,
        sessionIndex: index,
        dateTime: typeof dateTimeRaw === 'string' ? dateTimeRaw : undefined,
        turns: sessionValue.map(toTurn)
      } satisfies LocomoSession;
    })
    .filter((entry): entry is LocomoSession => entry !== null)
    .sort((a, b) => a.sessionIndex - b.sessionIndex);

  return entries;
}

export async function loadLocomoDataset(datasetPath: string): Promise<LocomoSample[]> {
  const raw = await fs.readFile(datasetPath, 'utf-8');
  const parsed = JSON.parse(raw) as unknown;
  if (!Array.isArray(parsed)) {
    throw new Error(`Expected LoCoMo dataset to be a JSON array: ${datasetPath}`);
  }

  return parsed.map((item, idx) => {
    const sample = item as Record<string, unknown>;
    if (!sample.sample_id || !sample.qa || !sample.conversation) {
      throw new Error(`Malformed LoCoMo sample at index ${idx}`);
    }

    return {
      sample_id: String(sample.sample_id),
      qa: sample.qa as LocomoSample['qa'],
      conversation: sample.conversation as LocomoConversation
    };
  });
}

export function countDatasetQuestions(samples: LocomoSample[]): number {
  return samples.reduce((sum, sample) => sum + sample.qa.length, 0);
}
60 changes: 60 additions & 0 deletions benchmarks/locomo/download.ts
@@ -0,0 +1,60 @@
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
import { LOCOMO_DATASET_URL } from './constants.ts';

export interface DownloadOptions {
  datasetUrl?: string;
  outputPath: string;
  force?: boolean;
}

export async function downloadLocomoDataset(options: DownloadOptions): Promise<{ path: string; bytes: number }> {
  const outputPath = path.resolve(options.outputPath);
  const datasetUrl = options.datasetUrl ?? LOCOMO_DATASET_URL;

  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  if (!options.force) {
    try {
      const existing = await fs.stat(outputPath);
      if (existing.isFile() && existing.size > 0) {
        return { path: outputPath, bytes: existing.size };
      }
    } catch {
      // no-op: file does not exist
    }
  }

  const response = await fetch(datasetUrl);
  if (!response.ok) {
    throw new Error(`Failed to download LoCoMo dataset (${response.status}) from ${datasetUrl}`);
  }

  const arrayBuffer = await response.arrayBuffer();
  const bytes = Buffer.from(arrayBuffer);
  await fs.writeFile(outputPath, bytes);

  return { path: outputPath, bytes: bytes.byteLength };
}

function parseArg(name: string): string | undefined {
  const idx = process.argv.findIndex((arg) => arg === name);
  if (idx === -1 || idx + 1 >= process.argv.length) {
    return undefined;
  }
  return process.argv[idx + 1];
}

async function main(): Promise<void> {
  const outputPath = parseArg('--output') ?? 'benchmarks/locomo/data/locomo10.json';
  const datasetUrl = parseArg('--url') ?? LOCOMO_DATASET_URL;
  const force = process.argv.includes('--force');
  const downloaded = await downloadLocomoDataset({ outputPath, datasetUrl, force });
  console.log(`Downloaded LoCoMo dataset to ${downloaded.path} (${downloaded.bytes} bytes)`);
}

if (import.meta.url === `file://${process.argv[1]}`) {
  main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
}