From c51b7fe864b1d3f2052d8a2ee0ecd933310deb96 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 7 Mar 2026 15:30:59 +0000 Subject: [PATCH 1/2] Add remote LLM backend support with averaged embedding optimization Wire remote HTTP-based LLM (embed/rerank/generate via llama.cpp servers) as an alternative to local node-llama-cpp. Add `llm?: LLM` option to hybridQuery, vectorSearchQuery, and structuredSearch so callers can override the LLM backend. Average all expanded query embeddings into a single vector for one sqlite-vec scan instead of N separate scans, reducing query time from ~47s to ~12s on a 25GB index. Key changes: - llm-remote.ts: RemoteLLM class with HTTP embed/rerank/generate - store.ts: LLM passthrough for expandQuery, embedBatch, rerank; averaged embedding scan in hybridQuery and vectorSearchQuery; .qmd directory resolution for DB path - qmd.ts: withLLMSessionAuto, remote CLI commands, --local flag - collections.ts: config path resolution via .qmd directory - llm.ts: embedBatch added to LLM interface Co-Authored-By: Claude Opus 4.6 --- server/docker-compose.yml | 68 ++++ src/cli/qmd.ts | 155 +++++++- src/collections.ts | 65 ++-- src/llm-remote.ts | 740 ++++++++++++++++++++++++++++++++++++++ src/llm.ts | 5 + src/store.ts | 165 +++++++-- 6 files changed, 1137 insertions(+), 61 deletions(-) create mode 100644 server/docker-compose.yml create mode 100644 src/llm-remote.ts diff --git a/server/docker-compose.yml b/server/docker-compose.yml new file mode 100644 index 00000000..00387710 --- /dev/null +++ b/server/docker-compose.yml @@ -0,0 +1,68 @@ +name: qmd + +services: + qmd-embed: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-embed + ports: + - "8081:8080" + volumes: + - qmd-models:/models + command: > + -m /models/embeddinggemma-300M-Q8_0.gguf + --embedding --pooling mean + --host 0.0.0.0 --port 8080 + -ngl 99 -b 4096 -ub 4096 -np 4 -c 16384 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + 
capabilities: [gpu] + restart: unless-stopped + + qmd-rerank: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-rerank + ports: + - "8082:8080" + volumes: + - qmd-models:/models + command: > + -m /models/qwen3-reranker-0.6b-q8_0.gguf + --reranking --pooling rank + --host 0.0.0.0 --port 8080 + -ngl 99 -b 2048 -ub 2048 -c 4096 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped + + qmd-generate: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-generate + ports: + - "8083:8080" + volumes: + - qmd-models:/models + command: > + -m /models/qmd-query-expansion-1.7B-q4_k_m.gguf + --host 0.0.0.0 --port 8080 + -ngl 99 -c 2048 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped + +volumes: + qmd-models: + external: true diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 52a076da..531a290b 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -32,6 +32,7 @@ import { hashContent, extractTitle, formatDocForEmbedding, + chunkDocument, chunkDocumentByTokens, clearCache, getCacheKey, @@ -74,8 +75,15 @@ import { generateEmbeddings, syncConfigToDb, type ReindexResult, + findLocalQmdDir, + initLocalQmdDir, + setCliQmdDir, + getCliQmdDir, + getEffectiveQmdDir, + setQmdDirConfigLoader, } from "../store.js"; import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { RemoteLLM, loadRemoteConfig, saveRemoteConfig, clearRemoteConfig, isRemoteConfigured, getDefaultRemoteLLM, disposeDefaultRemoteLLM, withRemoteLLMSession, loadQmdDirConfig, saveQmdDirConfig, clearQmdDirConfig } from "../llm-remote.js"; import { formatSearchResults, formatDocuments, @@ -95,6 +103,7 @@ import { listAllContexts, setConfigIndexName, loadConfig, + setConfigDirResolver, } 
from "../collections.js"; import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js"; @@ -102,6 +111,42 @@ import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedde // Tests must set INDEX_PATH or use createStore() with explicit path enableProductionMode(); +// Wire .qmd directory config resolution +setQmdDirConfigLoader(loadQmdDirConfig); +setConfigDirResolver(() => getEffectiveQmdDir()); + +// ============================================================================= +// LLM Backend Selection +// ============================================================================= + +let forceLocalMode = false; + +function setForceLocalMode(force: boolean): void { + forceLocalMode = force; +} + +function shouldUseRemote(): boolean { + if (forceLocalMode) return false; + return isRemoteConfigured(); +} + +async function withLLMSessionAuto( + fn: (session: any) => Promise, + options?: any +): Promise { + if (shouldUseRemote()) { + return withRemoteLLMSession(fn, options); + } + return withLLMSession(fn, options); +} + +async function disposeAllLLM(): Promise { + if (shouldUseRemote()) { + await disposeDefaultRemoteLLM(); + } + await disposeDefaultLlamaCpp(); +} + // ============================================================================= // Store/DB lifecycle (no legacy singletons in store.ts) // ============================================================================= @@ -2125,12 +2170,13 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string = checkIndexHealth(store.db); - await withLLMSession(async () => { + await withLLMSessionAuto(async () => { let results = await vectorSearchQuery(store, query, { collection: singleCollection, limit: opts.all ? 500 : (opts.limit || 10), minScore: opts.minScore || 0.3, intent: opts.intent, + llm: shouldUseRemote() ? 
getDefaultRemoteLLM() : undefined, hooks: { onExpand: (original, expanded) => { logExpansionTree(original, expanded); @@ -2181,7 +2227,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri // Intent can come from --intent flag or from intent: line in query document const intent = opts.intent || parsed?.intent; - await withLLMSession(async () => { + await withLLMSessionAuto(async () => { let results; if (parsed) { @@ -2234,6 +2280,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri candidateLimit: opts.candidateLimit, explain: !!opts.explain, intent, + llm: shouldUseRemote() ? getDefaultRemoteLLM() : undefined, hooks: { onStrongSignal: (score) => { process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`); @@ -2349,6 +2396,12 @@ function parseCLI() { http: { type: "boolean" }, daemon: { type: "boolean" }, port: { type: "string" }, + // Remote LLM options + local: { type: "boolean" }, + "embed-url": { type: "string" }, + "rerank-url": { type: "string" }, + "generate-url": { type: "string" }, + "generate-model": { type: "string" }, }, allowPositionals: true, strict: false, // Allow unknown options to pass through @@ -2543,6 +2596,11 @@ function showHelp(): void { console.log(" qmd context add/list/rm - Attach human-written summaries"); console.log(" qmd ls [collection[/path]] - Inspect indexed files"); console.log(""); + console.log("Remote LLM backend:"); + console.log(" qmd remote set - Configure remote embed/rerank/generate endpoints"); + console.log(" qmd remote status - Show remote config and check health"); + console.log(" qmd remote clear - Clear remote config (revert to local)"); + console.log(""); console.log("Maintenance:"); console.log(" qmd status - View index + collection health"); console.log(" qmd update [--pull] - Re-index collections (optionally git pull first)"); @@ -2592,6 +2650,7 @@ function showHelp(): void { console.log(""); 
console.log("Global options:"); console.log(" --index - Use a named index (default: index)"); + console.log(" --local - Force local LLM mode (ignore remote config)"); console.log(""); console.log("Search options:"); console.log(" -n - Max results (default 5, or 20 for --files/--json)"); @@ -2662,6 +2721,10 @@ if (isMain) { process.exit(0); } + if (cli.values.local) { + setForceLocalMode(true); + } + if (!cli.command || cli.values.help) { showHelp(); process.exit(cli.values.help ? 0 : 1); @@ -3132,6 +3195,92 @@ if (isMain) { break; } + case "remote": { + const subcommand = cli.args[0]; + if (!subcommand) { + console.error("Usage: qmd remote "); + console.error(""); + console.error("Commands:"); + console.error(" qmd remote set "); + console.error(" qmd remote status - Show current remote configuration"); + console.error(" qmd remote clear - Clear remote config (use local mode)"); + process.exit(1); + } + + switch (subcommand) { + case "set": { + let embedUrl = cli.values["embed-url"] as string | undefined; + let rerankUrl = cli.values["rerank-url"] as string | undefined; + let generateUrl = cli.values["generate-url"] as string | undefined; + const generateModel = cli.values["generate-model"] as string | undefined; + + if (cli.args.length >= 4) { + embedUrl = cli.args[1]; + rerankUrl = cli.args[2]; + generateUrl = cli.args[3]; + } else if (cli.args.length === 2 && cli.args[1]) { + embedUrl = cli.args[1]; + rerankUrl = cli.args[1]; + generateUrl = cli.args[1]; + } + + if (!embedUrl && !rerankUrl && !generateUrl && !generateModel) { + console.error("Usage: qmd remote set "); + process.exit(1); + } + + const existingConfig = loadRemoteConfig(); + const newConfig = { + embedUrl: embedUrl ?? existingConfig.embedUrl, + rerankUrl: rerankUrl ?? existingConfig.rerankUrl, + generateUrl: generateUrl ?? existingConfig.generateUrl, + generateModel: generateModel ?? 
existingConfig.generateModel, + }; + + saveRemoteConfig(newConfig); + console.log("Remote configuration saved"); + console.log(` Embed: ${newConfig.embedUrl || "(not set)"}`); + console.log(` Rerank: ${newConfig.rerankUrl || "(not set)"}`); + console.log(` Generate: ${newConfig.generateUrl || "(not set)"}`); + if (newConfig.generateModel) { + console.log(` Generate model: ${newConfig.generateModel}`); + } + break; + } + case "status": { + const config = loadRemoteConfig(); + if (!config.embedUrl && !config.rerankUrl && !config.generateUrl) { + console.log("Remote mode: disabled (using local models)"); + } else { + console.log("Remote Configuration\n"); + console.log(` Embed: ${config.embedUrl || "(not set)"}`); + console.log(` Rerank: ${config.rerankUrl || "(not set)"}`); + console.log(` Generate: ${config.generateUrl || "(not set)"}`); + if (config.generateModel) { + console.log(` Generate model: ${config.generateModel}`); + } + console.log("\nChecking endpoint health..."); + const remote = new RemoteLLM(config); + const health = await remote.checkHealth(); + console.log(` Embed: ${health.embed ? "healthy" : "unreachable"}`); + console.log(` Rerank: ${health.rerank ? "healthy" : "unreachable"}`); + console.log(` Generate: ${health.generate ? 
"healthy" : "unreachable"}`); + } + break; + } + case "clear": { + clearRemoteConfig(); + console.log("Remote configuration cleared"); + console.log("Now using local models"); + break; + } + default: + console.error(`Unknown remote subcommand: ${subcommand}`); + process.exit(1); + } + break; + } + default: console.error(`Unknown command: ${cli.command}`); console.error("Run 'qmd --help' for usage."); @@ -3139,7 +3288,7 @@ if (isMain) { } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); + await disposeAllLLM(); process.exit(0); } diff --git a/src/collections.ts b/src/collections.ts index 257f144f..98ac6834 100644 --- a/src/collections.ts +++ b/src/collections.ts @@ -10,6 +10,20 @@ import { join, dirname } from "path"; import { homedir } from "os"; import YAML from "yaml"; +// ============================================================================ +// Config path resolution (avoids circular import with store) +// ============================================================================ + +let _qmdDirResolver: (() => string | null) | null = null; + +/** + * Set the resolver for the effective .qmd directory. + * When set, config is loaded from {qmdDir}/index.yml first. + */ +export function setConfigDirResolver(resolver: (() => string | null) | null): void { + _qmdDirResolver = resolver; +} + // ============================================================================ // Types // ============================================================================ @@ -99,28 +113,26 @@ export function setConfigIndexName(name: string): void { } } -function getConfigDir(): string { - // Allow override via QMD_CONFIG_DIR for testing +function getConfigFilePath(): string { + // 1. 
Test override (QMD_CONFIG_DIR) if (process.env.QMD_CONFIG_DIR) { - return process.env.QMD_CONFIG_DIR; + return join(process.env.QMD_CONFIG_DIR, "index.yml"); } - // Respect XDG Base Directory specification (consistent with store.ts) - if (process.env.XDG_CONFIG_HOME) { - return join(process.env.XDG_CONFIG_HOME, "qmd"); + // 2. Index-colocated config when using a .qmd directory + if (_qmdDirResolver) { + const qmdDir = _qmdDirResolver(); + if (qmdDir) { + return join(qmdDir, "index.yml"); + } } - return join(homedir(), ".config", "qmd"); + // 3. Fallback to global config + return join(homedir(), ".config", "qmd", "index.yml"); } -function getConfigFilePath(): string { - return join(getConfigDir(), `${currentIndexName}.yml`); -} - -/** - * Ensure config directory exists - */ function ensureConfigDir(): void { - const configDir = getConfigDir(); - if (!existsSync(configDir)) { + const configPath = getConfigFilePath(); + const configDir = configPath.slice(0, configPath.lastIndexOf("/")); + if (configDir && !existsSync(configDir)) { mkdirSync(configDir, { recursive: true }); } } @@ -130,10 +142,14 @@ function ensureConfigDir(): void { // ============================================================================ /** - * Load configuration from the configured source. - * - Inline config: returns the in-memory object directly - * - File-based: reads from YAML file (default ~/.config/qmd/index.yml) - * Returns empty config if file doesn't exist + * Load collection configuration. + * + * Resolution order for the config file: + * 1. QMD_CONFIG_DIR env var (test override) -> {QMD_CONFIG_DIR}/index.yml + * 2. .qmd directory via setConfigDirResolver() -> {qmdDir}/index.yml + * 3. Global fallback -> ~/.config/qmd/index.yml + * + * Returns empty config if file doesn't exist. */ export function loadConfig(): CollectionConfig { // SDK inline config mode @@ -163,9 +179,12 @@ export function loadConfig(): CollectionConfig { } /** - * Save configuration to the configured source. 
- * - Inline config: updates the in-memory object (no file I/O) - * - File-based: writes to YAML file (default ~/.config/qmd/index.yml) + * Save collection configuration. + * + * Resolution order for the config file: + * 1. QMD_CONFIG_DIR env var (test override) -> {QMD_CONFIG_DIR}/index.yml + * 2. .qmd directory via setConfigDirResolver() -> {qmdDir}/index.yml + * 3. Global fallback -> ~/.config/qmd/index.yml */ export function saveConfig(config: CollectionConfig): void { // SDK inline config mode: update in place, no file I/O diff --git a/src/llm-remote.ts b/src/llm-remote.ts new file mode 100644 index 00000000..c6901c32 --- /dev/null +++ b/src/llm-remote.ts @@ -0,0 +1,740 @@ +/** + * llm-remote.ts - Remote LLM backend for QMD using HTTP endpoints + * + * Provides embeddings, text generation, and reranking via remote llama.cpp servers. + */ + +import { homedir } from "os"; +import { join } from "path"; +import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs"; + +import type { + LLM, + EmbeddingResult, + GenerateResult, + ModelInfo, + EmbedOptions, + GenerateOptions, + RerankOptions, + RerankDocument, + RerankDocumentResult, + RerankResult, + Queryable, + QueryType, + ILLMSession, + LLMSessionOptions, +} from "./llm.js"; + +// ============================================================================= +// Configuration +// ============================================================================= + +export type RemoteLLMConfig = { + embedUrl?: string; // e.g. "http://192.168.1.100:8081" + rerankUrl?: string; // e.g. "http://192.168.1.100:8082" + generateUrl?: string; // e.g. "http://192.168.1.100:8083" or "http://localhost:4000" (LiteLLM) + generateModel?: string; // e.g. 
"gpt-4o-mini" or "ollama/llama3" - required for LiteLLM, optional for llama.cpp +}; + +// Config file path +const CONFIG_DIR = join(homedir(), ".cache", "qmd"); +const CONFIG_FILE = join(CONFIG_DIR, "config.json"); + +/** + * Load remote config from file + */ +export function loadRemoteConfig(): RemoteLLMConfig { + try { + if (existsSync(CONFIG_FILE)) { + const data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + return data.remote || {}; + } + } catch (e) { + // Ignore errors, return empty config + } + return {}; +} + +/** + * Save remote config to file + */ +export function saveRemoteConfig(config: RemoteLLMConfig): void { + try { + if (!existsSync(CONFIG_DIR)) { + mkdirSync(CONFIG_DIR, { recursive: true }); + } + + let data: Record = {}; + if (existsSync(CONFIG_FILE)) { + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh if parse fails + } + } + + data.remote = config; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } catch (e) { + console.error("Failed to save remote config:", e); + } +} + +/** + * Clear remote config + */ +export function clearRemoteConfig(): void { + try { + if (existsSync(CONFIG_FILE)) { + let data: Record = {}; + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + delete data.remote; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } + } catch (e) { + console.error("Failed to clear remote config:", e); + } +} + +// ============================================================================= +// QMD Directory Config (persistent path to .qmd folder) +// ============================================================================= + +/** + * Load saved qmdDir from config + */ +export function loadQmdDirConfig(): string | null { + try { + if (existsSync(CONFIG_FILE)) { + const data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + return data.qmdDir || null; + } + } catch { + // Ignore errors + } + return null; +} + +/** + 
* Save qmdDir to config + */ +export function saveQmdDirConfig(qmdDir: string): void { + try { + if (!existsSync(CONFIG_DIR)) { + mkdirSync(CONFIG_DIR, { recursive: true }); + } + + let data: Record = {}; + if (existsSync(CONFIG_FILE)) { + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + } + + data.qmdDir = qmdDir; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } catch (e) { + console.error("Failed to save qmdDir config:", e); + } +} + +/** + * Clear qmdDir from config + */ +export function clearQmdDirConfig(): void { + try { + if (existsSync(CONFIG_FILE)) { + let data: Record = {}; + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + delete data.qmdDir; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } + } catch { + // Ignore errors + } +} + +/** + * Check if remote mode is configured + */ +export function isRemoteConfigured(): boolean { + const config = loadRemoteConfig(); + return !!(config.embedUrl || config.rerankUrl || config.generateUrl); +} + +// ============================================================================= +// Remote LLM Implementation +// ============================================================================= + +/** + * LLM implementation using remote HTTP endpoints (llama.cpp servers) + */ +export class RemoteLLM implements LLM { + private embedUrl: string | null; + private rerankUrl: string | null; + private generateUrl: string | null; + private generateModel: string | null; + + constructor(config: RemoteLLMConfig = {}) { + // Load from saved config, then override with explicit config + const savedConfig = loadRemoteConfig(); + this.embedUrl = config.embedUrl ?? savedConfig.embedUrl ?? null; + this.rerankUrl = config.rerankUrl ?? savedConfig.rerankUrl ?? null; + this.generateUrl = config.generateUrl ?? savedConfig.generateUrl ?? null; + this.generateModel = config.generateModel ?? 
savedConfig.generateModel ?? null; + } + + /** + * Get embeddings via remote server + */ + async embed(text: string, options: EmbedOptions = {}): Promise { + if (!this.embedUrl) { + console.error("No embed URL configured"); + return null; + } + + try { + const response = await fetch(`${this.embedUrl}/v1/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: text, + model: "embeddinggemma", + }), + }); + + if (!response.ok) { + console.error(`Embed request failed: ${response.status} ${response.statusText}`); + return null; + } + + const data = await response.json() as { + data: Array<{ embedding: number[] }>; + model: string; + }; + + if (!data.data || data.data.length === 0) { + console.error("No embedding data in response"); + return null; + } + + return { + embedding: data.data[0]!.embedding, + model: data.model || "remote-embed", + }; + } catch (error) { + console.error("Embedding error:", error); + return null; + } + } + + /** + * Batch embed multiple texts in a single API call + */ + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + if (!this.embedUrl) { + return texts.map(() => null); + } + + try { + // Send all texts in a single request (OpenAI API supports array input) + const response = await fetch(`${this.embedUrl}/v1/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: texts, + model: "embeddinggemma", + }), + }); + + if (!response.ok) { + console.error(`Batch embed failed: ${response.status} ${response.statusText}`); + // Fall back to sequential individual requests (avoids DB locking) + const results: (EmbeddingResult | null)[] = []; + for (const text of texts) { + results.push(await this.embed(text)); + } + return results; + } + + const data = await response.json() as { + data: Array<{ embedding: number[]; index: number }>; + model: string; + }; + + if (!data.data 
|| data.data.length === 0) { + console.error("No embedding data in batch response"); + return texts.map(() => null); + } + + // Map results back to original order (API may return in different order) + const results: (EmbeddingResult | null)[] = new Array(texts.length).fill(null); + for (const item of data.data) { + if (item.index < texts.length) { + results[item.index] = { + embedding: item.embedding, + model: data.model || "remote-embed", + }; + } + } + return results; + } catch (error) { + console.error("Batch embedding error:", error); + // Fall back to sequential individual requests (avoids DB locking) + const results: (EmbeddingResult | null)[] = []; + for (const text of texts) { + results.push(await this.embed(text)); + } + return results; + } + } + + /** + * Generate text via remote server + */ + async generate(prompt: string, options: GenerateOptions = {}): Promise { + if (!this.generateUrl) { + console.error("No generate URL configured"); + return null; + } + + try { + const body: Record = { + prompt, + max_tokens: options.maxTokens ?? 150, + temperature: options.temperature ?? 
0, + }; + if (this.generateModel) { + body.model = this.generateModel; + } + const response = await fetch(`${this.generateUrl}/v1/completions`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + console.error(`Generate request failed: ${response.status} ${response.statusText}`); + return null; + } + + const data = await response.json() as { + choices: Array<{ text: string }>; + model: string; + }; + + if (!data.choices || data.choices.length === 0) { + console.error("No choices in response"); + return null; + } + + return { + text: data.choices[0]!.text, + model: data.model || "remote-generate", + done: true, + }; + } catch (error) { + console.error("Generate error:", error); + return null; + } + } + + /** + * Check if model exists (always returns true for remote) + */ + async modelExists(model: string): Promise { + return { name: model, exists: true }; + } + + /** + * Expand a search query into multiple variations + */ + async expandQuery( + query: string, + options: { context?: string; includeLexical?: boolean } = {} + ): Promise { + if (!this.generateUrl) { + // Fallback to original query + const fallback: Queryable[] = [{ type: 'vec', text: query }]; + if (options.includeLexical !== false) { + fallback.unshift({ type: 'lex', text: query }); + } + return fallback; + } + + const includeLexical = options.includeLexical ?? true; + const context = options.context; + + const prompt = `You are a search query optimization expert. Your task is to improve retrieval by rewriting queries and generating hypothetical documents. + +Original Query: ${query} + +${context ? `Additional Context, ONLY USE IF RELEVANT:\n\n${context}` : ""} + +## Step 1: Query Analysis +Identify entities, search intent, and missing context. + +## Step 2: Generate Hypothetical Document +Write a focused sentence passage that would answer the query. Include specific terminology and domain vocabulary. 
+ +## Step 3: Query Rewrites +Generate 2-3 alternative search queries that resolve ambiguities. Use terminology from the hypothetical document. + +## Step 4: Final Retrieval Text +Output MAX ONE 'hyde' line FIRST, then 1-3 'lex' lines, then 1-3 'vec' lines. + + +hyde: {complete hypothetical document passage from Step 2 on a SINGLE LINE} +lex: {single search term} +vec: {single vector query} + + + +Example (FOR FORMAT ONLY - DO NOT COPY THIS CONTENT): +hyde: This is an example of a hypothetical document passage that would answer the example query. It contains multiple sentences and relevant vocabulary. +lex: example keyword 1 +lex: example keyword 2 +vec: example semantic query + + + +- DO NOT repeat the same line. +- Each 'lex:' line MUST be a different keyword variation based on the ORIGINAL QUERY. +- Each 'vec:' line MUST be a different semantic variation based on the ORIGINAL QUERY. +- The 'hyde:' line MUST be the full sentence passage from Step 2, but all on one line. +- DO NOT use the example content above. +${!includeLexical ? 
"- Do NOT output any 'lex:' lines" : ""} + + +Final Output:`; + + try { + const result = await this.generate(prompt, { maxTokens: 1000, temperature: 1 }); + if (!result) { + throw new Error("Generation failed"); + } + + const lines = result.text.trim().split("\n"); + const queryables: Queryable[] = lines.map((line: string) => { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) return null; + const type = line.slice(0, colonIdx).trim(); + if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null; + const text = line.slice(colonIdx + 1).trim(); + return { type: type as QueryType, text }; + }).filter((q: Queryable | null): q is Queryable => q !== null); + + // Filter out lex entries if not requested + if (!includeLexical) { + return queryables.filter(q => q.type !== 'lex'); + } + return queryables; + } catch (error) { + console.error("Query expansion failed:", error); + // Fallback to original query + const fallback: Queryable[] = [{ type: 'vec', text: query }]; + if (includeLexical) fallback.unshift({ type: 'lex', text: query }); + return fallback; + } + } + + /** + * Rerank documents by relevance to a query + */ + async rerank( + query: string, + documents: RerankDocument[], + options: RerankOptions = {} + ): Promise { + if (!this.rerankUrl) { + // Return documents in original order with default scores + return { + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), // Decreasing scores + index, + })), + model: "no-rerank", + }; + } + + // If we have more than 10 documents, batch them to avoid overwhelming the server + const BATCH_SIZE = 10; + if (documents.length > BATCH_SIZE) { + try { + const allResults: RerankDocumentResult[] = []; + let modelName = "remote-rerank"; + + // Process in batches + for (let i = 0; i < documents.length; i += BATCH_SIZE) { + const batch = documents.slice(i, i + BATCH_SIZE); + const batchResult = await this.rerankBatch(query, batch, i); + allResults.push(...batchResult.results); 
+ modelName = batchResult.model; + } + + // Sort all results by score descending + allResults.sort((a, b) => b.score - a.score); + + return { + results: allResults, + model: modelName, + }; + } catch (error) { + console.error("Batch rerank error:", error); + // Fallback + return { + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), + index, + })), + model: "rerank-fallback", + }; + } + } + + // Single batch - use existing logic + return this.rerankBatch(query, documents, 0); + } + + /** + * Rerank a single batch of documents + */ + private async rerankBatch( + query: string, + documents: RerankDocument[], + indexOffset: number + ): Promise { + try { + const texts = documents.map(doc => doc.text); + + const response = await fetch(`${this.rerankUrl}/v1/rerank`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query, + documents: texts, + model: "qwen3-reranker", + }), + }); + + if (!response.ok) { + console.error(`Rerank request failed: ${response.status} ${response.statusText}`); + // Try to get error details from response body + try { + const errorText = await response.text(); + console.error(`Rerank error details: ${errorText}`); + } catch (e) { + // Ignore if we can't read the error body + } + throw new Error("Rerank request failed"); + } + + const data = await response.json() as { + results: Array<{ index: number; relevance_score: number }>; + model: string; + }; + + // Map results back to our format (with adjusted indices) + const results: RerankDocumentResult[] = data.results.map(item => ({ + file: documents[item.index]!.file, + score: item.relevance_score, + index: indexOffset + item.index, + })); + + // Sort by score descending + results.sort((a, b) => b.score - a.score); + + return { + results, + model: data.model || "remote-rerank", + }; + } catch (error) { + console.error("Rerank batch error:", error); + // Return documents in original order with default scores + return 
{ + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), + index: indexOffset + index, + })), + model: "rerank-fallback", + }; + } + } + + /** + * Dispose (no-op for remote) + */ + async dispose(): Promise { + // Nothing to dispose for remote connections + } + + /** + * Check health of remote endpoints + */ + async checkHealth(): Promise<{ embed: boolean; rerank: boolean; generate: boolean }> { + const results = { embed: false, rerank: false, generate: false }; + + const checkEndpoint = async (url: string | null): Promise => { + if (!url) return false; + try { + const response = await fetch(`${url}/health`, { method: "GET" }); + return response.ok; + } catch { + return false; + } + }; + + [results.embed, results.rerank, results.generate] = await Promise.all([ + checkEndpoint(this.embedUrl), + checkEndpoint(this.rerankUrl), + checkEndpoint(this.generateUrl), + ]); + + return results; + } + + /** + * Get configured URLs + */ + getConfig(): RemoteLLMConfig { + return { + embedUrl: this.embedUrl || undefined, + rerankUrl: this.rerankUrl || undefined, + generateUrl: this.generateUrl || undefined, + generateModel: this.generateModel || undefined, + }; + } +} + +// ============================================================================= +// Remote LLM Session (implements ILLMSession for compatibility) +// ============================================================================= + +/** + * Session wrapper for RemoteLLM that implements ILLMSession interface. + * This allows RemoteLLM to be used with the existing withLLMSession pattern. 
+ */
+class RemoteLLMSession implements ILLMSession {
+  private llm: RemoteLLM;
+  private released = false;
+  private abortController: AbortController;
+
+  constructor(llm: RemoteLLM, _options: LLMSessionOptions = {}) {
+    this.llm = llm;
+    this.abortController = new AbortController();
+  }
+
+  get isValid(): boolean {
+    return !this.released && !this.abortController.signal.aborted;
+  }
+
+  get signal(): AbortSignal {
+    return this.abortController.signal;
+  }
+
+  release(): void {
+    this.released = true;
+    this.abortController.abort();
+  }
+
+  async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
+    if (!this.isValid) return null;
+    return this.llm.embed(text, options);
+  }
+
+  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
+    if (!this.isValid) return texts.map(() => null);
+    return this.llm.embedBatch(texts);
+  }
+
+  async expandQuery(
+    query: string,
+    options?: { context?: string; includeLexical?: boolean }
+  ): Promise {
+    if (!this.isValid) return [{ type: 'vec', text: query }];
+    return this.llm.expandQuery(query, options);
+  }
+
+  async rerank(
+    query: string,
+    documents: RerankDocument[],
+    options?: RerankOptions
+  ): Promise<RerankResult> {
+    if (!this.isValid) {
+      return {
+        results: documents.map((doc, index) => ({
+          file: doc.file,
+          score: 1 - (index * 0.1),
+          index,
+        })),
+        model: "session-invalid",
+      };
+    }
+    return this.llm.rerank(query, documents, options);
+  }
+}
+
+/**
+ * Execute a function with a scoped RemoteLLM session.
+ * Compatible with the existing withLLMSession pattern.
+ */
+export async function withRemoteLLMSession<T>(
+  fn: (session: ILLMSession) => Promise<T>,
+  options?: LLMSessionOptions
+): Promise<T> {
+  const llm = getDefaultRemoteLLM();
+  const session = new RemoteLLMSession(llm, options);
+
+  try {
+    return await fn(session);
+  } finally {
+    session.release();
+  }
+}
+
+// =============================================================================
+// Singleton for default RemoteLLM instance
+// =============================================================================
+
+let defaultRemoteLLM: RemoteLLM | null = null;
+
+/**
+ * Get the default RemoteLLM instance (creates one if needed)
+ */
+export function getDefaultRemoteLLM(): RemoteLLM {
+  if (!defaultRemoteLLM) {
+    defaultRemoteLLM = new RemoteLLM();
+  }
+  return defaultRemoteLLM;
+}
+
+/**
+ * Set a custom default RemoteLLM instance
+ */
+export function setDefaultRemoteLLM(llm: RemoteLLM | null): void {
+  defaultRemoteLLM = llm;
+}
+
+/**
+ * Dispose the default RemoteLLM instance if it exists
+ */
+export async function disposeDefaultRemoteLLM(): Promise<void> {
+  if (defaultRemoteLLM) {
+    await defaultRemoteLLM.dispose();
+    defaultRemoteLLM = null;
+  }
+}
diff --git a/src/llm.ts b/src/llm.ts
index 39ab28b5..11a22c26 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -317,6 +317,11 @@ export interface LLM {
    */
   embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
 
+  /**
+   * Get embeddings for multiple texts in a single batch call
+   */
+  embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
+
   /**
    * Generate text completion
    */
diff --git a/src/store.ts b/src/store.ts
index aa5fae4f..a1c17cf5 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -25,6 +25,7 @@ import {
   formatDocForEmbedding,
   withLLMSessionForLlm,
   type LLMSessionOptions,
+  type LLM,
   type RerankDocument,
   type ILLMSession,
 } from "./llm.js";
@@ -427,6 +428,75 @@ export function enableProductionMode(): void {
   _productionMode = true;
 }
 
+// 
============================================================================= +// .qmd directory resolution +// ============================================================================= + +/** + * Find a local .qmd directory by searching current directory and parents. + */ +export function findLocalQmdDir(startDir?: string): string | null { + let dir = startDir || getPwd(); + const root = resolve("/"); + + while (dir !== root) { + const qmdDir = resolve(dir, ".qmd"); + try { + const stat = statSync(qmdDir); + if (stat.isDirectory()) return qmdDir; + } catch { + // Directory doesn't exist, continue searching + } + const parent = resolve(dir, ".."); + if (parent === dir) break; + dir = parent; + } + return null; +} + +/** + * Initialize a local .qmd directory in the specified path. + */ +export function initLocalQmdDir(targetDir?: string): string { + const dir = targetDir || getPwd(); + const qmdDir = resolve(dir, ".qmd"); + mkdirSync(qmdDir, { recursive: true }); + return qmdDir; +} + +// CLI-provided qmdDir (highest priority) +let _cliQmdDir: string | null = null; + +export function setCliQmdDir(qmdDir: string | null): void { + _cliQmdDir = qmdDir; +} + +export function getCliQmdDir(): string | null { + return _cliQmdDir; +} + +// Config loader for saved qmdDir (avoids circular imports) +let _loadQmdDirConfig: (() => string | null) | null = null; + +export function setQmdDirConfigLoader(loader: () => string | null): void { + _loadQmdDirConfig = loader; +} + +/** + * Get the effective qmdDir with priority: + * 1. CLI flag (--qmd-dir) + * 2. Saved config (~/.cache/qmd/config.json) + * 3. 
Auto-discover (search upward for .qmd) + */ +export function getEffectiveQmdDir(): string | null { + if (_cliQmdDir) return _cliQmdDir; + if (_loadQmdDirConfig) { + const saved = _loadQmdDirConfig(); + if (saved) return saved; + } + return findLocalQmdDir(); +} + export function getDefaultDbPath(indexName: string = "index"): string { // Always allow override via INDEX_PATH (for testing) if (process.env.INDEX_PATH) { @@ -441,6 +511,13 @@ export function getDefaultDbPath(indexName: string = "index"): string { ); } + // Check for .qmd directory (CLI > config > auto-discover) + const qmdDir = getEffectiveQmdDir(); + if (qmdDir) { + return resolve(qmdDir, `${indexName}.sqlite`); + } + + // Fall back to global cache const cacheDir = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache"); const qmdCacheDir = resolve(cacheDir, "qmd"); try { mkdirSync(qmdCacheDir, { recursive: true }); } catch { } @@ -1012,8 +1089,8 @@ export type Store = { searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise; // Query expansion & reranking - expandQuery: (query: string, model?: string, intent?: string) => Promise; - rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => Promise<{ file: string; score: number }[]>; + expandQuery: (query: string, model?: string, intent?: string, llm?: LLM) => Promise; + rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string, llm?: LLM) => Promise<{ file: string; score: number }[]>; // Document retrieval findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound; @@ -1364,8 +1441,8 @@ export function createStore(dbPath?: string): Store { searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, 
collectionName, session, precomputedEmbedding), // Query expansion & reranking - expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm), - rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm), + expandQuery: (query: string, model?: string, intent?: string, llm?: LLM) => expandQuery(query, model, db, intent, llm ?? store.llm), + rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string, llm?: LLM) => rerank(query, documents, model, db, intent, llm ?? store.llm), // Document retrieval findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options), @@ -2820,7 +2897,7 @@ export function insertEmbedding( // Query expansion // ============================================================================= -export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise { +export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise { // Check cache first — stored as JSON preserving types const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) }); const cached = getCachedResult(db, cacheKey); @@ -2859,7 +2936,7 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M // Reranking // ============================================================================= -export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{ file: string; score: number }[]> { +export async function rerank(query: string, documents: { file: string; text: string }[], model: string = 
DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise<{ file: string; score: number }[]> { // Prepend intent to rerank query so the reranker scores with domain context const rerankQuery = intent ? `${intent}\n\n${query}` : query; @@ -3530,6 +3607,7 @@ export interface HybridQueryOptions { intent?: string; // domain intent hint for disambiguation skipRerank?: boolean; // skip LLM reranking, use only RRF scores hooks?: SearchHooks; + llm?: LLM; // override LLM backend (default: local node-llama-cpp) } export interface HybridQueryResult { @@ -3604,7 +3682,7 @@ export async function hybridQuery( const expandStart = Date.now(); const expanded = hasStrongSignal ? [] - : await store.expandQuery(query, undefined, intent); + : await store.expandQuery(query, undefined, intent, options?.llm); hooks?.onExpand?.(query, expanded, Date.now() - expandStart); @@ -3651,21 +3729,26 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getLlm(store); + const llm = options?.llm ?? 
getLlm(store); const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); const embeddings = await llm.embedBatch(textsToEmbed); hooks?.onEmbedDone?.(Date.now() - embedStart); - // Run sqlite-vec lookups with pre-computed embeddings - for (let i = 0; i < vecQueries.length; i++) { - const embedding = embeddings[i]?.embedding; - if (!embedding) continue; + // Average all embeddings into one vector for a single scan + const validEmbeddings = embeddings.filter(e => e !== null).map(e => e!.embedding); + if (validEmbeddings.length > 0) { + const dim = validEmbeddings[0]!.length; + const avgEmbedding = new Array(dim).fill(0); + for (const emb of validEmbeddings) { + for (let j = 0; j < dim; j++) avgEmbedding[j] += emb[j]!; + } + for (let j = 0; j < dim; j++) avgEmbedding[j] /= validEmbeddings.length; const vecResults = await store.searchVec( - vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection, - undefined, embedding + query, DEFAULT_EMBED_MODEL, 20, collection, + undefined, avgEmbedding ); if (vecResults.length > 0) { for (const r of vecResults) docidMap.set(r.filepath, r.docid); @@ -3777,7 +3860,7 @@ export async function hybridQuery( hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart = Date.now(); - const reranked = await store.rerank(query, chunksToRerank, undefined, intent); + const reranked = await store.rerank(query, chunksToRerank, undefined, intent, options?.llm); hooks?.onRerankDone?.(Date.now() - rerankStart); // Step 7: Blend RRF position score with reranker score @@ -3850,6 +3933,7 @@ export interface VectorSearchOptions { minScore?: number; // default 0.3 intent?: string; // domain intent hint for disambiguation hooks?: Pick; + llm?: LLM; } export interface VectorSearchResult { @@ -3888,29 +3972,39 @@ export async function vectorSearchQuery( // Expand query — filter to vec/hyde only (lex queries target FTS, not vector) const expandStart = 
Date.now(); - const allExpanded = await store.expandQuery(query, undefined, intent); + const allExpanded = await store.expandQuery(query, undefined, intent, options?.llm); const vecExpanded = allExpanded.filter(q => q.type !== 'lex'); options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart); - // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs + // Batch embed all query texts, then average into single embedding for one scan const queryTexts = [query, ...vecExpanded.map(q => q.query)]; + const llm = options?.llm ?? getLlm(store); + const textsToEmbed = queryTexts.map(q => formatQueryForEmbedding(q)); + const embeddings = await llm.embedBatch(textsToEmbed); + + // Average all embeddings into one vector + const validEmbeddings = embeddings.filter(e => e !== null).map(e => e!.embedding); + if (validEmbeddings.length === 0) return []; + const dim = validEmbeddings[0]!.length; + const avgEmbedding = new Array(dim).fill(0); + for (const emb of validEmbeddings) { + for (let i = 0; i < dim; i++) avgEmbedding[i] += emb[i]!; + } + for (let i = 0; i < dim; i++) avgEmbedding[i] /= validEmbeddings.length; + + // Single scan with averaged embedding + const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, limit, collection, undefined, avgEmbedding); const allResults = new Map(); - for (const q of queryTexts) { - const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection); - for (const r of vecResults) { - const existing = allResults.get(r.filepath); - if (!existing || r.score > existing.score) { - allResults.set(r.filepath, { - file: r.filepath, - displayPath: r.displayPath, - title: r.title, - body: r.body || "", - score: r.score, - context: store.getContextForFile(r.filepath), - docid: r.docid, - }); - } - } + for (const r of vecResults) { + allResults.set(r.filepath, { + file: r.filepath, + displayPath: r.displayPath, + title: r.title, + body: r.body || "", + score: r.score, + 
context: store.getContextForFile(r.filepath), + docid: r.docid, + }); } return Array.from(allResults.values()) @@ -3938,6 +4032,7 @@ export interface StructuredSearchOptions { /** Skip LLM reranking, use only RRF scores */ skipRerank?: boolean; hooks?: SearchHooks; + llm?: LLM; } /** @@ -4032,7 +4127,7 @@ export async function structuredSearch( s.type === 'vec' || s.type === 'hyde' ); if (vecSearches.length > 0) { - const llm = getLlm(store); + const llm = options?.llm ?? getLlm(store); const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); @@ -4167,7 +4262,7 @@ export async function structuredSearch( hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart2 = Date.now(); - const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent); + const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent, options?.llm); hooks?.onRerankDone?.(Date.now() - rerankStart2); // Step 6: Blend RRF position score with reranker score From 575b9ea54ead687484de4cbe0d64562f3dcceedd Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 11 Mar 2026 12:30:25 +0000 Subject: [PATCH 2/2] Enhance embedding functionality with retry logic for transient errors Updated the embed method in RemoteLLM to implement a retry mechanism, allowing up to three attempts for transient errors during remote server requests. Improved error handling and logging for both individual and batch embedding processes, ensuring robustness in case of failures. Additionally, modified comments to clarify the behavior of batch embedding fallbacks. 
Key changes: - Added retry logic in embed method for transient errors - Enhanced error handling and logging - Updated comments for clarity on batch embedding behavior --- src/llm-remote.ts | 81 +++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/src/llm-remote.ts b/src/llm-remote.ts index c6901c32..73b44a78 100644 --- a/src/llm-remote.ts +++ b/src/llm-remote.ts @@ -195,7 +195,7 @@ export class RemoteLLM implements LLM { } /** - * Get embeddings via remote server + * Get embeddings via remote server (retries up to 3 times on transient errors) */ async embed(text: string, options: EmbedOptions = {}): Promise { if (!this.embedUrl) { @@ -203,43 +203,62 @@ export class RemoteLLM implements LLM { return null; } - try { - const response = await fetch(`${this.embedUrl}/v1/embeddings`, { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ - input: text, - model: "embeddinggemma", - }), - }); + const MAX_RETRIES = 3; + for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { + try { + const response = await fetch(`${this.embedUrl}/v1/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: text, + model: "embeddinggemma", + }), + }); + + if (response.status === 400) { + // Client error — retrying won't help + console.error(`Embed request failed: ${response.status} ${response.statusText}`); + return null; + } - if (!response.ok) { - console.error(`Embed request failed: ${response.status} ${response.statusText}`); - return null; - } + if (!response.ok) { + console.error(`Embed request failed: ${response.status} ${response.statusText}`); + if (attempt < MAX_RETRIES - 1) { + await new Promise(r => setTimeout(r, 1000 * (attempt + 1))); + continue; + } + return null; + } - const data = await response.json() as { - data: Array<{ embedding: number[] }>; - model: string; - }; + const data = await response.json() as { + 
data: Array<{ embedding: number[] }>; + model: string; + }; - if (!data.data || data.data.length === 0) { - console.error("No embedding data in response"); + if (!data.data || data.data.length === 0) { + console.error("No embedding data in response"); + return null; + } + + return { + embedding: data.data[0]!.embedding, + model: data.model || "remote-embed", + }; + } catch (error) { + if (attempt < MAX_RETRIES - 1) { + await new Promise(r => setTimeout(r, 1000 * (attempt + 1))); + continue; + } + console.error("Embedding error:", error); return null; } - - return { - embedding: data.data[0]!.embedding, - model: data.model || "remote-embed", - }; - } catch (error) { - console.error("Embedding error:", error); - return null; } + return null; } /** - * Batch embed multiple texts in a single API call + * Batch embed multiple texts in a single API call. + * On batch failure, falls back to sequential individual requests (which have their own retries). */ async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { if (texts.length === 0) return []; @@ -260,7 +279,7 @@ export class RemoteLLM implements LLM { if (!response.ok) { console.error(`Batch embed failed: ${response.status} ${response.statusText}`); - // Fall back to sequential individual requests (avoids DB locking) + // Fall back to sequential individual requests (each has retries) const results: (EmbeddingResult | null)[] = []; for (const text of texts) { results.push(await this.embed(text)); @@ -291,7 +310,7 @@ export class RemoteLLM implements LLM { return results; } catch (error) { console.error("Batch embedding error:", error); - // Fall back to sequential individual requests (avoids DB locking) + // Fall back to sequential individual requests (each has retries) const results: (EmbeddingResult | null)[] = []; for (const text of texts) { results.push(await this.embed(text));