diff --git a/server/docker-compose.yml b/server/docker-compose.yml new file mode 100644 index 00000000..00387710 --- /dev/null +++ b/server/docker-compose.yml @@ -0,0 +1,68 @@ +name: qmd + +services: + qmd-embed: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-embed + ports: + - "8081:8080" + volumes: + - qmd-models:/models + command: > + -m /models/embeddinggemma-300M-Q8_0.gguf + --embedding --pooling mean + --host 0.0.0.0 --port 8080 + -ngl 99 -b 4096 -ub 4096 -np 4 -c 16384 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped + + qmd-rerank: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-rerank + ports: + - "8082:8080" + volumes: + - qmd-models:/models + command: > + -m /models/qwen3-reranker-0.6b-q8_0.gguf + --reranking --pooling rank + --host 0.0.0.0 --port 8080 + -ngl 99 -b 2048 -ub 2048 -c 4096 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped + + qmd-generate: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: qmd-generate + ports: + - "8083:8080" + volumes: + - qmd-models:/models + command: > + -m /models/qmd-query-expansion-1.7B-q4_k_m.gguf + --host 0.0.0.0 --port 8080 + -ngl 99 -c 2048 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped + +volumes: + qmd-models: + external: true diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 52a076da..531a290b 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -32,6 +32,7 @@ import { hashContent, extractTitle, formatDocForEmbedding, + chunkDocument, chunkDocumentByTokens, clearCache, getCacheKey, @@ -74,8 +75,15 @@ import { generateEmbeddings, syncConfigToDb, type ReindexResult, + findLocalQmdDir, + initLocalQmdDir, + setCliQmdDir, + getCliQmdDir, + getEffectiveQmdDir, + setQmdDirConfigLoader, } from "../store.js"; import { 
disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { RemoteLLM, loadRemoteConfig, saveRemoteConfig, clearRemoteConfig, isRemoteConfigured, getDefaultRemoteLLM, disposeDefaultRemoteLLM, withRemoteLLMSession, loadQmdDirConfig, saveQmdDirConfig, clearQmdDirConfig } from "../llm-remote.js"; import { formatSearchResults, formatDocuments, @@ -95,6 +103,7 @@ import { listAllContexts, setConfigIndexName, loadConfig, + setConfigDirResolver, } from "../collections.js"; import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js"; @@ -102,6 +111,42 @@ import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedde // Tests must set INDEX_PATH or use createStore() with explicit path enableProductionMode(); +// Wire .qmd directory config resolution +setQmdDirConfigLoader(loadQmdDirConfig); +setConfigDirResolver(() => getEffectiveQmdDir()); + +// ============================================================================= +// LLM Backend Selection +// ============================================================================= + +let forceLocalMode = false; + +function setForceLocalMode(force: boolean): void { + forceLocalMode = force; +} + +function shouldUseRemote(): boolean { + if (forceLocalMode) return false; + return isRemoteConfigured(); +} + +async function withLLMSessionAuto<T>( + fn: (session: any) => Promise<T>, + options?: any +): Promise<T> { + if (shouldUseRemote()) { + return withRemoteLLMSession(fn, options); + } + return withLLMSession(fn, options); +} + +async function disposeAllLLM(): Promise<void> { + if (shouldUseRemote()) { + await disposeDefaultRemoteLLM(); + } + await disposeDefaultLlamaCpp(); +} + // ============================================================================= // Store/DB lifecycle (no legacy singletons in store.ts) //
============================================================================= @@ -2125,12 +2170,13 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string = checkIndexHealth(store.db); - await withLLMSession(async () => { + await withLLMSessionAuto(async () => { let results = await vectorSearchQuery(store, query, { collection: singleCollection, limit: opts.all ? 500 : (opts.limit || 10), minScore: opts.minScore || 0.3, intent: opts.intent, + llm: shouldUseRemote() ? getDefaultRemoteLLM() : undefined, hooks: { onExpand: (original, expanded) => { logExpansionTree(original, expanded); @@ -2181,7 +2227,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri // Intent can come from --intent flag or from intent: line in query document const intent = opts.intent || parsed?.intent; - await withLLMSession(async () => { + await withLLMSessionAuto(async () => { let results; if (parsed) { @@ -2234,6 +2280,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri candidateLimit: opts.candidateLimit, explain: !!opts.explain, intent, + llm: shouldUseRemote() ? 
getDefaultRemoteLLM() : undefined, hooks: { onStrongSignal: (score) => { process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`); @@ -2349,6 +2396,12 @@ function parseCLI() { http: { type: "boolean" }, daemon: { type: "boolean" }, port: { type: "string" }, + // Remote LLM options + local: { type: "boolean" }, + "embed-url": { type: "string" }, + "rerank-url": { type: "string" }, + "generate-url": { type: "string" }, + "generate-model": { type: "string" }, }, allowPositionals: true, strict: false, // Allow unknown options to pass through @@ -2543,6 +2596,11 @@ function showHelp(): void { console.log(" qmd context add/list/rm - Attach human-written summaries"); console.log(" qmd ls [collection[/path]] - Inspect indexed files"); console.log(""); + console.log("Remote LLM backend:"); + console.log(" qmd remote set - Configure remote embed/rerank/generate endpoints"); + console.log(" qmd remote status - Show remote config and check health"); + console.log(" qmd remote clear - Clear remote config (revert to local)"); + console.log(""); console.log("Maintenance:"); console.log(" qmd status - View index + collection health"); console.log(" qmd update [--pull] - Re-index collections (optionally git pull first)"); @@ -2592,6 +2650,7 @@ function showHelp(): void { console.log(""); console.log("Global options:"); console.log(" --index - Use a named index (default: index)"); + console.log(" --local - Force local LLM mode (ignore remote config)"); console.log(""); console.log("Search options:"); console.log(" -n - Max results (default 5, or 20 for --files/--json)"); @@ -2662,6 +2721,10 @@ if (isMain) { process.exit(0); } + if (cli.values.local) { + setForceLocalMode(true); + } + if (!cli.command || cli.values.help) { showHelp(); process.exit(cli.values.help ? 
0 : 1); @@ -3132,6 +3195,92 @@ if (isMain) { break; } + case "remote": { + const subcommand = cli.args[0]; + if (!subcommand) { + console.error("Usage: qmd remote "); + console.error(""); + console.error("Commands:"); + console.error(" qmd remote set "); + console.error(" qmd remote status - Show current remote configuration"); + console.error(" qmd remote clear - Clear remote config (use local mode)"); + process.exit(1); + } + + switch (subcommand) { + case "set": { + let embedUrl = cli.values["embed-url"] as string | undefined; + let rerankUrl = cli.values["rerank-url"] as string | undefined; + let generateUrl = cli.values["generate-url"] as string | undefined; + const generateModel = cli.values["generate-model"] as string | undefined; + + if (cli.args.length >= 4) { + embedUrl = cli.args[1]; + rerankUrl = cli.args[2]; + generateUrl = cli.args[3]; + } else if (cli.args.length === 2 && cli.args[1]) { + embedUrl = cli.args[1]; + rerankUrl = cli.args[1]; + generateUrl = cli.args[1]; + } + + if (!embedUrl && !rerankUrl && !generateUrl && !generateModel) { + console.error("Usage: qmd remote set "); + process.exit(1); + } + + const existingConfig = loadRemoteConfig(); + const newConfig = { + embedUrl: embedUrl ?? existingConfig.embedUrl, + rerankUrl: rerankUrl ?? existingConfig.rerankUrl, + generateUrl: generateUrl ?? existingConfig.generateUrl, + generateModel: generateModel ?? 
existingConfig.generateModel, + }; + + saveRemoteConfig(newConfig); + console.log("Remote configuration saved"); + console.log(` Embed: ${newConfig.embedUrl || "(not set)"}`); + console.log(` Rerank: ${newConfig.rerankUrl || "(not set)"}`); + console.log(` Generate: ${newConfig.generateUrl || "(not set)"}`); + if (newConfig.generateModel) { + console.log(` Generate model: ${newConfig.generateModel}`); + } + break; + } + case "status": { + const config = loadRemoteConfig(); + if (!config.embedUrl && !config.rerankUrl && !config.generateUrl) { + console.log("Remote mode: disabled (using local models)"); + } else { + console.log("Remote Configuration\n"); + console.log(` Embed: ${config.embedUrl || "(not set)"}`); + console.log(` Rerank: ${config.rerankUrl || "(not set)"}`); + console.log(` Generate: ${config.generateUrl || "(not set)"}`); + if (config.generateModel) { + console.log(` Generate model: ${config.generateModel}`); + } + console.log("\nChecking endpoint health..."); + const remote = new RemoteLLM(config); + const health = await remote.checkHealth(); + console.log(` Embed: ${health.embed ? "healthy" : "unreachable"}`); + console.log(` Rerank: ${health.rerank ? "healthy" : "unreachable"}`); + console.log(` Generate: ${health.generate ? 
"healthy" : "unreachable"}`); + } + break; + } + case "clear": { + clearRemoteConfig(); + console.log("Remote configuration cleared"); + console.log("Now using local models"); + break; + } + default: + console.error(`Unknown remote subcommand: ${subcommand}`); + process.exit(1); + } + break; + } + default: console.error(`Unknown command: ${cli.command}`); console.error("Run 'qmd --help' for usage."); @@ -3139,7 +3288,7 @@ if (isMain) { } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); + await disposeAllLLM(); process.exit(0); } diff --git a/src/collections.ts b/src/collections.ts index 257f144f..98ac6834 100644 --- a/src/collections.ts +++ b/src/collections.ts @@ -10,6 +10,20 @@ import { join, dirname } from "path"; import { homedir } from "os"; import YAML from "yaml"; +// ============================================================================ +// Config path resolution (avoids circular import with store) +// ============================================================================ + +let _qmdDirResolver: (() => string | null) | null = null; + +/** + * Set the resolver for the effective .qmd directory. + * When set, config is loaded from {qmdDir}/index.yml first. + */ +export function setConfigDirResolver(resolver: (() => string | null) | null): void { + _qmdDirResolver = resolver; +} + // ============================================================================ // Types // ============================================================================ @@ -99,28 +113,26 @@ export function setConfigIndexName(name: string): void { } } -function getConfigDir(): string { - // Allow override via QMD_CONFIG_DIR for testing +function getConfigFilePath(): string { + // 1. 
Test override (QMD_CONFIG_DIR) if (process.env.QMD_CONFIG_DIR) { - return process.env.QMD_CONFIG_DIR; + return join(process.env.QMD_CONFIG_DIR, "index.yml"); } - // Respect XDG Base Directory specification (consistent with store.ts) - if (process.env.XDG_CONFIG_HOME) { - return join(process.env.XDG_CONFIG_HOME, "qmd"); + // 2. Index-colocated config when using a .qmd directory + if (_qmdDirResolver) { + const qmdDir = _qmdDirResolver(); + if (qmdDir) { + return join(qmdDir, "index.yml"); + } } - return join(homedir(), ".config", "qmd"); + // 3. Fallback to global config + return join(homedir(), ".config", "qmd", "index.yml"); } -function getConfigFilePath(): string { - return join(getConfigDir(), `${currentIndexName}.yml`); -} - -/** - * Ensure config directory exists - */ function ensureConfigDir(): void { - const configDir = getConfigDir(); - if (!existsSync(configDir)) { + const configPath = getConfigFilePath(); + const configDir = dirname(configPath); + if (configDir && !existsSync(configDir)) { mkdirSync(configDir, { recursive: true }); } } @@ -130,10 +142,14 @@ function ensureConfigDir(): void { // ============================================================================ /** - * Load configuration from the configured source. - * - Inline config: returns the in-memory object directly - * - File-based: reads from YAML file (default ~/.config/qmd/index.yml) - * Returns empty config if file doesn't exist + * Load collection configuration. + * + * Resolution order for the config file: + * 1. QMD_CONFIG_DIR env var (test override) -> {QMD_CONFIG_DIR}/index.yml + * 2. .qmd directory via setConfigDirResolver() -> {qmdDir}/index.yml + * 3. Global fallback -> ~/.config/qmd/index.yml + * + * Returns empty config if file doesn't exist. */ export function loadConfig(): CollectionConfig { // SDK inline config mode @@ -163,9 +179,12 @@ export function loadConfig(): CollectionConfig { } /** - * Save configuration to the configured source.
- * - Inline config: updates the in-memory object (no file I/O) - * - File-based: writes to YAML file (default ~/.config/qmd/index.yml) + * Save collection configuration. + * + * Resolution order for the config file: + * 1. QMD_CONFIG_DIR env var (test override) -> {QMD_CONFIG_DIR}/index.yml + * 2. .qmd directory via setConfigDirResolver() -> {qmdDir}/index.yml + * 3. Global fallback -> ~/.config/qmd/index.yml */ export function saveConfig(config: CollectionConfig): void { // SDK inline config mode: update in place, no file I/O diff --git a/src/llm-remote.ts b/src/llm-remote.ts new file mode 100644 index 00000000..73b44a78 --- /dev/null +++ b/src/llm-remote.ts @@ -0,0 +1,759 @@ +/** + * llm-remote.ts - Remote LLM backend for QMD using HTTP endpoints + * + * Provides embeddings, text generation, and reranking via remote llama.cpp servers. + */ + +import { homedir } from "os"; +import { join } from "path"; +import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs"; + +import type { + LLM, + EmbeddingResult, + GenerateResult, + ModelInfo, + EmbedOptions, + GenerateOptions, + RerankOptions, + RerankDocument, + RerankDocumentResult, + RerankResult, + Queryable, + QueryType, + ILLMSession, + LLMSessionOptions, +} from "./llm.js"; + +// ============================================================================= +// Configuration +// ============================================================================= + +export type RemoteLLMConfig = { + embedUrl?: string; // e.g. "http://192.168.1.100:8081" + rerankUrl?: string; // e.g. "http://192.168.1.100:8082" + generateUrl?: string; // e.g. "http://192.168.1.100:8083" or "http://localhost:4000" (LiteLLM) + generateModel?: string; // e.g. 
"gpt-4o-mini" or "ollama/llama3" - required for LiteLLM, optional for llama.cpp +}; + +// Config file path +const CONFIG_DIR = join(homedir(), ".cache", "qmd"); +const CONFIG_FILE = join(CONFIG_DIR, "config.json"); + +/** + * Load remote config from file + */ +export function loadRemoteConfig(): RemoteLLMConfig { + try { + if (existsSync(CONFIG_FILE)) { + const data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + return data.remote || {}; + } + } catch (e) { + // Ignore errors, return empty config + } + return {}; +} + +/** + * Save remote config to file + */ +export function saveRemoteConfig(config: RemoteLLMConfig): void { + try { + if (!existsSync(CONFIG_DIR)) { + mkdirSync(CONFIG_DIR, { recursive: true }); + } + + let data: Record = {}; + if (existsSync(CONFIG_FILE)) { + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh if parse fails + } + } + + data.remote = config; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } catch (e) { + console.error("Failed to save remote config:", e); + } +} + +/** + * Clear remote config + */ +export function clearRemoteConfig(): void { + try { + if (existsSync(CONFIG_FILE)) { + let data: Record = {}; + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + delete data.remote; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } + } catch (e) { + console.error("Failed to clear remote config:", e); + } +} + +// ============================================================================= +// QMD Directory Config (persistent path to .qmd folder) +// ============================================================================= + +/** + * Load saved qmdDir from config + */ +export function loadQmdDirConfig(): string | null { + try { + if (existsSync(CONFIG_FILE)) { + const data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + return data.qmdDir || null; + } + } catch { + // Ignore errors + } + return null; +} + +/** + 
* Save qmdDir to config + */ +export function saveQmdDirConfig(qmdDir: string): void { + try { + if (!existsSync(CONFIG_DIR)) { + mkdirSync(CONFIG_DIR, { recursive: true }); + } + + let data: Record = {}; + if (existsSync(CONFIG_FILE)) { + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + } + + data.qmdDir = qmdDir; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } catch (e) { + console.error("Failed to save qmdDir config:", e); + } +} + +/** + * Clear qmdDir from config + */ +export function clearQmdDirConfig(): void { + try { + if (existsSync(CONFIG_FILE)) { + let data: Record = {}; + try { + data = JSON.parse(readFileSync(CONFIG_FILE, "utf-8")); + } catch { + // Start fresh + } + delete data.qmdDir; + writeFileSync(CONFIG_FILE, JSON.stringify(data, null, 2)); + } + } catch { + // Ignore errors + } +} + +/** + * Check if remote mode is configured + */ +export function isRemoteConfigured(): boolean { + const config = loadRemoteConfig(); + return !!(config.embedUrl || config.rerankUrl || config.generateUrl); +} + +// ============================================================================= +// Remote LLM Implementation +// ============================================================================= + +/** + * LLM implementation using remote HTTP endpoints (llama.cpp servers) + */ +export class RemoteLLM implements LLM { + private embedUrl: string | null; + private rerankUrl: string | null; + private generateUrl: string | null; + private generateModel: string | null; + + constructor(config: RemoteLLMConfig = {}) { + // Load from saved config, then override with explicit config + const savedConfig = loadRemoteConfig(); + this.embedUrl = config.embedUrl ?? savedConfig.embedUrl ?? null; + this.rerankUrl = config.rerankUrl ?? savedConfig.rerankUrl ?? null; + this.generateUrl = config.generateUrl ?? savedConfig.generateUrl ?? null; + this.generateModel = config.generateModel ?? 
savedConfig.generateModel ?? null; + } + + /** + * Get embeddings via remote server (retries up to 3 times on transient errors) + */ + async embed(text: string, options: EmbedOptions = {}): Promise { + if (!this.embedUrl) { + console.error("No embed URL configured"); + return null; + } + + const MAX_RETRIES = 3; + for (let attempt = 0; attempt < MAX_RETRIES; attempt++) { + try { + const response = await fetch(`${this.embedUrl}/v1/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: text, + model: "embeddinggemma", + }), + }); + + if (response.status === 400) { + // Client error — retrying won't help + console.error(`Embed request failed: ${response.status} ${response.statusText}`); + return null; + } + + if (!response.ok) { + console.error(`Embed request failed: ${response.status} ${response.statusText}`); + if (attempt < MAX_RETRIES - 1) { + await new Promise(r => setTimeout(r, 1000 * (attempt + 1))); + continue; + } + return null; + } + + const data = await response.json() as { + data: Array<{ embedding: number[] }>; + model: string; + }; + + if (!data.data || data.data.length === 0) { + console.error("No embedding data in response"); + return null; + } + + return { + embedding: data.data[0]!.embedding, + model: data.model || "remote-embed", + }; + } catch (error) { + if (attempt < MAX_RETRIES - 1) { + await new Promise(r => setTimeout(r, 1000 * (attempt + 1))); + continue; + } + console.error("Embedding error:", error); + return null; + } + } + return null; + } + + /** + * Batch embed multiple texts in a single API call. + * On batch failure, falls back to sequential individual requests (which have their own retries). 
+ */ + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + if (!this.embedUrl) { + return texts.map(() => null); + } + + try { + // Send all texts in a single request (OpenAI API supports array input) + const response = await fetch(`${this.embedUrl}/v1/embeddings`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: texts, + model: "embeddinggemma", + }), + }); + + if (!response.ok) { + console.error(`Batch embed failed: ${response.status} ${response.statusText}`); + // Fall back to sequential individual requests (each has retries) + const results: (EmbeddingResult | null)[] = []; + for (const text of texts) { + results.push(await this.embed(text)); + } + return results; + } + + const data = await response.json() as { + data: Array<{ embedding: number[]; index: number }>; + model: string; + }; + + if (!data.data || data.data.length === 0) { + console.error("No embedding data in batch response"); + return texts.map(() => null); + } + + // Map results back to original order (API may return in different order) + const results: (EmbeddingResult | null)[] = new Array(texts.length).fill(null); + for (const item of data.data) { + if (item.index < texts.length) { + results[item.index] = { + embedding: item.embedding, + model: data.model || "remote-embed", + }; + } + } + return results; + } catch (error) { + console.error("Batch embedding error:", error); + // Fall back to sequential individual requests (each has retries) + const results: (EmbeddingResult | null)[] = []; + for (const text of texts) { + results.push(await this.embed(text)); + } + return results; + } + } + + /** + * Generate text via remote server + */ + async generate(prompt: string, options: GenerateOptions = {}): Promise { + if (!this.generateUrl) { + console.error("No generate URL configured"); + return null; + } + + try { + const body: Record = { + prompt, + max_tokens: options.maxTokens ?? 
150, + temperature: options.temperature ?? 0, + }; + if (this.generateModel) { + body.model = this.generateModel; + } + const response = await fetch(`${this.generateUrl}/v1/completions`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + + if (!response.ok) { + console.error(`Generate request failed: ${response.status} ${response.statusText}`); + return null; + } + + const data = await response.json() as { + choices: Array<{ text: string }>; + model: string; + }; + + if (!data.choices || data.choices.length === 0) { + console.error("No choices in response"); + return null; + } + + return { + text: data.choices[0]!.text, + model: data.model || "remote-generate", + done: true, + }; + } catch (error) { + console.error("Generate error:", error); + return null; + } + } + + /** + * Check if model exists (always returns true for remote) + */ + async modelExists(model: string): Promise { + return { name: model, exists: true }; + } + + /** + * Expand a search query into multiple variations + */ + async expandQuery( + query: string, + options: { context?: string; includeLexical?: boolean } = {} + ): Promise { + if (!this.generateUrl) { + // Fallback to original query + const fallback: Queryable[] = [{ type: 'vec', text: query }]; + if (options.includeLexical !== false) { + fallback.unshift({ type: 'lex', text: query }); + } + return fallback; + } + + const includeLexical = options.includeLexical ?? true; + const context = options.context; + + const prompt = `You are a search query optimization expert. Your task is to improve retrieval by rewriting queries and generating hypothetical documents. + +Original Query: ${query} + +${context ? `Additional Context, ONLY USE IF RELEVANT:\n\n${context}` : ""} + +## Step 1: Query Analysis +Identify entities, search intent, and missing context. + +## Step 2: Generate Hypothetical Document +Write a focused sentence passage that would answer the query. 
Include specific terminology and domain vocabulary. + +## Step 3: Query Rewrites +Generate 2-3 alternative search queries that resolve ambiguities. Use terminology from the hypothetical document. + +## Step 4: Final Retrieval Text +Output MAX ONE 'hyde' line FIRST, then 1-3 'lex' lines, then 1-3 'vec' lines. + + +hyde: {complete hypothetical document passage from Step 2 on a SINGLE LINE} +lex: {single search term} +vec: {single vector query} + + + +Example (FOR FORMAT ONLY - DO NOT COPY THIS CONTENT): +hyde: This is an example of a hypothetical document passage that would answer the example query. It contains multiple sentences and relevant vocabulary. +lex: example keyword 1 +lex: example keyword 2 +vec: example semantic query + + + +- DO NOT repeat the same line. +- Each 'lex:' line MUST be a different keyword variation based on the ORIGINAL QUERY. +- Each 'vec:' line MUST be a different semantic variation based on the ORIGINAL QUERY. +- The 'hyde:' line MUST be the full sentence passage from Step 2, but all on one line. +- DO NOT use the example content above. +${!includeLexical ? 
"- Do NOT output any 'lex:' lines" : ""} + + +Final Output:`; + + try { + const result = await this.generate(prompt, { maxTokens: 1000, temperature: 1 }); + if (!result) { + throw new Error("Generation failed"); + } + + const lines = result.text.trim().split("\n"); + const queryables: Queryable[] = lines.map((line: string) => { + const colonIdx = line.indexOf(":"); + if (colonIdx === -1) return null; + const type = line.slice(0, colonIdx).trim(); + if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null; + const text = line.slice(colonIdx + 1).trim(); + return { type: type as QueryType, text }; + }).filter((q: Queryable | null): q is Queryable => q !== null); + + // Filter out lex entries if not requested + if (!includeLexical) { + return queryables.filter(q => q.type !== 'lex'); + } + return queryables; + } catch (error) { + console.error("Query expansion failed:", error); + // Fallback to original query + const fallback: Queryable[] = [{ type: 'vec', text: query }]; + if (includeLexical) fallback.unshift({ type: 'lex', text: query }); + return fallback; + } + } + + /** + * Rerank documents by relevance to a query + */ + async rerank( + query: string, + documents: RerankDocument[], + options: RerankOptions = {} + ): Promise { + if (!this.rerankUrl) { + // Return documents in original order with default scores + return { + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), // Decreasing scores + index, + })), + model: "no-rerank", + }; + } + + // If we have more than 10 documents, batch them to avoid overwhelming the server + const BATCH_SIZE = 10; + if (documents.length > BATCH_SIZE) { + try { + const allResults: RerankDocumentResult[] = []; + let modelName = "remote-rerank"; + + // Process in batches + for (let i = 0; i < documents.length; i += BATCH_SIZE) { + const batch = documents.slice(i, i + BATCH_SIZE); + const batchResult = await this.rerankBatch(query, batch, i); + allResults.push(...batchResult.results); 
+ modelName = batchResult.model; + } + + // Sort all results by score descending + allResults.sort((a, b) => b.score - a.score); + + return { + results: allResults, + model: modelName, + }; + } catch (error) { + console.error("Batch rerank error:", error); + // Fallback + return { + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), + index, + })), + model: "rerank-fallback", + }; + } + } + + // Single batch - use existing logic + return this.rerankBatch(query, documents, 0); + } + + /** + * Rerank a single batch of documents + */ + private async rerankBatch( + query: string, + documents: RerankDocument[], + indexOffset: number + ): Promise { + try { + const texts = documents.map(doc => doc.text); + + const response = await fetch(`${this.rerankUrl}/v1/rerank`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query, + documents: texts, + model: "qwen3-reranker", + }), + }); + + if (!response.ok) { + console.error(`Rerank request failed: ${response.status} ${response.statusText}`); + // Try to get error details from response body + try { + const errorText = await response.text(); + console.error(`Rerank error details: ${errorText}`); + } catch (e) { + // Ignore if we can't read the error body + } + throw new Error("Rerank request failed"); + } + + const data = await response.json() as { + results: Array<{ index: number; relevance_score: number }>; + model: string; + }; + + // Map results back to our format (with adjusted indices) + const results: RerankDocumentResult[] = data.results.map(item => ({ + file: documents[item.index]!.file, + score: item.relevance_score, + index: indexOffset + item.index, + })); + + // Sort by score descending + results.sort((a, b) => b.score - a.score); + + return { + results, + model: data.model || "remote-rerank", + }; + } catch (error) { + console.error("Rerank batch error:", error); + // Return documents in original order with default scores + return 
{ + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), + index: indexOffset + index, + })), + model: "rerank-fallback", + }; + } + } + + /** + * Dispose (no-op for remote) + */ + async dispose(): Promise { + // Nothing to dispose for remote connections + } + + /** + * Check health of remote endpoints + */ + async checkHealth(): Promise<{ embed: boolean; rerank: boolean; generate: boolean }> { + const results = { embed: false, rerank: false, generate: false }; + + const checkEndpoint = async (url: string | null): Promise => { + if (!url) return false; + try { + const response = await fetch(`${url}/health`, { method: "GET" }); + return response.ok; + } catch { + return false; + } + }; + + [results.embed, results.rerank, results.generate] = await Promise.all([ + checkEndpoint(this.embedUrl), + checkEndpoint(this.rerankUrl), + checkEndpoint(this.generateUrl), + ]); + + return results; + } + + /** + * Get configured URLs + */ + getConfig(): RemoteLLMConfig { + return { + embedUrl: this.embedUrl || undefined, + rerankUrl: this.rerankUrl || undefined, + generateUrl: this.generateUrl || undefined, + generateModel: this.generateModel || undefined, + }; + } +} + +// ============================================================================= +// Remote LLM Session (implements ILLMSession for compatibility) +// ============================================================================= + +/** + * Session wrapper for RemoteLLM that implements ILLMSession interface. + * This allows RemoteLLM to be used with the existing withLLMSession pattern. 
+ */ +class RemoteLLMSession implements ILLMSession { + private llm: RemoteLLM; + private released = false; + private abortController: AbortController; + + constructor(llm: RemoteLLM, _options: LLMSessionOptions = {}) { + this.llm = llm; + this.abortController = new AbortController(); + } + + get isValid(): boolean { + return !this.released && !this.abortController.signal.aborted; + } + + get signal(): AbortSignal { + return this.abortController.signal; + } + + release(): void { + this.released = true; + this.abortController.abort(); + } + + async embed(text: string, options?: EmbedOptions): Promise { + if (!this.isValid) return null; + return this.llm.embed(text, options); + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (!this.isValid) return texts.map(() => null); + return this.llm.embedBatch(texts); + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean } + ): Promise { + if (!this.isValid) return [{ type: 'vec', text: query }]; + return this.llm.expandQuery(query, options); + } + + async rerank( + query: string, + documents: RerankDocument[], + options?: RerankOptions + ): Promise { + if (!this.isValid) { + return { + results: documents.map((doc, index) => ({ + file: doc.file, + score: 1 - (index * 0.1), + index, + })), + model: "session-invalid", + }; + } + return this.llm.rerank(query, documents, options); + } +} + +/** + * Execute a function with a scoped RemoteLLM session. + * Compatible with the existing withLLMSession pattern. 
+ */ +export async function withRemoteLLMSession( + fn: (session: ILLMSession) => Promise, + options?: LLMSessionOptions +): Promise { + const llm = getDefaultRemoteLLM(); + const session = new RemoteLLMSession(llm, options); + + try { + return await fn(session); + } finally { + session.release(); + } +} + +// ============================================================================= +// Singleton for default RemoteLLM instance +// ============================================================================= + +let defaultRemoteLLM: RemoteLLM | null = null; + +/** + * Get the default RemoteLLM instance (creates one if needed) + */ +export function getDefaultRemoteLLM(): RemoteLLM { + if (!defaultRemoteLLM) { + defaultRemoteLLM = new RemoteLLM(); + } + return defaultRemoteLLM; +} + +/** + * Set a custom default RemoteLLM instance + */ +export function setDefaultRemoteLLM(llm: RemoteLLM | null): void { + defaultRemoteLLM = llm; +} + +/** + * Dispose the default RemoteLLM instance if it exists + */ +export async function disposeDefaultRemoteLLM(): Promise { + if (defaultRemoteLLM) { + await defaultRemoteLLM.dispose(); + defaultRemoteLLM = null; + } +} diff --git a/src/llm.ts b/src/llm.ts index 39ab28b5..11a22c26 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -317,6 +317,11 @@ export interface LLM { */ embed(text: string, options?: EmbedOptions): Promise; + /** + * Get embeddings for multiple texts in a single batch call + */ + embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; + /** * Generate text completion */ diff --git a/src/store.ts b/src/store.ts index aa5fae4f..a1c17cf5 100644 --- a/src/store.ts +++ b/src/store.ts @@ -25,6 +25,7 @@ import { formatDocForEmbedding, withLLMSessionForLlm, type LLMSessionOptions, + type LLM, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -427,6 +428,75 @@ export function enableProductionMode(): void { _productionMode = true; } +// 
============================================================================= +// .qmd directory resolution +// ============================================================================= + +/** + * Find a local .qmd directory by searching current directory and parents. + */ +export function findLocalQmdDir(startDir?: string): string | null { + let dir = startDir || getPwd(); + const root = resolve("/"); + + while (dir !== root) { + const qmdDir = resolve(dir, ".qmd"); + try { + const stat = statSync(qmdDir); + if (stat.isDirectory()) return qmdDir; + } catch { + // Directory doesn't exist, continue searching + } + const parent = resolve(dir, ".."); + if (parent === dir) break; + dir = parent; + } + return null; +} + +/** + * Initialize a local .qmd directory in the specified path. + */ +export function initLocalQmdDir(targetDir?: string): string { + const dir = targetDir || getPwd(); + const qmdDir = resolve(dir, ".qmd"); + mkdirSync(qmdDir, { recursive: true }); + return qmdDir; +} + +// CLI-provided qmdDir (highest priority) +let _cliQmdDir: string | null = null; + +export function setCliQmdDir(qmdDir: string | null): void { + _cliQmdDir = qmdDir; +} + +export function getCliQmdDir(): string | null { + return _cliQmdDir; +} + +// Config loader for saved qmdDir (avoids circular imports) +let _loadQmdDirConfig: (() => string | null) | null = null; + +export function setQmdDirConfigLoader(loader: () => string | null): void { + _loadQmdDirConfig = loader; +} + +/** + * Get the effective qmdDir with priority: + * 1. CLI flag (--qmd-dir) + * 2. Saved config (~/.cache/qmd/config.json) + * 3. 
Auto-discover (search upward for .qmd) + */ +export function getEffectiveQmdDir(): string | null { + if (_cliQmdDir) return _cliQmdDir; + if (_loadQmdDirConfig) { + const saved = _loadQmdDirConfig(); + if (saved) return saved; + } + return findLocalQmdDir(); +} + export function getDefaultDbPath(indexName: string = "index"): string { // Always allow override via INDEX_PATH (for testing) if (process.env.INDEX_PATH) { @@ -441,6 +511,13 @@ export function getDefaultDbPath(indexName: string = "index"): string { ); } + // Check for .qmd directory (CLI > config > auto-discover) + const qmdDir = getEffectiveQmdDir(); + if (qmdDir) { + return resolve(qmdDir, `${indexName}.sqlite`); + } + + // Fall back to global cache const cacheDir = process.env.XDG_CACHE_HOME || resolve(homedir(), ".cache"); const qmdCacheDir = resolve(cacheDir, "qmd"); try { mkdirSync(qmdCacheDir, { recursive: true }); } catch { } @@ -1012,8 +1089,8 @@ export type Store = { searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise; // Query expansion & reranking - expandQuery: (query: string, model?: string, intent?: string) => Promise; - rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => Promise<{ file: string; score: number }[]>; + expandQuery: (query: string, model?: string, intent?: string, llm?: LLM) => Promise; + rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string, llm?: LLM) => Promise<{ file: string; score: number }[]>; // Document retrieval findDocument: (filename: string, options?: { includeBody?: boolean }) => DocumentResult | DocumentNotFound; @@ -1364,8 +1441,8 @@ export function createStore(dbPath?: string): Store { searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, 
collectionName, session, precomputedEmbedding), // Query expansion & reranking - expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm), - rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string) => rerank(query, documents, model, db, intent, store.llm), + expandQuery: (query: string, model?: string, intent?: string, llm?: LLM) => expandQuery(query, model, db, intent, llm ?? store.llm), + rerank: (query: string, documents: { file: string; text: string }[], model?: string, intent?: string, llm?: LLM) => rerank(query, documents, model, db, intent, llm ?? store.llm), // Document retrieval findDocument: (filename: string, options?: { includeBody?: boolean }) => findDocument(db, filename, options), @@ -2820,7 +2897,7 @@ export function insertEmbedding( // Query expansion // ============================================================================= -export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise { +export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise { // Check cache first — stored as JSON preserving types const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) }); const cached = getCachedResult(db, cacheKey); @@ -2859,7 +2936,7 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M // Reranking // ============================================================================= -export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{ file: string; score: number }[]> { +export async function rerank(query: string, documents: { file: string; text: string }[], model: string = 
DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise<{ file: string; score: number }[]> { // Prepend intent to rerank query so the reranker scores with domain context const rerankQuery = intent ? `${intent}\n\n${query}` : query; @@ -3530,6 +3607,7 @@ export interface HybridQueryOptions { intent?: string; // domain intent hint for disambiguation skipRerank?: boolean; // skip LLM reranking, use only RRF scores hooks?: SearchHooks; + llm?: LLM; // override LLM backend (default: local node-llama-cpp) } export interface HybridQueryResult { @@ -3604,7 +3682,7 @@ export async function hybridQuery( const expandStart = Date.now(); const expanded = hasStrongSignal ? [] - : await store.expandQuery(query, undefined, intent); + : await store.expandQuery(query, undefined, intent, options?.llm); hooks?.onExpand?.(query, expanded, Date.now() - expandStart); @@ -3651,21 +3729,26 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getLlm(store); + const llm = options?.llm ?? 
getLlm(store); const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); const embeddings = await llm.embedBatch(textsToEmbed); hooks?.onEmbedDone?.(Date.now() - embedStart); - // Run sqlite-vec lookups with pre-computed embeddings - for (let i = 0; i < vecQueries.length; i++) { - const embedding = embeddings[i]?.embedding; - if (!embedding) continue; + // Average all embeddings into one vector for a single scan + const validEmbeddings = embeddings.filter(e => e !== null).map(e => e!.embedding); + if (validEmbeddings.length > 0) { + const dim = validEmbeddings[0]!.length; + const avgEmbedding = new Array(dim).fill(0); + for (const emb of validEmbeddings) { + for (let j = 0; j < dim; j++) avgEmbedding[j] += emb[j]!; + } + for (let j = 0; j < dim; j++) avgEmbedding[j] /= validEmbeddings.length; const vecResults = await store.searchVec( - vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection, - undefined, embedding + query, DEFAULT_EMBED_MODEL, 20, collection, + undefined, avgEmbedding ); if (vecResults.length > 0) { for (const r of vecResults) docidMap.set(r.filepath, r.docid); @@ -3777,7 +3860,7 @@ export async function hybridQuery( hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart = Date.now(); - const reranked = await store.rerank(query, chunksToRerank, undefined, intent); + const reranked = await store.rerank(query, chunksToRerank, undefined, intent, options?.llm); hooks?.onRerankDone?.(Date.now() - rerankStart); // Step 7: Blend RRF position score with reranker score @@ -3850,6 +3933,7 @@ export interface VectorSearchOptions { minScore?: number; // default 0.3 intent?: string; // domain intent hint for disambiguation hooks?: Pick; + llm?: LLM; } export interface VectorSearchResult { @@ -3888,29 +3972,39 @@ export async function vectorSearchQuery( // Expand query — filter to vec/hyde only (lex queries target FTS, not vector) const expandStart = 
Date.now(); - const allExpanded = await store.expandQuery(query, undefined, intent); + const allExpanded = await store.expandQuery(query, undefined, intent, options?.llm); const vecExpanded = allExpanded.filter(q => q.type !== 'lex'); options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart); - // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs + // Batch embed all query texts, then average into single embedding for one scan const queryTexts = [query, ...vecExpanded.map(q => q.query)]; + const llm = options?.llm ?? getLlm(store); + const textsToEmbed = queryTexts.map(q => formatQueryForEmbedding(q)); + const embeddings = await llm.embedBatch(textsToEmbed); + + // Average all embeddings into one vector + const validEmbeddings = embeddings.filter(e => e !== null).map(e => e!.embedding); + if (validEmbeddings.length === 0) return []; + const dim = validEmbeddings[0]!.length; + const avgEmbedding = new Array(dim).fill(0); + for (const emb of validEmbeddings) { + for (let i = 0; i < dim; i++) avgEmbedding[i] += emb[i]!; + } + for (let i = 0; i < dim; i++) avgEmbedding[i] /= validEmbeddings.length; + + // Single scan with averaged embedding + const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, limit, collection, undefined, avgEmbedding); const allResults = new Map(); - for (const q of queryTexts) { - const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection); - for (const r of vecResults) { - const existing = allResults.get(r.filepath); - if (!existing || r.score > existing.score) { - allResults.set(r.filepath, { - file: r.filepath, - displayPath: r.displayPath, - title: r.title, - body: r.body || "", - score: r.score, - context: store.getContextForFile(r.filepath), - docid: r.docid, - }); - } - } + for (const r of vecResults) { + allResults.set(r.filepath, { + file: r.filepath, + displayPath: r.displayPath, + title: r.title, + body: r.body || "", + score: r.score, + 
context: store.getContextForFile(r.filepath), + docid: r.docid, + }); } return Array.from(allResults.values()) @@ -3938,6 +4032,7 @@ export interface StructuredSearchOptions { /** Skip LLM reranking, use only RRF scores */ skipRerank?: boolean; hooks?: SearchHooks; + llm?: LLM; } /** @@ -4032,7 +4127,7 @@ export async function structuredSearch( s.type === 'vec' || s.type === 'hyde' ); if (vecSearches.length > 0) { - const llm = getLlm(store); + const llm = options?.llm ?? getLlm(store); const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); @@ -4167,7 +4262,7 @@ export async function structuredSearch( hooks?.onRerankStart?.(chunksToRerank.length); const rerankStart2 = Date.now(); - const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent); + const reranked = await store.rerank(primaryQuery, chunksToRerank, undefined, intent, options?.llm); hooks?.onRerankDone?.(Date.now() - rerankStart2); // Step 6: Blend RRF position score with reranker score