diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 7ee82cc0..ac26bb7e 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -74,7 +74,21 @@ import { syncConfigToDb, type ReindexResult, } from "../store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "../llm.js"; +import { + disposeDefaultLlamaCpp, + getDefaultLlamaCpp, + withLLMSession, + pullModels, + DEFAULT_EMBED_MODEL_URI, + DEFAULT_GENERATE_MODEL_URI, + DEFAULT_RERANK_MODEL_URI, + DEFAULT_MODEL_CACHE_DIR, + setEmbedProvider, + resolveEmbedProvider, + getActiveEmbedModel, + getConfiguredEmbedDimensions, + getGoogleApiKey, +} from "../llm.js"; import { formatSearchResults, formatDocuments, @@ -417,10 +431,20 @@ async function showStatus(): Promise { const match = uri.match(/^hf:([^/]+\/[^/]+)\//); return match ? `https://huggingface.co/${match[1]}` : uri; }; + const provider = await resolveEmbedProvider(); + const configuredDims = getConfiguredEmbedDimensions(); + const embeddingModel = provider === "google" + ? "Google Gemini Embedding 2" + : hfLink(DEFAULT_EMBED_MODEL_URI); console.log(`\n${c.bold}Models${c.reset}`); - console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); + console.log(` Provider: ${provider}${provider === "google" && configuredDims ? ` (${configuredDims}d)` : ""}`); + console.log(` Embedding: ${embeddingModel}`); console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + if (provider === "google") { + const keyStatus = getGoogleApiKey() ? 
"set" : "missing"; + console.log(` GEMINI_API_KEY: ${keyStatus}`); + } } // Device / GPU info @@ -1546,7 +1570,7 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll // Content changed - insert new content hash and update document insertContent(db, hash, content, now); const stat = statSync(filepath); - updateDocument(db, existing.id, title, hash, + updateDocument(db, existing.id, title, hash, "text", stat ? new Date(stat.mtime).toISOString() : now); updated++; } @@ -1555,7 +1579,7 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll indexed++; insertContent(db, hash, content, now); const stat = statSync(filepath); - insertDocument(db, collectionName, path, title, hash, + insertDocument(db, collectionName, path, title, hash, "text", stat ? new Date(stat.birthtime).toISOString() : now, stat ? new Date(stat.mtime).toISOString() : now); } @@ -1605,9 +1629,11 @@ function renderProgressBar(percent: number, width: number = 30): string { return bar; } -async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise { +async function vectorIndex(model?: string, force: boolean = false): Promise { const storeInstance = getStore(); const db = storeInstance.db; + const provider = await resolveEmbedProvider(); + const activeModel = model ?? await getActiveEmbedModel(); if (force) { console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`); @@ -1621,7 +1647,10 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = return; } - console.log(`${c.dim}Model: ${model}${c.reset}\n`); + const dimensions = getConfiguredEmbedDimensions(); + const dimInfo = provider === "google" && dimensions ? 
` (${dimensions}d)` : ""; + console.log(`${c.dim}Provider: ${provider}${dimInfo}${c.reset}`); + console.log(`${c.dim}Model: ${activeModel}${c.reset}\n`); cursor.hide(); progress.indeterminate(); @@ -1629,7 +1658,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = const result = await generateEmbeddings(storeInstance, { force, - model, + model: activeModel, onProgress: (info) => { if (info.totalBytes === 0) return; const percent = (info.bytesProcessed / info.totalBytes) * 100; @@ -2330,6 +2359,8 @@ function parseCLI() { mask: { type: "string" }, // glob pattern // Embed options force: { type: "boolean", short: "f" }, + provider: { type: "string" }, + dimensions: { type: "string" }, // Update options pull: { type: "boolean" }, // git pull before update refresh: { type: "boolean" }, @@ -2357,6 +2388,23 @@ function parseCLI() { setConfigIndexName(indexName); } + const providerValue = typeof values.provider === "string" ? values.provider.toLowerCase() : undefined; + if (providerValue === "google" || providerValue === "local") { + setEmbedProvider(providerValue); + } else if (providerValue != null) { + console.error(`Invalid --provider value '${providerValue}'. Use 'local' or 'google'.`); + process.exit(1); + } + + if (typeof values.dimensions === "string") { + const dimRaw = values.dimensions.trim(); + if (!["768", "1536", "3072"].includes(dimRaw)) { + console.error(`Invalid --dimensions value '${values.dimensions}'. 
Use 768, 1536, or 3072.`); + process.exit(1); + } + process.env.QMD_EMBED_DIMENSIONS = dimRaw; + } + // Determine output format let format: OutputFormat = "cli"; if (values.csv) format = "csv"; @@ -2433,7 +2481,7 @@ function showHelp(): void { console.log("Maintenance:"); console.log(" qmd status - View index + collection health"); console.log(" qmd update [--pull] - Re-index collections (optionally git pull first)"); - console.log(" qmd embed [-f] - Generate/refresh vector embeddings"); + console.log(" qmd embed [-f] [--provider local|google] [--dimensions 768|1536|3072] - Generate/refresh vector embeddings"); console.log(" qmd cleanup - Clear caches, vacuum DB"); console.log(""); console.log("Query syntax (qmd query):"); @@ -2494,6 +2542,11 @@ function showHelp(): void { console.log(" --max-bytes - Skip files larger than N bytes (default 10240)"); console.log(" --json/--csv/--md/--xml/--files - Same formats as search"); console.log(""); + console.log("Embed options:"); + console.log(" -f, --force - Force full re-embedding"); + console.log(" --provider - Embedding backend (or QMD_EMBED_PROVIDER)"); + console.log(" --dimensions - Gemini output dimensions (768/1536/3072)"); + console.log(""); console.log(`Index: ${getDbPath()}`); } @@ -2794,7 +2847,7 @@ if (isMain) { break; case "embed": - await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force); + await vectorIndex(undefined, !!cli.values.force); break; case "pull": { diff --git a/src/collections.ts b/src/collections.ts index 257f144f..0aada98e 100644 --- a/src/collections.ts +++ b/src/collections.ts @@ -48,6 +48,8 @@ export interface NamedCollection extends Collection { name: string; } +export const DEFAULT_COLLECTION_PATTERN = "**/*.md"; + // ============================================================================ // Configuration paths // ============================================================================ @@ -269,7 +271,7 @@ export function updateCollectionSettings( export function addCollection( 
name: string, path: string, - pattern: string = "**/*.md" + pattern: string = DEFAULT_COLLECTION_PATTERN ): void { const config = loadConfig(); diff --git a/src/google-embed.ts b/src/google-embed.ts new file mode 100644 index 00000000..5ba9c5b6 --- /dev/null +++ b/src/google-embed.ts @@ -0,0 +1,233 @@ +import { readFileSync } from "node:fs"; +import { extname } from "node:path"; + +export const GOOGLE_EMBED_MODEL = "gemini-embedding-2-preview"; +export const GOOGLE_EMBED_MODEL_PATH = `models/${GOOGLE_EMBED_MODEL}`; +export const GOOGLE_EMBED_DEFAULT_DIMENSIONS = 3072; +export const GOOGLE_EMBED_BATCH_LIMIT = 100; +const GOOGLE_EMBED_MAX_RETRIES = 3; + +export type GeminiTaskType = + | "RETRIEVAL_DOCUMENT" + | "RETRIEVAL_QUERY" + | "SEMANTIC_SIMILARITY" + | "CLASSIFICATION" + | "CLUSTERING"; + +export type GeminiInlinePart = { + inlineData: { + mimeType: string; + data: string; + }; +}; + +export type GeminiTextPart = { text: string }; +export type GeminiPart = GeminiTextPart | GeminiInlinePart; + +export type EmbedInput = string | { + text?: string; + filePath?: string; + parts?: GeminiPart[]; +}; + +type EmbedRequest = { + model: string; + content: { parts: GeminiPart[] }; + taskType: GeminiTaskType; + outputDimensionality: number; +}; + +export type GeminiEmbedOptions = { + taskType?: GeminiTaskType; + outputDimensionality?: number; +}; + +function sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +function parseRetryAfterMs(retryAfterHeader: string | null): number | null { + if (!retryAfterHeader) return null; + + const seconds = Number.parseInt(retryAfterHeader, 10); + if (Number.isFinite(seconds) && seconds >= 0) { + return seconds * 1000; + } + + const retryAt = Date.parse(retryAfterHeader); + if (Number.isFinite(retryAt)) { + return Math.max(0, retryAt - Date.now()); + } + + return null; +} + +export function parseGeminiDimensionsFromEnv(): number { + const raw = process.env.QMD_EMBED_DIMENSIONS?.trim(); + if (!raw) 
return GOOGLE_EMBED_DEFAULT_DIMENSIONS; + const parsed = Number.parseInt(raw, 10); + if (parsed === 3072 || parsed === 1536 || parsed === 768) return parsed; + return GOOGLE_EMBED_DEFAULT_DIMENSIONS; +} + +export function getMimeTypeForPath(path: string): string | null { + const ext = extname(path).toLowerCase(); + if (ext === ".png") return "image/png"; + if (ext === ".jpg" || ext === ".jpeg") return "image/jpeg"; + if (ext === ".pdf") return "application/pdf"; + return null; +} + +export function isSupportedMultimodalPath(path: string): boolean { + return getMimeTypeForPath(path) !== null; +} + +function fileToInlinePart(path: string): GeminiInlinePart { + const mimeType = getMimeTypeForPath(path); + if (!mimeType) throw new Error(`Unsupported file type for Gemini embedding: ${path}`); + const data = readFileSync(path).toString("base64"); + return { inlineData: { mimeType, data } }; +} + +function normalizeInput(input: EmbedInput): GeminiPart[] { + if (typeof input === "string") { + return [{ text: input }]; + } + if (Array.isArray(input.parts) && input.parts.length > 0) { + return input.parts; + } + const parts: GeminiPart[] = []; + if (typeof input.text === "string" && input.text.trim().length > 0) { + parts.push({ text: input.text }); + } + if (input.filePath) { + parts.push(fileToInlinePart(input.filePath)); + } + if (parts.length === 0) { + throw new Error("Gemini embedding request needs at least one input part"); + } + return parts; +} + +export class GoogleAIEmbedder { + private readonly apiKey: string; + private readonly dimensions: number; + + constructor(apiKey: string, dimensionsOrOptions?: number | { dimensions?: number }) { + this.apiKey = apiKey; + if (typeof dimensionsOrOptions === "number") { + this.dimensions = dimensionsOrOptions; + } else { + this.dimensions = dimensionsOrOptions?.dimensions ?? 
parseGeminiDimensionsFromEnv(); + } + } + + private buildUrl(endpoint: "embedContent" | "batchEmbedContents"): string { + return `https://generativelanguage.googleapis.com/v1beta/models/${GOOGLE_EMBED_MODEL}:${endpoint}?key=${encodeURIComponent(this.apiKey)}`; + } + + private normalizeEmbedding(values: number[] | undefined, outputDimensionality: number): number[] | null { + if (!Array.isArray(values) || values.length === 0) return null; + if (values.length === outputDimensionality) return values; + if (values.length > outputDimensionality) { + // Matryoshka truncation keeps the leading dimensions. + return values.slice(0, outputDimensionality); + } + return null; + } + + private async postWithRetries( + endpoint: "embedContent" | "batchEmbedContents", + body: object + ): Promise { + let attempt = 0; + let delayMs = 500; + + while (true) { + try { + const res = await fetch(this.buildUrl(endpoint), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(body), + }); + + if (res.ok) return res; + const retryable = res.status === 429 || res.status >= 500; + if (!retryable || attempt >= GOOGLE_EMBED_MAX_RETRIES) return res; + + const retryAfterMs = parseRetryAfterMs(res.headers.get("retry-after")); + await sleep(retryAfterMs ?? delayMs); + } catch (error) { + if (attempt >= GOOGLE_EMBED_MAX_RETRIES) { + throw error; + } + await sleep(delayMs); + } + + delayMs *= 2; + attempt++; + } + } + + async embed(input: EmbedInput, options: GeminiEmbedOptions = {}): Promise<{ embedding: number[] } | null> { + const outputDimensionality = options.outputDimensionality ?? this.dimensions; + const taskType = options.taskType ?? 
"RETRIEVAL_DOCUMENT"; + const res = await this.postWithRetries("embedContent", { + model: GOOGLE_EMBED_MODEL_PATH, + content: { parts: normalizeInput(input) }, + taskType, + outputDimensionality, + }); + + if (!res.ok) return null; + const data = await res.json() as { embedding?: { values?: number[] } }; + const embedding = this.normalizeEmbedding(data.embedding?.values, outputDimensionality); + if (!embedding) return null; + return { embedding }; + } + + async embedBatch(inputs: EmbedInput[], options: GeminiEmbedOptions = {}): Promise<({ embedding: number[] } | null)[]> { + if (inputs.length === 0) return []; + const output: ({ embedding: number[] } | null)[] = Array(inputs.length).fill(null); + const outputDimensionality = options.outputDimensionality ?? this.dimensions; + const taskType = options.taskType ?? "RETRIEVAL_DOCUMENT"; + + for (let start = 0; start < inputs.length; start += GOOGLE_EMBED_BATCH_LIMIT) { + const chunk = inputs.slice(start, start + GOOGLE_EMBED_BATCH_LIMIT); + const requests: EmbedRequest[] = []; + const requestToChunkIndex: number[] = []; + for (let i = 0; i < chunk.length; i++) { + try { + requests.push({ + model: GOOGLE_EMBED_MODEL_PATH, + content: { parts: normalizeInput(chunk[i]!) }, + taskType, + outputDimensionality, + }); + requestToChunkIndex.push(i); + } catch { + output[start + i] = null; + } + } + + if (requests.length === 0) { + continue; + } + + const res = await this.postWithRetries("batchEmbedContents", { requests }); + if (!res.ok) continue; + + const data = await res.json() as { embeddings?: Array<{ values?: number[] }> }; + const embeddings = data.embeddings ?? []; + for (let i = 0; i < requests.length; i++) { + const values = embeddings[i]?.values; + const embedding = this.normalizeEmbedding(values, requests[i]!.outputDimensionality); + if (embedding) { + output[start + requestToChunkIndex[i]!] 
= { embedding }; + } + } + } + + return output; + } +} diff --git a/src/llm.ts b/src/llm.ts index 485f45a8..aa6cfaaf 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -17,6 +17,13 @@ import { import { homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs"; +import { + GoogleAIEmbedder, + GOOGLE_EMBED_MODEL, + type EmbedInput, + type GeminiTaskType, + parseGeminiDimensionsFromEnv, +} from "./google-embed.js"; // ============================================================================= // Embedding Formatting Functions @@ -120,6 +127,14 @@ export type EmbedOptions = { model?: string; isQuery?: boolean; title?: string; + taskType?: GeminiTaskType; + outputDimensionality?: number; +}; + +export type EmbedBatchOptions = { + isQuery?: boolean; + taskType?: GeminiTaskType; + outputDimensionality?: number; }; /** @@ -154,9 +169,9 @@ export type LLMSessionOptions = { * Session interface for scoped LLM access with lifecycle guarantees */ export interface ILLMSession { - embed(text: string, options?: EmbedOptions): Promise; - embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; - expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise; + embed(input: EmbedInput | string, options?: EmbedOptions): Promise; + embedBatch(inputs: (EmbedInput | string)[], options?: EmbedBatchOptions): Promise<(EmbeddingResult | null)[]>; + expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise; rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; /** Whether this session is still valid (not released or aborted) */ readonly isValid: boolean; @@ -315,7 +330,12 @@ export interface LLM { /** * Get embeddings for text */ - embed(text: string, options?: EmbedOptions): Promise; + embed(input: EmbedInput | string, options?: EmbedOptions): Promise; + + /** + * Batch embed 
multiple inputs + */ + embedBatch(inputs: (EmbedInput | string)[], options?: EmbedBatchOptions): Promise<(EmbeddingResult | null)[]>; /** * Generate text completion @@ -331,7 +351,7 @@ export interface LLM { * Expand a search query into multiple variations for different backends. * Returns a list of Queryable objects. */ - expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise; + expandQuery(query: string, options?: { context?: string, includeLexical?: boolean, intent?: string }): Promise; /** * Rerank documents by relevance to a query @@ -832,13 +852,17 @@ export class LlamaCpp implements LLM { // Core API methods // ========================================================================== - async embed(text: string, options: EmbedOptions = {}): Promise { + async embed(input: EmbedInput | string, options: EmbedOptions = {}): Promise { // Ping activity at start to keep models alive during this operation this.touchActivity(); + if (typeof input !== "string") { + return null; + } + try { const context = await this.ensureEmbedContext(); - const embedding = await context.getEmbeddingFor(text); + const embedding = await context.getEmbeddingFor(input); return { embedding: Array.from(embedding.vector), @@ -854,12 +878,18 @@ export class LlamaCpp implements LLM { * Batch embed multiple texts efficiently * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally */ - async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + async embedBatch(inputs: (EmbedInput | string)[]): Promise<(EmbeddingResult | null)[]> { if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); // Ping activity at start to keep models alive during this operation this.touchActivity(); - if (texts.length === 0) return []; + if (inputs.length === 0) return []; + + // Local embedding model only supports plain text input. 
+ if (inputs.some((input) => typeof input !== "string")) { + return inputs.map(() => null); + } + const texts = inputs as string[]; try { const contexts = await this.ensureEmbedContexts(); @@ -1379,17 +1409,19 @@ class LLMSession implements ILLMSession { } } - async embed(text: string, options?: EmbedOptions): Promise { - return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options)); + async embed(input: EmbedInput | string, options?: EmbedOptions): Promise { + return this.withOperation(() => this.manager.getLlamaCpp().embed(input, options)); } - async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { - return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts)); + async embedBatch(inputs: (EmbedInput | string)[], options?: EmbedBatchOptions): Promise<(EmbeddingResult | null)[]> { + // LlamaCpp.embedBatch currently only accepts inputs; query/document formatting + // is handled before session calls. + return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(inputs)); } async expandQuery( query: string, - options?: { context?: string; includeLexical?: boolean } + options?: { context?: string; includeLexical?: boolean; intent?: string } ): Promise { return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options)); } @@ -1502,8 +1534,169 @@ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { * Call this before process exit to prevent NAPI crashes. 
*/ export async function disposeDefaultLlamaCpp(): Promise { + defaultHybridLLM = null; + resolvedEmbedProvider = null; if (defaultLlamaCpp) { await defaultLlamaCpp.dispose(); defaultLlamaCpp = null; } } + +// ============================================================================= +// Embed Provider Configuration +// ============================================================================= + +export type EmbedProvider = "local" | "google"; + +let forcedEmbedProvider: EmbedProvider | null = null; +let resolvedEmbedProvider: EmbedProvider | null = null; +let defaultHybridLLM: GoogleHybridLLM | null = null; + +export function setEmbedProvider(provider: EmbedProvider | null): void { + forcedEmbedProvider = provider; + resolvedEmbedProvider = null; +} + +export function getConfiguredEmbedProvider(): EmbedProvider | null { + return forcedEmbedProvider; +} + +export function getResolvedEmbedProvider(): EmbedProvider | null { + return resolvedEmbedProvider; +} + +function parseEmbedProviderFromEnv(): EmbedProvider | null { + const provider = process.env.QMD_EMBED_PROVIDER?.trim().toLowerCase(); + if (provider === "google" || provider === "local") return provider; + return null; +} + +export function getGoogleApiKey(): string | null { + const key = process.env.GEMINI_API_KEY?.trim(); + return key && key.length > 0 ? key : null; +} + +export async function resolveEmbedProvider(): Promise { + const explicit = forcedEmbedProvider ?? parseEmbedProviderFromEnv(); + if (explicit) { + resolvedEmbedProvider = explicit; + return explicit; + } + + const apiKey = getGoogleApiKey(); + if (!apiKey) { + resolvedEmbedProvider = "local"; + return "local"; + } + + // Auto mode: if Gemini key exists and local GPU is unavailable, prefer Google embeddings. + try { + const local = getDefaultLlamaCpp(); + const device = await local.getDeviceInfo(); + const provider: EmbedProvider = device.gpu ? 
"local" : "google"; + resolvedEmbedProvider = provider; + return provider; + } catch { + resolvedEmbedProvider = "local"; + return "local"; + } +} + +export async function needsEmbedFormatting(): Promise { + return (await resolveEmbedProvider()) === "local"; +} + +export async function getActiveEmbedModel(): Promise { + return (await resolveEmbedProvider()) === "google" ? GOOGLE_EMBED_MODEL : DEFAULT_EMBED_MODEL; +} + +export function getConfiguredEmbedDimensions(): number | null { + const provider = forcedEmbedProvider ?? parseEmbedProviderFromEnv(); + if (provider === "google") return parseGeminiDimensionsFromEnv(); + return null; +} + +// ============================================================================= +// GoogleHybridLLM — Gemini embedding API + local llama.cpp expansion/reranking +// ============================================================================= + +export class GoogleHybridLLM implements LLM { + private readonly localLlm: LlamaCpp; + private readonly googleEmbedder: GoogleAIEmbedder; + + constructor(localLlm: LlamaCpp, apiKey: string) { + this.localLlm = localLlm; + this.googleEmbedder = new GoogleAIEmbedder(apiKey, parseGeminiDimensionsFromEnv()); + } + + async embed(input: EmbedInput | string, options: EmbedOptions = {}): Promise { + const defaultTaskType: GeminiTaskType = options.isQuery ? "RETRIEVAL_QUERY" : "RETRIEVAL_DOCUMENT"; + const result = await this.googleEmbedder.embed(input, { + taskType: options.taskType ?? defaultTaskType, + outputDimensionality: options.outputDimensionality ?? parseGeminiDimensionsFromEnv(), + }); + if (!result) return null; + return { + embedding: result.embedding, + model: GOOGLE_EMBED_MODEL, + }; + } + + async embedBatch( + inputs: (EmbedInput | string)[], + options: EmbedBatchOptions = {} + ): Promise<(EmbeddingResult | null)[]> { + const taskType: GeminiTaskType = options.taskType ?? (options.isQuery ? 
"RETRIEVAL_QUERY" : "RETRIEVAL_DOCUMENT"); + const results = await this.googleEmbedder.embedBatch(inputs, { + taskType, + outputDimensionality: options.outputDimensionality ?? parseGeminiDimensionsFromEnv(), + }); + return results.map((result) => { + if (!result) return null; + return { + embedding: result.embedding, + model: GOOGLE_EMBED_MODEL, + }; + }); + } + + async generate(prompt: string, options?: GenerateOptions): Promise { + return this.localLlm.generate(prompt, options); + } + + async modelExists(model: string): Promise { + return this.localLlm.modelExists(model); + } + + async expandQuery(query: string, options?: { context?: string; includeLexical?: boolean; intent?: string }): Promise { + return this.localLlm.expandQuery(query, options); + } + + async rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise { + return this.localLlm.rerank(query, documents, options); + } + + async dispose(): Promise { + // localLlm is a shared singleton in normal operation; caller owns lifecycle. + } +} + +/** + * Get default embedding-capable LLM implementation. 
+ * - `google`: Gemini embeddings + local llama.cpp query expansion/reranking + * - `local`: llama.cpp for all operations + */ +export async function getDefaultLLM(): Promise { + const provider = await resolveEmbedProvider(); + if (provider === "google") { + const apiKey = getGoogleApiKey(); + if (!apiKey) { + throw new Error("GEMINI_API_KEY is required when embed provider is google"); + } + if (!defaultHybridLLM) { + defaultHybridLLM = new GoogleHybridLLM(getDefaultLlamaCpp(), apiKey); + } + return defaultHybridLLM; + } + return getDefaultLlamaCpp(); +} diff --git a/src/store.ts b/src/store.ts index aa5fae4f..3b83d1ed 100644 --- a/src/store.ts +++ b/src/store.ts @@ -19,15 +19,25 @@ import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs"; // Note: node:path resolve is not imported — we export our own cross-platform resolve() import fastGlob from "fast-glob"; import { + type LLM, LlamaCpp, getDefaultLlamaCpp, + getActiveEmbedModel, + getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, + type EmbeddingResult, type LLMSessionOptions, type RerankDocument, type ILLMSession, } from "./llm.js"; +import { + getMimeTypeForPath, + isSupportedMultimodalPath, + parseGeminiDimensionsFromEnv, + type EmbedInput, +} from "./google-embed.js"; import type { NamedCollection, Collection, @@ -57,12 +67,9 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars export const CHUNK_WINDOW_TOKENS = 200; export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars -/** - * Get the LlamaCpp instance for a store — prefers the store's own instance, - * falls back to the global singleton. - */ -function getLlm(store: Store): LlamaCpp { - return store.llm ?? 
getDefaultLlamaCpp(); +async function getEmbeddingLlm(store: Store): Promise { + if (store.llm) return store.llm; + return await getDefaultLLM(); } // ============================================================================= @@ -660,6 +667,7 @@ function initializeDatabase(db: Database): void { path TEXT NOT NULL, title TEXT NOT NULL, hash TEXT NOT NULL, + content_type TEXT NOT NULL DEFAULT 'text', created_at TEXT NOT NULL, modified_at TEXT NOT NULL, active INTEGER NOT NULL DEFAULT 1, @@ -681,7 +689,13 @@ function initializeDatabase(db: Database): void { ) `); - // Content vectors + // Content vectors — migrate content_type column before creating index on it + const docInfo = db.prepare(`PRAGMA table_info(documents)`).all() as { name: string }[]; + if (!docInfo.some(col => col.name === "content_type")) { + db.exec(`ALTER TABLE documents ADD COLUMN content_type TEXT NOT NULL DEFAULT 'text'`); + } + db.exec(`CREATE INDEX IF NOT EXISTS idx_documents_content_type ON documents(content_type, active)`); + const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; const hasSeqColumn = cvInfo.some(col => col.name === 'seq'); if (cvInfo.length > 0 && !hasSeqColumn) { @@ -694,11 +708,17 @@ function initializeDatabase(db: Database): void { seq INTEGER NOT NULL DEFAULT 0, pos INTEGER NOT NULL DEFAULT 0, model TEXT NOT NULL, + provider TEXT NOT NULL DEFAULT 'local', embedded_at TEXT NOT NULL, PRIMARY KEY (hash, seq) ) `); + const cvInfoPost = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; + if (!cvInfoPost.some(col => col.name === "provider")) { + db.exec(`ALTER TABLE content_vectors ADD COLUMN provider TEXT NOT NULL DEFAULT 'local'`); + } + // Store collections — makes the DB self-contained (no external config needed) db.exec(` CREATE TABLE IF NOT EXISTS store_collections ( @@ -845,7 +865,7 @@ export function upsertStoreCollection(db: Database, name: string, collection: Om `).run( name, collection.path, - 
collection.pattern || '**/*.md', + collection.pattern || DEFAULT_GLOB, collection.ignore ? JSON.stringify(collection.ignore) : null, collection.includeByDefault === false ? 0 : 1, collection.update || null, @@ -970,8 +990,8 @@ function ensureVecTableInternal(db: Database, dimensions: number): void { export type Store = { db: Database; dbPath: string; - /** Optional LlamaCpp instance for this store (overrides the global singleton) */ - llm?: LlamaCpp; + /** Optional embedding-capable LLM implementation for this store */ + llm?: LLM; close: () => void; ensureVecTable: (dimensions: number) => void; @@ -1027,17 +1047,24 @@ export type Store = { // Document indexing operations insertContent: (hash: string, content: string, createdAt: string) => void; - insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void; + insertDocument: (collectionName: string, path: string, title: string, hash: string, contentType: string, createdAt: string, modifiedAt: string) => void; findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null; updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void; - updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void; + updateDocument: (documentId: number, title: string, hash: string, contentType: string, modifiedAt: string) => void; deactivateDocument: (collectionName: string, path: string) => void; getActiveDocumentPaths: (collectionName: string) => string[]; // Vector/embedding operations - getHashesForEmbedding: () => { hash: string; body: string; path: string }[]; + getHashesForEmbedding: () => { + hash: string; + body: string; + path: string; + collection: string; + contentType: string; + collectionPath: string; + }[]; clearAllEmbeddings: () => void; - insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, 
embeddedAt: string) => void; + insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, provider: string, embeddedAt: string) => void; }; // ============================================================================= @@ -1058,6 +1085,44 @@ export type ReindexResult = { orphanedCleaned: number; }; +type IndexedContentType = "text" | "image" | "pdf"; + +function getIndexedContentType(path: string): IndexedContentType { + const lower = path.toLowerCase(); + if (lower.endsWith(".pdf")) return "pdf"; + if (lower.endsWith(".png") || lower.endsWith(".jpg") || lower.endsWith(".jpeg")) return "image"; + return "text"; +} + +function inferContentTypeFromVirtualPath(path: string): IndexedContentType { + return getIndexedContentType(path); +} + +function withContentTypeContext(context: string | null, contentType: IndexedContentType): string | null { + if (contentType === "text") return context; + return context ? `${context}\n[${contentType}]` : `[${contentType}]`; +} + +function loadIndexableContent( + absolutePath: string, + relativePath: string, + contentType: IndexedContentType +): { hash: string; textBody: string | null } { + if (contentType === "text") { + const text = readFileSync(absolutePath, "utf-8"); + const hash = createHash("sha256").update(text).digest("hex"); + return { hash, textBody: text }; + } + + const bytes = readFileSync(absolutePath); + const hash = createHash("sha256").update(bytes).digest("hex"); + const mimeType = getMimeTypeForPath(relativePath) ?? "application/octet-stream"; + return { + hash, + textBody: `[${contentType}] ${relativePath} (${mimeType})`, + }; +} + /** * Re-index a single collection by scanning the filesystem and updating the database. * Pure function — no console output, no db lifecycle management. 
@@ -1100,24 +1165,25 @@ export async function reindexCollection( for (const relativeFile of files) { const filepath = getRealPath(resolve(collectionPath, relativeFile)); const path = handelize(relativeFile); + const contentType = getIndexedContentType(relativeFile); seenPaths.add(path); - let content: string; + let loaded: { hash: string; textBody: string | null }; try { - content = readFileSync(filepath, "utf-8"); + loaded = loadIndexableContent(filepath, relativeFile, contentType); } catch { processed++; options?.onProgress?.({ file: relativeFile, current: processed, total }); continue; } - if (!content.trim()) { + if (contentType === "text" && !(loaded.textBody ?? "").trim()) { processed++; continue; } - const hash = await hashContent(content); - const title = extractTitle(content, relativeFile); + const hash = loaded.hash; + const title = loaded.textBody ? extractTitle(loaded.textBody, relativeFile) : extractTitle("", relativeFile); const existing = findActiveDocument(db, collectionName, path); @@ -1130,17 +1196,17 @@ export async function reindexCollection( unchanged++; } } else { - insertContent(db, hash, content, now); + insertContent(db, hash, loaded.textBody ?? "", now); const stat = statSync(filepath); - updateDocument(db, existing.id, title, hash, + updateDocument(db, existing.id, title, hash, contentType, stat ? new Date(stat.mtime).toISOString() : now); updated++; } } else { indexed++; - insertContent(db, hash, content, now); + insertContent(db, hash, loaded.textBody ?? "", now); const stat = statSync(filepath); - insertDocument(db, collectionName, path, title, hash, + insertDocument(db, collectionName, path, title, hash, contentType, stat ? new Date(stat.birthtime).toISOString() : now, stat ? 
new Date(stat.mtime).toISOString() : now); } @@ -1179,6 +1245,34 @@ export type EmbedResult = { durationMs: number; }; +function buildMultimodalEmbedText(path: string, title: string, body: string, contentType: string): string { + const lines: string[] = []; + const trimmedBody = body.trim(); + + lines.push(`File: ${path}`); + if (title.trim().length > 0) { + lines.push(`Title: ${title.trim()}`); + } + if (trimmedBody.length > 0) { + // Keep payload small enough for embeddings while preserving useful signals. + lines.push(`Body: ${trimmedBody.slice(0, 2000)}`); + } + lines.push(`Type: ${contentType}`); + + return lines.join("\n"); +} + +function estimatePdfPageCount(path: string): number { + try { + const bytes = readFileSync(path); + const text = bytes.toString("latin1"); + const matches = text.match(/\/Type\s*\/Page\b/g); + return matches?.length ?? 0; + } catch { + return 0; + } +} + /** * Generate vector embeddings for documents that need them. * Pure function — no console output, no db lifecycle management. @@ -1193,7 +1287,9 @@ export async function generateEmbeddings( } ): Promise { const db = store.db; - const model = options?.model ?? DEFAULT_EMBED_MODEL; + const llm = await getEmbeddingLlm(store); + const provider = llm instanceof LlamaCpp ? "local" : "google"; + const activeModel = options?.model ?? 
await getActiveEmbedModel(); const now = new Date().toISOString(); if (options?.force) { @@ -1206,12 +1302,50 @@ export async function generateEmbeddings( return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 }; } - // Chunk all documents - type ChunkItem = { hash: string; title: string; text: string; seq: number; pos: number; tokens: number; bytes: number }; + type ChunkItem = { + hash: string; + title: string; + seq: number; + pos: number; + bytes: number; + input: EmbedInput | string; + }; const allChunks: ChunkItem[] = []; + const useLocalFormatting = llm instanceof LlamaCpp; for (const item of hashesToEmbed) { + const absolutePath = resolve(item.collectionPath, item.path); + const contentType = (item.contentType || "text").toLowerCase(); const encoder = new TextEncoder(); + + if (contentType === "image" || contentType === "pdf") { + if (provider !== "google" || !isSupportedMultimodalPath(absolutePath)) { + continue; + } + + if (contentType === "pdf") { + const pages = estimatePdfPageCount(absolutePath); + if (pages > 6) { + continue; + } + } + + const title = item.title?.trim() || extractTitle(item.body || "", item.path); + const input: EmbedInput = { + text: buildMultimodalEmbedText(item.path, title, item.body || "", contentType), + filePath: absolutePath, + }; + allChunks.push({ + hash: item.hash, + title, + input, + seq: 0, + pos: 0, + bytes: readFileSync(absolutePath).length, + }); + continue; + } + const bodyBytes = encoder.encode(item.body).length; if (bodyBytes === 0) continue; @@ -1219,14 +1353,17 @@ export async function generateEmbeddings( const chunks = await chunkDocumentByTokens(item.body); for (let seq = 0; seq < chunks.length; seq++) { + const chunk = chunks[seq]!; + const formatted = useLocalFormatting + ? 
formatDocForEmbedding(chunk.text, title) + : chunk.text; allChunks.push({ hash: item.hash, title, - text: chunks[seq]!.text, + input: formatted, seq, - pos: chunks[seq]!.pos, - tokens: chunks[seq]!.tokens, - bytes: encoder.encode(chunks[seq]!.text).length, + pos: chunk.pos, + bytes: encoder.encode(chunk.text).length, }); } } @@ -1240,36 +1377,37 @@ export async function generateEmbeddings( const totalDocs = hashesToEmbed.length; const startTime = Date.now(); - // Use store's LlamaCpp or global singleton, wrapped in a session - const llm = getLlm(store); - const sessionOptions: LLMSessionOptions = { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' }; - - // Create a session manager for this llm instance - const result = await withLLMSessionForLlm(llm, async (session) => { - // Get embedding dimensions from first chunk - const firstChunk = allChunks[0]!; - const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title); - const firstResult = await session.embed(firstText); - if (!firstResult) { - throw new Error("Failed to get embedding dimensions from first chunk"); - } - store.ensureVecTable(firstResult.embedding.length); - - let chunksEmbedded = 0, errors = 0, bytesProcessed = 0; - const BATCH_SIZE = 32; - - for (let batchStart = 0; batchStart < allChunks.length; batchStart += BATCH_SIZE) { - const batchEnd = Math.min(batchStart + BATCH_SIZE, allChunks.length); + const firstResult = await llm.embed(allChunks[0]!.input, { + isQuery: false, + taskType: "RETRIEVAL_DOCUMENT", + ...(provider === "google" ? { outputDimensionality: parseGeminiDimensionsFromEnv() } : {}), + }); + if (!firstResult) { + throw new Error("Failed to get embedding dimensions from first input"); + } + store.ensureVecTable(firstResult.embedding.length); + + let chunksEmbedded = 0; + let errors = 0; + let bytesProcessed = 0; + const batchSize = provider === "google" ? 
100 : 32; + + const embedWith = async ( + embedOne: (input: EmbedInput | string) => Promise<EmbeddingResult | null>, + embedMany: (inputs: (EmbedInput | string)[]) => Promise<(EmbeddingResult | null)[]> + ): Promise<void> => { + for (let batchStart = 0; batchStart < allChunks.length; batchStart += batchSize) { + const batchEnd = Math.min(batchStart + batchSize, allChunks.length); const batch = allChunks.slice(batchStart, batchEnd); - const texts = batch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title)); + const inputs = batch.map(chunk => chunk.input); try { - const embeddings = await session.embedBatch(texts); + const embeddings = await embedMany(inputs); for (let i = 0; i < batch.length; i++) { const chunk = batch[i]!; const embedding = embeddings[i]; if (embedding) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now); + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), embedding.model || activeModel, provider, now); chunksEmbedded++; } else { errors++; @@ -1277,13 +1415,11 @@ bytesProcessed += chunk.bytes; } } catch { - // Batch failed — try individual embeddings as fallback for (const chunk of batch) { try { - const text = formatDocForEmbedding(chunk.text, chunk.title); - const result = await session.embed(text); - if (result) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); + const embedding = await embedOne(chunk.input); + if (embedding) { + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), embedding.model || activeModel, provider, now); chunksEmbedded++; } else { errors++; @@ -1297,14 +1433,29 @@ options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }); } + }; - return { chunksEmbedded, errors }; - }, sessionOptions); + if (provider === "local" && (store.llm 
instanceof LlamaCpp || !store.llm)) { + const localLlm = store.llm instanceof LlamaCpp ? store.llm : getDefaultLlamaCpp(); + const sessionOptions: LLMSessionOptions = { maxDuration: 30 * 60 * 1000, name: "generateEmbeddings" }; + await withLLMSessionForLlm(localLlm, async (session) => { + await embedWith( + (input) => session.embed(input, { isQuery: false }), + (inputs) => session.embedBatch(inputs, { isQuery: false }) + ); + return { chunksEmbedded, errors }; + }, sessionOptions); + } else { + await embedWith( + (input) => llm.embed(input, { isQuery: false, taskType: "RETRIEVAL_DOCUMENT" }), + (inputs) => llm.embedBatch(inputs, { isQuery: false, taskType: "RETRIEVAL_DOCUMENT" }) + ); + } return { docsProcessed: totalDocs, - chunksEmbedded: result.chunksEmbedded, - errors: result.errors, + chunksEmbedded, + errors, durationMs: Date.now() - startTime, }; } @@ -1379,17 +1530,17 @@ export function createStore(dbPath?: string): Store { // Document indexing operations insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt), - insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt), + insertDocument: (collectionName: string, path: string, title: string, hash: string, contentType: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, contentType, createdAt, modifiedAt), findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path), updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt), - updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt), + updateDocument: (documentId: number, title: string, hash: string, 
contentType: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, contentType, modifiedAt), deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path), getActiveDocumentPaths: (collectionName: string) => getActiveDocumentPaths(db, collectionName), // Vector/embedding operations getHashesForEmbedding: () => getHashesForEmbedding(db), clearAllEmbeddings: () => clearAllEmbeddings(db), - insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt), + insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, provider: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, provider, embeddedAt), }; return store; @@ -1414,6 +1565,7 @@ export type DocumentResult = { modifiedAt: string; // Last modification timestamp bodyLength: number; // Body length in bytes (useful before loading) body?: string; // Document body (optional, load with getDocumentBody) + contentType?: string; // Document content type (text/image/pdf) }; /** @@ -1799,18 +1951,20 @@ export function insertDocument( path: string, title: string, hash: string, + contentType: string, createdAt: string, modifiedAt: string ): void { db.prepare(` - INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) - VALUES (?, ?, ?, ?, ?, ?, 1) + INSERT INTO documents (collection, path, title, hash, content_type, created_at, modified_at, active) + VALUES (?, ?, ?, ?, ?, ?, ?, 1) ON CONFLICT(collection, path) DO UPDATE SET title = excluded.title, hash = excluded.hash, + content_type = excluded.content_type, modified_at = excluded.modified_at, active = 1 - `).run(collectionName, path, title, hash, createdAt, modifiedAt); + `).run(collectionName, path, title, hash, contentType, createdAt, modifiedAt); } /** @@ -1850,10 +2004,11 @@ export 
function updateDocument( documentId: number, title: string, hash: string, + contentType: string, modifiedAt: string ): void { - db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`) - .run(title, hash, modifiedAt, documentId); + db.prepare(`UPDATE documents SET title = ?, hash = ?, content_type = ?, modified_at = ? WHERE id = ?`) + .run(title, hash, contentType, modifiedAt, documentId); } /** @@ -2624,6 +2779,7 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle 'qmd://' || d.collection || '/' || d.path as filepath, d.collection || '/' || d.path as display_path, d.title, + d.content_type, content.doc as body, d.hash, bm25(documents_fts, 10.0, 1.0) as bm25_score @@ -2643,7 +2799,7 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle sql += ` ORDER BY bm25_score ASC LIMIT ?`; params.push(limit); - const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[]; + const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; content_type: string; body: string; hash: string; bm25_score: number }[]; return rows.map(row => { const collectionName = row.filepath.split('//')[1]?.split('/')[0] || ""; // Convert bm25 (negative, lower is better) into a stable [0..1) score where higher is better. @@ -2651,6 +2807,10 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle // |x| / (1 + |x|) maps: strong(-10)→0.91, medium(-2)→0.67, weak(-0.5)→0.33, none(0)→0. // Monotonic and query-independent — no per-query normalization needed. const score = Math.abs(row.bm25_score) / (1 + Math.abs(row.bm25_score)); + const baseContext = getContextForFile(db, row.filepath); + const context = row.content_type !== "text" + ? (baseContext ? 
`${baseContext}\n[${row.content_type}]` : `[${row.content_type}]`) + : baseContext; return { filepath: row.filepath, displayPath: row.display_path, @@ -2661,7 +2821,8 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle modifiedAt: "", // Not available in FTS query bodyLength: row.body.length, body: row.body, - context: getContextForFile(db, row.filepath), + context, + contentType: row.content_type, score, source: "fts" as const, }; @@ -2707,6 +2868,7 @@ export async function searchVec(db: Database, query: string, model: string, limi 'qmd://' || d.collection || '/' || d.path as filepath, d.collection || '/' || d.path as display_path, d.title, + d.content_type, content.doc as body FROM content_vectors cv JOIN documents d ON d.hash = cv.hash AND d.active = 1 @@ -2722,7 +2884,7 @@ export async function searchVec(db: Database, query: string, model: string, limi const docRows = db.prepare(docSql).all(...params) as { hash_seq: string; hash: string; pos: number; filepath: string; - display_path: string; title: string; body: string; + display_path: string; title: string; content_type: string; body: string; }[]; // Combine with distances and dedupe by filepath @@ -2740,6 +2902,10 @@ export async function searchVec(db: Database, query: string, model: string, limi .slice(0, limit) .map(({ row, bestDist }) => { const collectionName = row.filepath.split('//')[1]?.split('/')[0] || ""; + const baseContext = getContextForFile(db, row.filepath); + const context = row.content_type !== "text" + ? (baseContext ? 
`${baseContext}\n[${row.content_type}]` : `[${row.content_type}]`) + : baseContext; return { filepath: row.filepath, displayPath: row.display_path, @@ -2750,7 +2916,8 @@ export async function searchVec(db: Database, query: string, model: string, limi modifiedAt: "", // Not available in vec query bodyLength: row.body.length, body: row.body, - context: getContextForFile(db, row.filepath), + context, + contentType: row.content_type, score: 1 - bestDist, // Cosine similarity = 1 - cosine distance source: "vec" as const, chunkPos: row.pos, @@ -2763,11 +2930,22 @@ export async function searchVec(db: Database, query: string, model: string, limi // ============================================================================= async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise { - // Format text using the appropriate prompt template - const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model); - const result = session - ? await session.embed(formattedText, { model, isQuery }) - : await (llmOverride ?? getDefaultLlamaCpp()).embed(formattedText, { model, isQuery }); + if (session) { + const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model); + const result = await session.embed(formattedText, { model, isQuery }); + return result?.embedding || null; + } + + const llm = llmOverride ?? await getDefaultLLM(); + const useLocalFormatting = llm instanceof LlamaCpp; + const input = useLocalFormatting + ? (isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model)) + : text; + const result = await llm.embed(input, { + model, + isQuery, + ...(isQuery ? 
{ taskType: "RETRIEVAL_QUERY" as const } : { taskType: "RETRIEVAL_DOCUMENT" as const }), + }); return result?.embedding || null; } @@ -2775,15 +2953,39 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi * Get all unique content hashes that need embeddings (from active documents). * Returns hash, document body, and a sample path for display purposes. */ -export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] { +export function getHashesForEmbedding(db: Database): { + hash: string; + body: string; + path: string; + title: string; + collection: string; + contentType: string; + collectionPath: string; +}[] { return db.prepare(` - SELECT d.hash, c.doc as body, MIN(d.path) as path + SELECT + d.hash, + c.doc as body, + MIN(d.path) as path, + MIN(d.title) as title, + d.collection as collection, + MIN(d.content_type) as contentType, + sc.path as collectionPath FROM documents d + JOIN store_collections sc ON sc.name = d.collection JOIN content c ON d.hash = c.hash LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 WHERE d.active = 1 AND v.hash IS NULL - GROUP BY d.hash - `).all() as { hash: string; body: string; path: string }[]; + GROUP BY d.hash, d.collection, sc.path + `).all() as { + hash: string; + body: string; + path: string; + title: string; + collection: string; + contentType: string; + collectionPath: string; + }[]; } /** @@ -2806,21 +3008,22 @@ export function insertEmbedding( pos: number, embedding: Float32Array, model: string, + provider: string, embeddedAt: string ): void { const hashSeq = `${hash}_${seq}`; const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`); - const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`); + const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, provider, 
embedded_at) VALUES (?, ?, ?, ?, ?, ?)`); insertVecStmt.run(hashSeq, embedding); - insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt); + insertContentVectorStmt.run(hash, seq, pos, model, provider, embeddedAt); } // ============================================================================= // Query expansion // ============================================================================= -export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise { +export async function expandQuery(query: string, model: string = DEFAULT_QUERY_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise { // Check cache first — stored as JSON preserving types const cacheKey = getCacheKey("expandQuery", { query, model, ...(intent && { intent }) }); const cached = getCachedResult(db, cacheKey); @@ -2859,7 +3062,7 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M // Reranking // ============================================================================= -export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<{ file: string; score: number }[]> { +export async function rerank(query: string, documents: { file: string; text: string }[], model: string = DEFAULT_RERANK_MODEL, db: Database, intent?: string, llmOverride?: LLM): Promise<{ file: string; score: number }[]> { // Prepend intent to rerank query so the reranker scores with domain context const rerankQuery = intent ? 
`${intent}\n\n${query}` : query; @@ -3029,6 +3232,7 @@ type DbDocRow = { title: string; hash: string; collection: string; + content_type: string; path: string; modified_at: string; body_length: number; @@ -3076,6 +3280,7 @@ export function findDocument(db: Database, filename: string, options: { includeB d.title, d.hash, d.collection, + d.content_type, d.modified_at, LENGTH(content.doc) as body_length ${bodyCol} @@ -3146,6 +3351,7 @@ export function findDocument(db: Database, filename: string, options: { includeB collectionName: doc.collection, modifiedAt: doc.modified_at, bodyLength: doc.body_length, + contentType: (doc as any).content_type, ...(options.includeBody && doc.body !== undefined && { body: doc.body }), }; } @@ -3542,6 +3748,7 @@ export interface HybridQueryResult { score: number; // blended score (full precision) context: string | null; // user-set context docid: string; // content hash prefix (6 chars) + contentType?: string; // Document content type explain?: HybridQueryExplain; } @@ -3651,11 +3858,11 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getLlm(store); - const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); + const llm = await getEmbeddingLlm(store); + const textsToEmbed = vecQueries.map(q => llm instanceof LlamaCpp ? 
formatQueryForEmbedding(q.text) : q.text); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); - const embeddings = await llm.embedBatch(textsToEmbed); + const embeddings = await llm.embedBatch(textsToEmbed, { isQuery: true, taskType: "RETRIEVAL_QUERY" }); hooks?.onEmbedDone?.(Date.now() - embedStart); // Run sqlite-vec lookups with pre-computed embeddings @@ -3744,6 +3951,7 @@ export async function hybridQuery( blendedScore: rrfScore, } : undefined; + const contentType = inferContentTypeFromVirtualPath(cand.file); return { file: cand.file, displayPath: cand.displayPath, @@ -3752,8 +3960,9 @@ export async function hybridQuery( bestChunk, bestChunkPos, score: rrfScore, - context: store.getContextForFile(cand.file), + context: withContentTypeContext(store.getContextForFile(cand.file), contentType), docid: docidMap.get(cand.file) || "", + contentType, ...(explainData ? { explain: explainData } : {}), }; }) @@ -3818,6 +4027,7 @@ export async function hybridQuery( blendedScore, } : undefined; + const contentType = inferContentTypeFromVirtualPath(r.file); return { file: r.file, displayPath: candidate?.displayPath || "", @@ -3826,8 +4036,9 @@ export async function hybridQuery( bestChunk, bestChunkPos, score: blendedScore, - context: store.getContextForFile(r.file), + context: withContentTypeContext(store.getContextForFile(r.file), contentType), docid: docidMap.get(r.file) || "", + contentType, ...(explainData ? 
{ explain: explainData } : {}), }; }).sort((a, b) => b.score - a.score); @@ -3860,6 +4071,7 @@ export interface VectorSearchResult { score: number; context: string | null; docid: string; + contentType?: string; } /** @@ -3900,14 +4112,16 @@ export async function vectorSearchQuery( for (const r of vecResults) { const existing = allResults.get(r.filepath); if (!existing || r.score > existing.score) { + const contentType = inferContentTypeFromVirtualPath(r.filepath); allResults.set(r.filepath, { file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score, - context: store.getContextForFile(r.filepath), + context: withContentTypeContext(store.getContextForFile(r.filepath), contentType), docid: r.docid, + contentType, }); } } @@ -4032,11 +4246,11 @@ export async function structuredSearch( s.type === 'vec' || s.type === 'hyde' ); if (vecSearches.length > 0) { - const llm = getLlm(store); - const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); + const llm = await getEmbeddingLlm(store); + const textsToEmbed = vecSearches.map(s => llm instanceof LlamaCpp ? 
formatQueryForEmbedding(s.query) : s.query); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); - const embeddings = await llm.embedBatch(textsToEmbed); + const embeddings = await llm.embedBatch(textsToEmbed, { isQuery: true, taskType: "RETRIEVAL_QUERY" }); hooks?.onEmbedDone?.(Date.now() - embedStart); for (let i = 0; i < vecSearches.length; i++) { @@ -4134,6 +4348,7 @@ export async function structuredSearch( blendedScore: rrfScore, } : undefined; + const contentType = inferContentTypeFromVirtualPath(cand.file); return { file: cand.file, displayPath: cand.displayPath, @@ -4142,8 +4357,9 @@ export async function structuredSearch( bestChunk, bestChunkPos, score: rrfScore, - context: store.getContextForFile(cand.file), + context: withContentTypeContext(store.getContextForFile(cand.file), contentType), docid: docidMap.get(cand.file) || "", + contentType, ...(explainData ? { explain: explainData } : {}), }; }) @@ -4207,6 +4423,7 @@ export async function structuredSearch( blendedScore, } : undefined; + const contentType = inferContentTypeFromVirtualPath(r.file); return { file: r.file, displayPath: candidate?.displayPath || "", @@ -4215,8 +4432,9 @@ export async function structuredSearch( bestChunk, bestChunkPos, score: blendedScore, - context: store.getContextForFile(r.file), + context: withContentTypeContext(store.getContextForFile(r.file), contentType), docid: docidMap.get(r.file) || "", + contentType, ...(explainData ? 
{ explain: explainData } : {}), }; }).sort((a, b) => b.score - a.score); diff --git a/test/eval-bm25.test.ts b/test/eval-bm25.test.ts index aaa9fe86..e67de728 100644 --- a/test/eval-bm25.test.ts +++ b/test/eval-bm25.test.ts @@ -102,7 +102,7 @@ describe("BM25 Search (FTS)", () => { const now = new Date().toISOString(); insertContent(db, hash, content, now); - insertDocument(db, "eval-docs", file, title, hash, now, now); + insertDocument(db, "eval-docs", file, title, hash, "text", now, now); } }); diff --git a/test/eval.test.ts b/test/eval.test.ts index d575ff84..d975a877 100644 --- a/test/eval.test.ts +++ b/test/eval.test.ts @@ -120,7 +120,7 @@ describe("BM25 Search (FTS)", () => { const now = new Date().toISOString(); insertContent(db, hash, content, now); - insertDocument(db, "eval-docs", file, title, hash, now, now); + insertDocument(db, "eval-docs", file, title, hash, "text", now, now); } }); @@ -201,7 +201,7 @@ describe.skipIf(!!process.env.CI)("Vector Search", () => { // Convert to Float32Array for sqlite-vec const embedding = new Float32Array(result.embedding); const now = new Date().toISOString(); - insertEmbedding(db, hash, seq, chunk.pos, embedding, DEFAULT_EMBED_MODEL, now); + insertEmbedding(db, hash, seq, chunk.pos, embedding, DEFAULT_EMBED_MODEL, "local", now); } } } diff --git a/test/generate-embeddings.multimodal.test.ts b/test/generate-embeddings.multimodal.test.ts new file mode 100644 index 00000000..a30e9a5f --- /dev/null +++ b/test/generate-embeddings.multimodal.test.ts @@ -0,0 +1,145 @@ +import { createHash } from "node:crypto"; +import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { dirname, join } from "node:path"; +import { afterEach, describe, expect, test, vi } from "vitest"; +import { createStore, generateEmbeddings } from "../src/store.js"; + +type MockInput = string | { text?: string; filePath?: string }; + +const stores: Array<{ dbPath: string; close: () => void }> = []; +const 
dirs: string[] = []; + +afterEach(async () => { + vi.restoreAllMocks(); + + for (const store of stores.splice(0)) { + store.close(); + await rm(store.dbPath, { force: true }); + } + + for (const dir of dirs.splice(0)) { + await rm(dir, { recursive: true, force: true }); + } +}); + +async function setupMultimodalDoc(contentType: "image" | "pdf", filename: string, body: string, title: string): Promise<{ + store: ReturnType<typeof createStore>; + absolutePath: string; + collectionName: string; +}> { + const dir = await mkdtemp(join(tmpdir(), "qmd-embed-mm-")); + dirs.push(dir); + const dbPath = join(dir, "index.sqlite"); + const store = createStore(dbPath); + stores.push({ dbPath, close: () => store.close() }); + + const absolutePath = join(dir, filename); + await mkdir(dirname(absolutePath), { recursive: true }); + const bytes = contentType === "pdf" + ? Buffer.from("%PDF-1.4\n1 0 obj\n<< /Type /Page >>\nendobj\n") + : Buffer.from([137, 80, 78, 71]); + await writeFile(absolutePath, bytes); + + const hash = createHash("sha256").update(bytes).digest("hex"); + const now = new Date().toISOString(); + const collectionName = "docs"; + + store.db.prepare(` + INSERT INTO store_collections (name, path, pattern) + VALUES (?, ?, ?) + `).run(collectionName, dir, "**/*"); + + store.db.prepare(` + INSERT INTO content (hash, doc, created_at) + VALUES (?, ?, ?) + `).run(hash, body, now); + + store.db.prepare(` + INSERT INTO documents (collection, path, title, hash, content_type, created_at, modified_at, active) + VALUES (?, ?, ?, ?, ?, ?, ?, 1) + `).run(collectionName, filename, title, hash, contentType, now, now); + + return { store, absolutePath, collectionName }; +} + +describe("generateEmbeddings multimodal inputs", () => { + test("image embeddings include text context and file part", async () => { + const { store, absolutePath } = await setupMultimodalDoc( + "image", + "receipts/invoice.png", + "[image] receipts/invoice.png (image/png)\nInvoice for ACME Corp. 
Total due: $400.", + "March Invoice" + ); + + const firstInputs: MockInput[] = []; + const batchInputs: MockInput[][] = []; + (store as any).llm = { + embed: vi.fn(async (input: MockInput) => { + firstInputs.push(input); + return { embedding: [0.1, 0.2, 0.3], model: "mock-model" }; + }), + embedBatch: vi.fn(async (inputs: MockInput[]) => { + batchInputs.push(inputs); + return inputs.map(() => ({ embedding: [0.1, 0.2, 0.3], model: "mock-model" })); + }), + generate: vi.fn(async () => null), + modelExists: vi.fn(async () => ({ name: "mock", exists: true })), + expandQuery: vi.fn(async () => []), + rerank: vi.fn(async () => ({ results: [], model: "mock" })), + dispose: vi.fn(async () => {}), + }; + + const result = await generateEmbeddings(store); + + expect(result.chunksEmbedded).toBe(1); + expect(result.errors).toBe(0); + + const input = firstInputs[0] as Exclude<MockInput, string>; + expect(typeof input).toBe("object"); + expect(input.filePath).toBe(absolutePath); + expect(input.text).toContain("File: receipts/invoice.png"); + expect(input.text).toContain("Title: March Invoice"); + expect(input.text).toContain("Body: [image] receipts/invoice.png (image/png)"); + expect(input.text).toContain("Type: image"); + expect(batchInputs[0]?.length).toBe(1); + }); + + test("pdf embeddings include text context and file part", async () => { + const { store, absolutePath } = await setupMultimodalDoc( + "pdf", + "reports/q1.pdf", + "[pdf] reports/q1.pdf (application/pdf)\nQ1 financial summary.", + "Q1 Report" + ); + + const firstInputs: MockInput[] = []; + (store as any).llm = { + embed: vi.fn(async (input: MockInput) => { + firstInputs.push(input); + return { embedding: [0.1, 0.2, 0.3], model: "mock-model" }; + }), + embedBatch: vi.fn(async (inputs: MockInput[]) => { + return inputs.map(() => ({ embedding: [0.1, 0.2, 0.3], model: "mock-model" })); + }), + generate: vi.fn(async () => null), + modelExists: vi.fn(async () => ({ name: "mock", exists: true })), + expandQuery: vi.fn(async () => []), + 
rerank: vi.fn(async () => ({ results: [], model: "mock" })), + dispose: vi.fn(async () => {}), + }; + + const result = await generateEmbeddings(store); + + expect(result.chunksEmbedded).toBe(1); + expect(result.errors).toBe(0); + + const input = firstInputs[0] as Exclude; + expect(typeof input).toBe("object"); + expect(input.filePath).toBe(absolutePath); + expect(input.text).toContain("File: reports/q1.pdf"); + expect(input.text).toContain("Title: Q1 Report"); + expect(input.text).toContain("Body: [pdf] reports/q1.pdf (application/pdf)"); + expect(input.text).toContain("Type: pdf"); + }); +}); diff --git a/test/google-embed.test.ts b/test/google-embed.test.ts new file mode 100644 index 00000000..5316fb75 --- /dev/null +++ b/test/google-embed.test.ts @@ -0,0 +1,138 @@ +import { afterEach, describe, expect, test, vi } from "vitest"; +import { mkdtempSync, writeFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + GoogleAIEmbedder, + GOOGLE_EMBED_BATCH_LIMIT, + parseGeminiDimensionsFromEnv, + type EmbedInput, +} from "../src/google-embed.js"; + +afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + delete process.env.QMD_EMBED_DIMENSIONS; +}); + +describe("GoogleAIEmbedder", () => { + test("embeds text and truncates with matryoshka dimensions", async () => { + process.env.QMD_EMBED_DIMENSIONS = "768"; + expect(parseGeminiDimensionsFromEnv()).toBe(768); + + const values = Array.from({ length: 1024 }, (_, i) => i / 1024); + const fetchSpy = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ + embedding: { values }, + }), { status: 200 })); + + const embedder = new GoogleAIEmbedder("test-key"); + const result = await embedder.embed("hello world", { taskType: "RETRIEVAL_DOCUMENT" }); + + expect(result).not.toBeNull(); + expect(result?.embedding.length).toBe(768); + expect(fetchSpy).toHaveBeenCalledTimes(1); + const [url, init] = fetchSpy.mock.calls[0]!; + 
expect(String(url)).toContain(":embedContent?key=test-key"); + const body = JSON.parse(String(init?.body)); + expect(body.taskType).toBe("RETRIEVAL_DOCUMENT"); + expect(body.outputDimensionality).toBe(768); + expect(body.content.parts[0].text).toBe("hello world"); + }); + + test("embeds multimodal input with inline image data", async () => { + const dir = mkdtempSync(join(tmpdir(), "qmd-google-embed-")); + const file = join(dir, "image.png"); + writeFileSync(file, Buffer.from([137, 80, 78, 71])); + + const fetchSpy = vi.spyOn(globalThis, "fetch").mockResolvedValue(new Response(JSON.stringify({ + embedding: { values: Array.from({ length: 3072 }, () => 0.1) }, + }), { status: 200 })); + + const embedder = new GoogleAIEmbedder("test-key", 3072); + const result = await embedder.embed({ text: "caption", filePath: file }, { taskType: "SEMANTIC_SIMILARITY" }); + + expect(result).not.toBeNull(); + const [, init] = fetchSpy.mock.calls[0]!; + const body = JSON.parse(String(init?.body)); + expect(body.taskType).toBe("SEMANTIC_SIMILARITY"); + expect(body.content.parts).toHaveLength(2); + expect(body.content.parts[1].inlineData.mimeType).toBe("image/png"); + + rmSync(dir, { recursive: true, force: true }); + }); + + test("batch embeds in chunks of 100", async () => { + const inputs: EmbedInput[] = Array.from({ length: GOOGLE_EMBED_BATCH_LIMIT + 1 }, (_, i) => `doc-${i}`); + const fetchSpy = vi.spyOn(globalThis, "fetch").mockImplementation(async (_url, init) => { + const body = JSON.parse(String(init?.body)); + const count = body.requests.length; + return new Response(JSON.stringify({ + embeddings: Array.from({ length: count }, () => ({ values: Array.from({ length: 3072 }, () => 0.2) })), + }), { status: 200 }); + }); + + const embedder = new GoogleAIEmbedder("test-key", 3072); + const results = await embedder.embedBatch(inputs, { taskType: "RETRIEVAL_QUERY" }); + + expect(results).toHaveLength(GOOGLE_EMBED_BATCH_LIMIT + 1); + expect(results.every(r => r?.embedding.length === 
3072)).toBe(true); + expect(fetchSpy).toHaveBeenCalledTimes(2); + }); + + test("retries retryable HTTP responses and respects retry-after", async () => { + const fetchSpy = vi.spyOn(globalThis, "fetch") + .mockResolvedValueOnce(new Response("rate limited", { + status: 429, + headers: { "retry-after": "0" }, + })) + .mockResolvedValueOnce(new Response(JSON.stringify({ + embedding: { values: Array.from({ length: 3072 }, () => 0.3) }, + }), { status: 200 })); + + const embedder = new GoogleAIEmbedder("test-key", 3072); + const result = await embedder.embed("retry me"); + + expect(result).not.toBeNull(); + expect(result?.embedding.length).toBe(3072); + expect(fetchSpy).toHaveBeenCalledTimes(2); + }); + + test("retries transient network errors", async () => { + vi.useFakeTimers(); + const fetchSpy = vi.spyOn(globalThis, "fetch") + .mockRejectedValueOnce(new Error("network down")) + .mockResolvedValueOnce(new Response(JSON.stringify({ + embedding: { values: Array.from({ length: 3072 }, () => 0.4) }, + }), { status: 200 })); + + const embedder = new GoogleAIEmbedder("test-key", 3072); + const pending = embedder.embed("network retry"); + await vi.advanceTimersByTimeAsync(500); + const result = await pending; + + expect(result).not.toBeNull(); + expect(result?.embedding.length).toBe(3072); + expect(fetchSpy).toHaveBeenCalledTimes(2); + }); + + test("batch embed tolerates invalid multimodal input", async () => { + const fetchSpy = vi.spyOn(globalThis, "fetch").mockImplementation(async (_url, init) => { + const body = JSON.parse(String(init?.body)); + expect(body.requests).toHaveLength(1); + return new Response(JSON.stringify({ + embeddings: [{ values: Array.from({ length: 3072 }, () => 0.5) }], + }), { status: 200 }); + }); + + const embedder = new GoogleAIEmbedder("test-key", 3072); + const results = await embedder.embedBatch( + ["ok-text", { filePath: "/tmp/not-supported.txt" }], + { taskType: "RETRIEVAL_DOCUMENT" } + ); + + expect(results).toHaveLength(2); + 
expect(results[0]?.embedding.length).toBe(3072); + expect(results[1]).toBeNull(); + expect(fetchSpy).toHaveBeenCalledTimes(1); + }); +}); diff --git a/test/sdk.test.ts b/test/sdk.test.ts index d246bc46..c6e38d03 100644 --- a/test/sdk.test.ts +++ b/test/sdk.test.ts @@ -177,7 +177,9 @@ describe("collection management", () => { await store.addCollection("notes", { path: notesDir }); const collections = await store.listCollections(); - expect(collections.find(c => c.name === "notes")).toBeDefined(); + const notes = collections.find(c => c.name === "notes"); + expect(notes).toBeDefined(); + expect(notes?.glob_pattern).toBe("**/*.md"); }); test("removeCollection removes existing collection", async () => { @@ -511,7 +513,7 @@ describe("searchLex (BM25)", () => { const title = content.match(/^#\s+(.+)/m)?.[1] || file; internal.insertContent(hash, content, now); - internal.insertDocument("docs", `qmd://docs/${file}`, title, hash, now, now); + internal.insertDocument("docs", `qmd://docs/${file}`, title, hash, "text", now, now); } // Index notes collection @@ -522,7 +524,7 @@ describe("searchLex (BM25)", () => { const title = content.match(/^#\s+(.+)/m)?.[1] || file; internal.insertContent(hash, content, now); - internal.insertDocument("notes", `qmd://notes/${file}`, title, hash, now, now); + internal.insertDocument("notes", `qmd://notes/${file}`, title, hash, "text", now, now); } }); @@ -697,7 +699,7 @@ describe("get and multiGet", () => { const title = content.match(/^#\s+(.+)/m)?.[1] || file; internal.insertContent(hash, content, now); - internal.insertDocument("docs", `qmd://docs/${file}`, title, hash, now, now); + internal.insertDocument("docs", `qmd://docs/${file}`, title, hash, "text", now, now); } }); diff --git a/test/store.test.ts b/test/store.test.ts index d64bc0d1..446ff138 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -2772,7 +2772,7 @@ describe("Content-Addressable Storage", () => { const oldContent = "# First Version"; const oldHash = await 
hashContent(oldContent); store.insertContent(oldHash, oldContent, now); - store.insertDocument(collectionName, "docs/foo.md", "foo", oldHash, now, now); + store.insertDocument(collectionName, "docs/foo.md", "foo", oldHash, "text", now, now); // Simulate file removal during update pass. store.deactivateDocument(collectionName, "docs/foo.md"); @@ -2784,7 +2784,7 @@ describe("Content-Addressable Storage", () => { store.insertContent(newHash, newContent, now); expect(() => { - store.insertDocument(collectionName, "docs/foo.md", "foo", newHash, now, now); + store.insertDocument(collectionName, "docs/foo.md", "foo", newHash, "text", now, now); }).not.toThrow(); const rows = store.db.prepare(`