-
Notifications
You must be signed in to change notification settings - Fork 375
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into feat/obsidian-reader
- Loading branch information
Showing
3 changed files
with
334 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
--- | ||
"llamaindex": patch | ||
--- | ||
|
||
Add vector store for CosmosDB |
328 changes: 328 additions & 0 deletions
328
packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,328 @@ | ||
import type { BaseNode } from "@llamaindex/core/schema"; | ||
import { MetadataMode } from "@llamaindex/core/schema"; | ||
import { getEnv } from "@llamaindex/env"; | ||
import { Collection, Db, MongoClient } from "mongodb"; | ||
import { | ||
BaseVectorStore, | ||
type VectorStoreBaseParams, | ||
type VectorStoreQuery, | ||
type VectorStoreQueryResult, | ||
} from "./types.js"; | ||
import { metadataDictToNode, nodeToMetadata } from "./utils.js"; | ||
|
||
/**
 * Similarity metrics supported by Azure Cosmos DB for MongoDB vCore vector
 * indexes. The values are the literal strings sent as the `similarity`
 * option of a `cosmosSearch` index.
 */
export const AzureCosmosDBMongoDBSimilarityType = {
  /** Cosine similarity */
  COS: "COS",
  /** Inner product */
  IP: "IP",
  /** Euclidean distance */
  L2: "L2",
} as const;

/**
 * Union of the string literals declared in
 * `AzureCosmosDBMongoDBSimilarityType` (`"COS" | "IP" | "L2"`).
 */
export type AzureCosmosDBMongoDBSimilarityType =
  (typeof AzureCosmosDBMongoDBSimilarityType)[keyof typeof AzureCosmosDBMongoDBSimilarityType];
|
||
/**
 * Azure Cosmos DB for MongoDB vCore index options.
 *
 * Every field is optional. Which fields are relevant depends on the index
 * type: `numLists` for IVF, `m` / `efConstruction` for HNSW, and
 * `maxDegree` / `lBuild` / `lSearch` for DiskANN.
 */
export type AzureCosmosDBMongoDBIndexOptions = {
  /** Kind of vector index: IVF, HNSW or DiskANN. */
  readonly indexType?: "ivf" | "hnsw" | "diskann" | undefined;
  /** Number of clusters that the inverted file (IVF) index uses to group the vector data. */
  readonly numLists?: number | undefined;
  /** Number of dimensions for vector similarity. */
  readonly dimensions?: number | undefined;
  /** Similarity metric to use with the index. */
  readonly similarity?: AzureCosmosDBMongoDBSimilarityType | undefined;
  /** The max number of connections per layer with the HNSW index. */
  readonly m?: number | undefined;
  /** The size of the dynamic candidate list for constructing the graph with the HNSW index. */
  readonly efConstruction?: number | undefined;
  /** Max number of neighbors with the DiskANN index. */
  readonly maxDegree?: number | undefined;
  /** L value for index building with the DiskANN index. */
  readonly lBuild?: number | undefined;
  /** L value for index searching with the DiskANN index. */
  readonly lSearch?: number | undefined;
};
|
||
/**
 * Azure Cosmos DB for MongoDB vCore vector store.
 *
 * To use this, you need both:
 * - the `mongodb` NPM package installed
 * - a connection string associated with a MongoDB vCore cluster
 *
 * The collection does not need to exist beforehand: it is created on first
 * use by `ensureCollection`.
 *
 * A vector index on the embedding field is also required. This class does
 * not create it implicitly; call `createIndex` (optionally guarded by
 * `checkIndexExists`) once the collection has been populated.
 */
export class AzureCosmosDBMongoDBVectorStore extends BaseVectorStore {
  storesText: boolean = true;
  flatMetadata: boolean = true;

  /** Name of the database holding the collection. */
  dbName: string;

  /** Name of the collection the nodes are stored in. */
  collectionName: string;

  /**
   * Metadata fields that are additionally copied to the top level of each
   * stored document (to help filtering).
   */
  indexedMetadataFields: string[];

  /**
   * The MongoClient in use. If not given, a new MongoClient is created from
   * the AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
   */
  mongodbClient: MongoClient;

  /** Name of the vector index managed by the index helper methods. */
  indexName: string;

  /** Document field holding the embedding vector. */
  embeddingKey: string;

  /** Document field holding the node id. */
  idKey: string;

  /** Document field holding the node text. */
  textKey: string;

  /** Document field holding the serialized node metadata. */
  metadataKey: string;

  /** Tuning options consulted by `createIndex`. */
  indexOptions: AzureCosmosDBMongoDBIndexOptions;

  /** Lazily resolved collection handle; see `ensureCollection`. */
  private collection?: Collection;

  /** Database handle used to run the index creation command. */
  private database: Db;

  /**
   * @param init Store configuration. `mongodbClient` is optional; when it is
   * omitted the connection string is read from the
   * AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
   * @throws Error when neither a client nor a connection string is available.
   */
  constructor(
    init: Partial<AzureCosmosDBMongoDBVectorStore> & {
      dbName: string;
      collectionName: string;
      indexedMetadataFields?: string[];
    } & VectorStoreBaseParams,
  ) {
    super(init);
    if (init.mongodbClient) {
      this.mongodbClient = init.mongodbClient;
    } else {
      const mongoUri = getEnv("AZURE_COSMOSDB_MONGODB_CONNECTION_STRING");
      if (!mongoUri) {
        throw new Error(
          "AzureCosmosDBMongoDBVectorStore client or connection string must be set.",
        );
      }
      this.mongodbClient = new MongoClient(mongoUri);
    }

    this.dbName = init.dbName ?? "documentsDB";
    this.collectionName = init.collectionName ?? "documents";
    this.indexedMetadataFields = init.indexedMetadataFields ?? [];
    this.indexName = init.indexName ?? "vectorSearchIndex";
    this.embeddingKey = init.embeddingKey ?? "vectorContent";
    this.idKey = init.idKey ?? "id";
    this.textKey = init.textKey ?? "text";
    this.metadataKey = init.metadataKey ?? "metadata";
    this.indexOptions = init.indexOptions ?? {};
    this.database = this.mongodbClient.db(this.dbName);
  }

  /** @returns The underlying MongoClient. */
  client() {
    return this.mongodbClient;
  }

  /**
   * Resolves the collection handle, creating the collection when it does not
   * exist yet. The handle is cached on the instance after the first call.
   *
   * The collection is looked up before it is created because the `create`
   * command fails when the namespace already exists, which would otherwise
   * break every new process pointed at an existing database.
   *
   * @returns The collection used by this store.
   */
  async ensureCollection(): Promise<Collection> {
    if (!this.collection) {
      const db = this.mongodbClient.db(this.dbName);
      const existing = await db
        .listCollections({ name: this.collectionName })
        .toArray();

      this.collection =
        existing.length > 0
          ? db.collection(this.collectionName)
          : await db.createCollection(this.collectionName);
    }

    return this.collection;
  }

  /**
   * Inserts the given nodes into the collection.
   *
   * Each node becomes one document holding its id, embedding, text and
   * metadata under the configured keys, plus a top-level copy of every
   * field listed in `indexedMetadataFields`.
   *
   * @param nodes Nodes to store; each must carry an embedding.
   * @returns The stringified ids reported by the insert (`insertedIds`), or
   * an empty array when there is nothing to insert.
   */
  async add(nodes: BaseNode[]): Promise<string[]> {
    if (!nodes || nodes.length === 0) {
      return [];
    }

    const dataToInsert = nodes.map((node) => {
      // NOTE(review): the `true` flag presumably strips the text from the
      // metadata payload; confirm against `nodeToMetadata`.
      const metadata = nodeToMetadata(
        node,
        true,
        this.textKey,
        this.flatMetadata,
      );

      // Include the specified metadata fields in the top level of the document (to help filter)
      const populatedMetadata: Record<string, unknown> = {};
      for (const field of this.indexedMetadataFields) {
        populatedMetadata[field] = metadata[field];
      }

      return {
        [this.idKey]: node.id_,
        [this.embeddingKey]: node.getEmbedding(),
        [this.textKey]: node.getContent(MetadataMode.NONE) || "",
        [this.metadataKey]: metadata,
        ...populatedMetadata,
      };
    });

    const collection = await this.ensureCollection();
    const insertResult = await collection.insertMany(dataToInsert);
    return Object.values(insertResult.insertedIds).map((id) => String(id));
  }

  /**
   * Removes the documents whose id field matches the given id.
   *
   * The filter uses the configured `idKey`, i.e. the same field `add`
   * writes the node id to.
   *
   * @param id Id of the documents to remove.
   * @param deleteOptions Options forwarded to the driver's `deleteMany`.
   * @returns A promise that resolves when the documents have been removed.
   */
  async delete(id: string, deleteOptions?: object): Promise<void> {
    const collection = await this.ensureCollection();
    await collection.deleteMany(
      {
        [this.idKey]: id,
      },
      deleteOptions,
    );
  }

  /**
   * Runs a vector similarity search against the collection.
   *
   * @param query Query carrying the embedding and the number of results
   * (`similarityTopK`, defaulting to 4).
   * @param options Currently unused.
   * @returns The matching nodes together with their ids and similarity
   * scores, in the order returned by the server.
   */
  async query(
    query: VectorStoreQuery,
    options?: object,
  ): Promise<VectorStoreQueryResult> {
    const pipeline = [
      {
        $search: {
          cosmosSearch: {
            vector: query.queryEmbedding,
            path: this.embeddingKey,
            k: query.similarityTopK ?? 4,
          },
          returnStoredSource: true,
        },
      },
      {
        // The search score is only available as stage metadata, so it has to
        // be projected explicitly next to the stored document.
        $project: {
          similarityScore: { $meta: "searchScore" },
          document: "$$ROOT",
        },
      },
    ];

    const collection = await this.ensureCollection();
    const cursor = collection.aggregate(pipeline);

    const nodes: BaseNode[] = [];
    const ids: string[] = [];
    const similarities: number[] = [];

    for await (const res of cursor) {
      const stored = res.document;

      const node = metadataDictToNode(stored[this.metadataKey]);
      node.setContent(stored[this.textKey]);

      ids.push(stored[this.idKey]);
      nodes.push(node);
      similarities.push(res.similarityScore);
    }

    return {
      nodes,
      similarities,
      ids,
    };
  }

  /**
   * Creates the vector index named `indexName` on the embedding field of
   * the collection. The collection is created first if needed.
   *
   * Each argument, when omitted, falls back to the matching entry of
   * `indexOptions` and then to a built-in default. The tuning parameters of
   * the selected index type are always taken from `indexOptions`.
   *
   * For IVF indexes, setting `numLists` correctly is important for good
   * accuracy and performance; create the index only after loading a large
   * enough sample of documents so that the centroids of the respective
   * buckets are fairly distributed.
   *
   * @param dimensions Number of dimensions for vector similarity.
   * Defaults to `indexOptions.dimensions`, then 1536.
   * @param indexType Index type for the Mongo vCore index.
   * Defaults to `indexOptions.indexType`, then "ivf".
   * @param similarity Similarity metric to use with the index.
   * Defaults to `indexOptions.similarity`, then COS. Possible options are:
   *   - AzureCosmosDBMongoDBSimilarityType.COS (cosine distance)
   *   - AzureCosmosDBMongoDBSimilarityType.L2 (Euclidean distance)
   *   - AzureCosmosDBMongoDBSimilarityType.IP (inner product)
   * @returns A promise that resolves when the index has been created.
   */
  async createIndex(
    dimensions?: number,
    indexType?: "ivf" | "hnsw" | "diskann",
    similarity?: AzureCosmosDBMongoDBSimilarityType,
  ): Promise<void> {
    const vectorLength = dimensions ?? this.indexOptions.dimensions ?? 1536;
    const resolvedIndexType = indexType ?? this.indexOptions.indexType ?? "ivf";
    const resolvedSimilarity =
      similarity ??
      this.indexOptions.similarity ??
      AzureCosmosDBMongoDBSimilarityType.COS;

    let cosmosSearchOptions: Record<string, unknown>;
    if (resolvedIndexType === "hnsw") {
      cosmosSearchOptions = {
        kind: "vector-hnsw",
        similarity: resolvedSimilarity,
        dimensions: vectorLength,
        m: this.indexOptions.m ?? 16,
        efConstruction: this.indexOptions.efConstruction ?? 200,
      };
    } else if (resolvedIndexType === "diskann") {
      cosmosSearchOptions = {
        kind: "vector-diskann",
        similarity: resolvedSimilarity,
        dimensions: vectorLength,
        maxDegree: this.indexOptions.maxDegree ?? 40,
        lBuild: this.indexOptions.lBuild ?? 50,
        lSearch: this.indexOptions.lSearch ?? 40,
      };
    } else {
      // Default to IVF index
      cosmosSearchOptions = {
        kind: "vector-ivf",
        similarity: resolvedSimilarity,
        dimensions: vectorLength,
        numLists: this.indexOptions.numLists ?? 100,
      };
    }

    // Resolve the collection first: the cached handle is unset until some
    // method has called ensureCollection, and the command needs its name.
    const collection = await this.ensureCollection();

    const createIndexCommands = {
      createIndexes: collection.collectionName,
      indexes: [
        {
          name: this.indexName,
          key: { [this.embeddingKey]: "cosmosSearch" },
          cosmosSearchOptions,
        },
      ],
    };

    await this.database.command(createIndexCommands);
  }

  /**
   * Checks whether the index named `indexName` exists on the collection.
   * @returns A promise that resolves to a boolean indicating if the index exists.
   */
  async checkIndexExists(): Promise<boolean> {
    const collection = await this.ensureCollection();
    const indexes = await collection.listIndexes().toArray();
    return indexes.some((index) => index.name === this.indexName);
  }

  /**
   * Deletes the given index if it exists; does nothing otherwise.
   * @param indexName Name of the index to drop. Defaults to the index name
   * configured on this instance.
   * @returns A promise that resolves when the index has been deleted.
   */
  async deleteIndex(indexName: string = this.indexName): Promise<void> {
    const collection = await this.ensureCollection();
    const indexes = await collection.listIndexes().toArray();
    if (indexes.some((index) => index.name === indexName)) {
      await collection.dropIndex(indexName);
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters