Skip to content

Commit

Permalink
Merge branch 'main' into feat/obsidian-reader
Browse files Browse the repository at this point in the history
  • Loading branch information
jzhao62 authored Nov 30, 2024
2 parents 177c634 + 515f2c1 commit 3bd8e1d
Show file tree
Hide file tree
Showing 3 changed files with 334 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .changeset/stale-parents-perform.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"llamaindex": patch
---

Add vector store for CosmosDB
328 changes: 328 additions & 0 deletions packages/llamaindex/src/vector-store/AzureCosmosDBMongoVectorStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
import type { BaseNode } from "@llamaindex/core/schema";
import { MetadataMode } from "@llamaindex/core/schema";
import { getEnv } from "@llamaindex/env";
import { Collection, Db, MongoClient } from "mongodb";
import {
BaseVectorStore,
type VectorStoreBaseParams,
type VectorStoreQuery,
type VectorStoreQueryResult,
} from "./types.js";
import { metadataDictToNode, nodeToMetadata } from "./utils.js";

/**
 * Similarity metrics for Azure Cosmos DB for MongoDB vCore vector indexes.
 *
 * Declared as a frozen-by-type (`as const`) object so that each value keeps
 * its string literal type; the values are the strings sent to the service as
 * the `similarity` option when an index is created.
 */
export const AzureCosmosDBMongoDBSimilarityType = {
/** Cosine similarity */
COS: "COS",
/** Inner product */
IP: "IP",
/** Euclidean distance */
L2: "L2",
} as const;

/**
 * Union of the values of the `AzureCosmosDBMongoDBSimilarityType` object,
 * i.e. `"COS" | "IP" | "L2"`. Shares its name with the constant so the same
 * identifier can be used both as a value and as a type.
 */
export type AzureCosmosDBMongoDBSimilarityType =
(typeof AzureCosmosDBMongoDBSimilarityType)[keyof typeof AzureCosmosDBMongoDBSimilarityType];

/**
 * Azure Cosmos DB for MongoDB vCore index options.
 *
 * Every field is optional. Which of the tuning fields apply depends on the
 * index type: `numLists` for IVF, `m`/`efConstruction` for HNSW, and
 * `maxDegree`/`lBuild`/`lSearch` for DiskANN.
 */
export type AzureCosmosDBMongoDBIndexOptions = {
/** Kind of vector index: inverted file (`"ivf"`), `"hnsw"` or `"diskann"`. */
readonly indexType?: "ivf" | "hnsw" | "diskann" | undefined;
/**
 * Number of clusters that the inverted file (IVF) index uses to group the vector data.
 * `createIndex` falls back to 100 when unset.
 */
readonly numLists?: number | undefined;
/** Number of dimensions for vector similarity. */
readonly dimensions?: number | undefined;
/** Similarity metric to use with the index. */
readonly similarity?: AzureCosmosDBMongoDBSimilarityType | undefined;
/**
 * The max number of connections per layer with the HNSW index.
 * `createIndex` falls back to 16 when unset.
 */
readonly m?: number | undefined;
/**
 * The size of the dynamic candidate list for constructing the graph with the HNSW index.
 * `createIndex` falls back to 200 when unset.
 */
readonly efConstruction?: number | undefined;
/**
 * Max number of neighbors with the DiskANN index.
 * `createIndex` falls back to 40 when unset.
 */
readonly maxDegree?: number | undefined;
/**
 * L value for index building with the DiskANN index.
 * `createIndex` falls back to 50 when unset.
 */
readonly lBuild?: number | undefined;
/**
 * L value for index searching with the DiskANN index.
 * `createIndex` falls back to 40 when unset.
 */
readonly lSearch?: number | undefined;
};

/**
 * Azure Cosmos DB for MongoDB vCore vector store.
 * To use this, you should have both:
 * - the `mongodb` NPM package installed
 * - a connection string associated with a MongoDB vCore cluster
 *
 * You do not need to create a database or collection; the collection is
 * created on first use (see `ensureCollection`).
 *
 * You also need a vector index on the collection. Create it by calling
 * `createIndex`.
 */
export class AzureCosmosDBMongoDBVectorStore extends BaseVectorStore {
  storesText: boolean = true;
  flatMetadata: boolean = true;

  /** Name of the database holding the collection. */
  dbName: string;

  /** Name of the collection the nodes are stored in. */
  collectionName: string;

  /** Metadata fields that `add` copies to the top level of each stored document. */
  indexedMetadataFields: string[];

  /**
   * The used MongoClient. If not given, a new MongoClient is created based on
   * the AZURE_COSMOSDB_MONGODB_CONNECTION_STRING env variable.
   */
  mongodbClient: MongoClient;

  /** Name of the vector index managed by `createIndex` / `checkIndexExists`. */
  indexName: string;

  /** Document field holding the embedding vector. */
  embeddingKey: string;

  /** Document field holding the node id. */
  idKey: string;

  /** Document field holding the node text. */
  textKey: string;

  /** Document field holding the serialized node metadata. */
  metadataKey: string;

  /** Tuning options applied by `createIndex`. */
  indexOptions: AzureCosmosDBMongoDBIndexOptions;

  /** Lazily resolved collection handle; populated by `ensureCollection`. */
  private collection?: Collection;

  private database: Db;

  /**
   * @param init Store configuration. `mongodbClient` is optional; without it
   * the connection string is read from the environment.
   * @throws Error if neither a client nor a connection string is available.
   */
  constructor(
    init: Partial<AzureCosmosDBMongoDBVectorStore> & {
      dbName: string;
      collectionName: string;
      indexedMetadataFields?: string[];
    } & VectorStoreBaseParams,
  ) {
    super(init);
    if (init.mongodbClient) {
      this.mongodbClient = init.mongodbClient;
    } else {
      const mongoUri = getEnv("AZURE_COSMOSDB_MONGODB_CONNECTION_STRING");
      if (!mongoUri) {
        throw new Error(
          "AzureCosmosDBMongoDBVectorStore client or connection string must be set.",
        );
      }
      this.mongodbClient = new MongoClient(mongoUri);
    }

    this.dbName = init.dbName ?? "documentsDB";
    this.collectionName = init.collectionName ?? "documents";
    this.indexedMetadataFields = init.indexedMetadataFields ?? [];
    this.indexName = init.indexName ?? "vectorSearchIndex";
    this.embeddingKey = init.embeddingKey ?? "vectorContent";
    this.idKey = init.idKey ?? "id";
    this.textKey = init.textKey ?? "text";
    this.metadataKey = init.metadataKey ?? "metadata";
    this.indexOptions = init.indexOptions ?? {};
    this.database = this.mongodbClient.db(this.dbName);
  }

  /** Returns the underlying MongoClient. */
  client() {
    return this.mongodbClient;
  }

  /**
   * Resolves the collection handle, creating the collection if it does not
   * exist yet, and caches it for subsequent calls.
   * @returns The collection used by this store.
   */
  async ensureCollection(): Promise<Collection> {
    if (!this.collection) {
      // createCollection can reject when the collection already exists (e.g.
      // a second store instance or a process restart), so look it up first.
      const existing = await this.database
        .listCollections({ name: this.collectionName }, { nameOnly: true })
        .toArray();

      this.collection =
        existing.length > 0
          ? this.database.collection(this.collectionName)
          : await this.database.createCollection(this.collectionName);
    }

    return this.collection;
  }

  /**
   * Inserts the given nodes (with their embeddings) into the collection.
   * @param nodes Nodes to store; each must already carry an embedding.
   * @returns The ids reported by the driver for the inserted documents.
   */
  async add(nodes: BaseNode[]): Promise<string[]> {
    if (!nodes || nodes.length === 0) {
      return [];
    }

    const dataToInsert = nodes.map((node) => {
      const metadata = nodeToMetadata(
        node,
        true,
        this.textKey,
        this.flatMetadata,
      );

      // Include the specified metadata fields in the top level of the document (to help filter)
      const populatedMetadata: Record<string, unknown> = {};
      for (const field of this.indexedMetadataFields) {
        populatedMetadata[field] = metadata[field];
      }

      return {
        [this.idKey]: node.id_,
        [this.embeddingKey]: node.getEmbedding(),
        [this.textKey]: node.getContent(MetadataMode.NONE) || "",
        [this.metadataKey]: metadata,
        ...populatedMetadata,
      };
    });

    const collection = await this.ensureCollection();
    const insertResult = await collection.insertMany(dataToInsert);
    // NOTE(review): insertedIds are the documents' `_id` values as reported by
    // the driver, not the node ids stored under `idKey` (unless idKey is
    // "_id"). Kept for backward compatibility - confirm what callers expect.
    return Object.values(insertResult.insertedIds).map((id) => String(id));
  }

  /**
   * Removes the documents whose id field (`idKey`) equals the given id.
   * @param id Value of the id field of the documents to remove.
   * @param deleteOptions Options forwarded to the driver's `deleteMany`.
   * @returns A promise that resolves when the documents have been removed.
   */
  async delete(id: string, deleteOptions?: object): Promise<void> {
    const collection = await this.ensureCollection();
    // Filter on the configured id field so it matches what `add` wrote.
    await collection.deleteMany({ [this.idKey]: id }, deleteOptions);
  }

  /**
   * Runs a vector similarity search against the collection.
   * @param query Query carrying the embedding and the number of results
   * (`similarityTopK`, defaulting to 4).
   * @param options Currently unused; accepted for interface compatibility.
   * @returns The matching nodes with their ids and similarity scores.
   * @throws Error if the query has no embedding.
   */
  async query(
    query: VectorStoreQuery,
    options?: object,
  ): Promise<VectorStoreQueryResult> {
    if (!query.queryEmbedding) {
      throw new Error(
        "AzureCosmosDBMongoDBVectorStore.query requires query.queryEmbedding to be set.",
      );
    }

    const pipeline = [
      {
        $search: {
          cosmosSearch: {
            vector: query.queryEmbedding,
            path: this.embeddingKey,
            k: query.similarityTopK ?? 4,
          },
          returnStoredSource: true,
        },
      },
      {
        // The similarity score is only available as search metadata, so it
        // has to be projected explicitly next to the stored document.
        $project: {
          similarityScore: { $meta: "searchScore" },
          document: "$$ROOT",
        },
      },
    ];

    const collection = await this.ensureCollection();
    const cursor = collection.aggregate(pipeline);

    const nodes: BaseNode[] = [];
    const ids: string[] = [];
    const similarities: number[] = [];

    for await (const res of cursor) {
      const stored = res.document;

      const node = metadataDictToNode(stored[this.metadataKey]);
      node.setContent(stored[this.textKey]);

      ids.push(stored[this.idKey]);
      nodes.push(node);
      similarities.push(res.similarityScore);
    }

    return { nodes, similarities, ids };
  }

  /**
   * Creates the vector index named `indexName` (set during instance
   * construction) on the collection, creating the collection first if needed.
   *
   * When using the IVF index type, setting the numLists option correctly is
   * important for achieving good accuracy and performance. You should create
   * an IVF index only after you have loaded a large enough sample of
   * documents to ensure that the centroids for the respective buckets are
   * fairly distributed.
   *
   * @param dimensions Number of dimensions for vector similarity.
   *    The maximum number of supported dimensions is 2000.
   *    If no number is provided, `indexOptions.dimensions` is used, and
   *    1536 if that is not set either.
   * @param indexType Index type for the Mongo vCore index. Defaults to
   *    `indexOptions.indexType`, or "ivf" if that is not set.
   * @param similarity Similarity metric to use with the index. Defaults to
   *    `indexOptions.similarity`, or cosine if that is not set.
   *    Possible options are:
   *    - AzureCosmosDBMongoDBSimilarityType.COS (cosine distance)
   *    - AzureCosmosDBMongoDBSimilarityType.L2 (Euclidean distance)
   *    - AzureCosmosDBMongoDBSimilarityType.IP (inner product)
   * @returns A promise that resolves when the index has been created.
   */
  async createIndex(
    dimensions: number | undefined = undefined,
    indexType: "ivf" | "hnsw" | "diskann" = this.indexOptions.indexType ??
      "ivf",
    similarity: AzureCosmosDBMongoDBSimilarityType = this.indexOptions
      .similarity ?? AzureCosmosDBMongoDBSimilarityType.COS,
  ): Promise<void> {
    const vectorLength = dimensions ?? this.indexOptions.dimensions ?? 1536;

    // Resolve the collection up front: the cached handle is unset until some
    // other method has run, which previously sent `createIndexes: undefined`.
    const collection = await this.ensureCollection();

    const cosmosSearchOptions: Record<string, unknown> = {
      kind: "",
      similarity,
      dimensions: vectorLength,
    };

    if (indexType === "hnsw") {
      cosmosSearchOptions["kind"] = "vector-hnsw";
      cosmosSearchOptions["m"] = this.indexOptions.m ?? 16;
      cosmosSearchOptions["efConstruction"] =
        this.indexOptions.efConstruction ?? 200;
    } else if (indexType === "diskann") {
      cosmosSearchOptions["kind"] = "vector-diskann";
      cosmosSearchOptions["maxDegree"] = this.indexOptions.maxDegree ?? 40;
      cosmosSearchOptions["lBuild"] = this.indexOptions.lBuild ?? 50;
      cosmosSearchOptions["lSearch"] = this.indexOptions.lSearch ?? 40;
    } else {
      // Default to IVF index
      cosmosSearchOptions["kind"] = "vector-ivf";
      cosmosSearchOptions["numLists"] = this.indexOptions.numLists ?? 100;
    }

    const createIndexCommands = {
      createIndexes: collection.collectionName,
      indexes: [
        {
          name: this.indexName,
          key: { [this.embeddingKey]: "cosmosSearch" },
          cosmosSearchOptions,
        },
      ],
    };

    await this.database.command(createIndexCommands);
  }

  /**
   * Checks if the index name specified during instance construction exists
   * on the collection.
   * @returns A promise that resolves to a boolean indicating if the index exists.
   */
  async checkIndexExists(): Promise<boolean> {
    const collection = await this.ensureCollection();
    const indexes = await collection.listIndexes().toArray();
    return indexes.some((index) => index.name === this.indexName);
  }

  /**
   * Deletes the given index if it exists; does nothing otherwise.
   * @param indexName Name of the index to delete. Defaults to the index name
   * specified during instance construction.
   * @returns A promise that resolves when the index has been deleted.
   */
  async deleteIndex(indexName: string = this.indexName): Promise<void> {
    const collection = await this.ensureCollection();
    const indexes = await collection.listIndexes().toArray();
    const indexToDelete = indexes.find((index) => index.name === indexName);
    if (indexToDelete) {
      await collection.dropIndex(indexName);
    }
  }
}
1 change: 1 addition & 0 deletions packages/llamaindex/src/vector-store/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export * from "./AstraDBVectorStore.js";
export * from "./AzureCosmosDBMongoVectorStore.js";
export * from "./AzureCosmosDBNoSqlVectorStore.js";
export * from "./ChromaVectorStore.js";
export * from "./MilvusVectorStore.js";
Expand Down

0 comments on commit 3bd8e1d

Please sign in to comment.