From 7c769db8d1be2f2bb652c13ce8a556d1dfac17e7 Mon Sep 17 00:00:00 2001 From: siwen devbox Date: Wed, 13 Nov 2024 09:35:29 +0000 Subject: [PATCH] add diskann index type --- .../azure_cosmos_db_memory_store.py | 18 +++ .../azure_cosmosdb/mongo_vcore_store_api.py | 72 ++++++++++- .../connectors/memory/azure_cosmosdb/utils.py | 2 + .../test_azure_cosmosdb_memory_store.py | 121 ++++++++++++++++++ 4 files changed, 212 insertions(+), 1 deletion(-) diff --git a/python/semantic_kernel/connectors/memory/azure_cosmosdb/azure_cosmos_db_memory_store.py b/python/semantic_kernel/connectors/memory/azure_cosmosdb/azure_cosmos_db_memory_store.py index 40150463b40e..51199ee94e01 100644 --- a/python/semantic_kernel/connectors/memory/azure_cosmosdb/azure_cosmos_db_memory_store.py +++ b/python/semantic_kernel/connectors/memory/azure_cosmosdb/azure_cosmos_db_memory_store.py @@ -42,6 +42,9 @@ class AzureCosmosDBMemoryStore(MemoryStoreBase): m = None ef_construction = None ef_search = None + max_degree = None + l_build = None + l_search = None def __init__( self, @@ -55,6 +58,9 @@ def __init__( m: int = 16, ef_construction: int = 64, ef_search: int = 40, + max_degree: int = 32, + l_build: int = 50, + l_search: int = 40, ): """Initializes a new instance of the AzureCosmosDBMemoryStore class.""" if vector_dimensions <= 0: @@ -72,6 +78,9 @@ def __init__( self.m = m self.ef_construction = ef_construction self.ef_search = ef_search + self.max_degree = max_degree + self.l_build = l_build + self.l_search = l_search @staticmethod async def create( @@ -84,6 +93,9 @@ async def create( m: int, ef_construction: int, ef_search: int, + max_degree: int, + l_build: int, + l_search: int, index_name: str | None = None, cosmos_connstr: str | None = None, application_name: str | None = None, @@ -115,6 +127,9 @@ async def create( m=m, ef_construction=ef_construction, ef_search=ef_search, + max_degree=max_degree, + l_build=l_build, + l_search=l_search, ) else: raise MemoryConnectorInitializationError(f"API type 
{cosmos_api} is not supported.") @@ -130,6 +145,9 @@ async def create( m, ef_construction, ef_search, + max_degree, + l_build, + l_search, ) await store.create_collection(collection_name) return store diff --git a/python/semantic_kernel/connectors/memory/azure_cosmosdb/mongo_vcore_store_api.py b/python/semantic_kernel/connectors/memory/azure_cosmosdb/mongo_vcore_store_api.py index 338bd9e7a234..cbe615b734b5 100644 --- a/python/semantic_kernel/connectors/memory/azure_cosmosdb/mongo_vcore_store_api.py +++ b/python/semantic_kernel/connectors/memory/azure_cosmosdb/mongo_vcore_store_api.py @@ -32,6 +32,9 @@ class MongoStoreApi(AzureCosmosDBStoreApi): m = None ef_construction = None ef_search = None + max_degree = None + l_build = None + l_search = None """ Args: @@ -55,7 +58,8 @@ class MongoStoreApi(AzureCosmosDBStoreApi): kind: Type of vector index to create. Possible options are: - vector-ivf - - vector-hnsw: available as a preview feature only, + - vector-hnsw + - vector-diskann: available as a preview feature only, to enable visit https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/preview-features m: The max number of connections per layer (16 by default, minimum value is 2, maximum value is 100). Higher m is suitable for datasets @@ -68,6 +72,12 @@ class MongoStoreApi(AzureCosmosDBStoreApi): ef_construction has to be at least 2 * m ef_search: The size of the dynamic candidate list for search (40 by default). A higher value provides better recall at the cost of speed. + max_degree: Max number of neighbors for diskann index. + Default value is 32, range from 20 to 2048. + l_build: Candidate list size (l) used when building the diskann index. + Default value is 50, range from 10 to 500. + l_search: Candidate list size (l) used when searching the diskann index. + Default value is 40, range from 10 to 10000.
database: The Mongo Database object of the azure cosmos db mongo store """ @@ -82,6 +92,9 @@ def __init__( m: int, ef_construction: int, ef_search: int, + max_degree: int, + l_build: int, + l_search: int, database=None, ): """Initializes a new instance of the MongoStoreApi class.""" @@ -95,6 +108,9 @@ def __init__( self.m = m self.ef_construction = ef_construction self.ef_search = ef_search + self.max_degree = max_degree + self.l_build = l_build + self.l_search = l_search @override async def create_collection(self, collection_name: str) -> None: @@ -118,6 +134,15 @@ async def create_collection(self, collection_name: str) -> None: self.similarity, self.vector_dimensions, ) + elif self.kind == CosmosDBVectorSearchType.VECTOR_DISKANN: + create_index_commands = self._get_vector_index_diskann( + collection_name, + self.kind, + self.max_degree, + self.l_build, + self.similarity, + self.vector_dimensions, + ) # invoke the command from the database object self.database.command(create_index_commands) self.collection = self.database[collection_name] @@ -161,6 +186,26 @@ def _get_vector_index_hnsw( ], } + def _get_vector_index_diskann( + self, collection_name: str, kind: str, max_degree: int, l_build: int, similarity: str, dimensions: int + ) -> dict[str, Any]: + return { + "createIndexes": collection_name, + "indexes": [ + { + "name": self.index_name, + "key": {"embedding": "cosmosSearch"}, + "cosmosSearchOptions": { + "kind": kind, + "maxDegree": max_degree, + "lBuild": l_build, + "similarity": similarity, + "dimensions": dimensions, + }, + } + ], + } + @override async def get_collections(self) -> list[str]: return self.database.list_collection_names() @@ -254,6 +299,8 @@ async def get_nearest_matches( pipeline = self._get_pipeline_vector_ivf(embedding.tolist(), limit) elif self.kind == CosmosDBVectorSearchType.VECTOR_HNSW: pipeline = self._get_pipeline_vector_hnsw(embedding.tolist(), limit, self.ef_search) + elif self.kind == CosmosDBVectorSearchType.VECTOR_DISKANN: + 
pipeline = self._get_pipeline_vector_diskann(embedding.tolist(), limit, self.l_search) cursor = self.collection.aggregate(pipeline) @@ -318,6 +365,29 @@ def _get_pipeline_vector_hnsw( ] return pipeline + def _get_pipeline_vector_diskann( + self, embeddings: list[float], k: int = 4, l_search: int = 40 + ) -> list[dict[str, Any]]: + pipeline: list[dict[str, Any]] = [ + { + "$search": { + "cosmosSearch": { + "vector": embeddings, + "path": "embedding", + "k": k, + "lSearch": l_search, + }, + } + }, + { + "$project": { + "similarityScore": {"$meta": "searchScore"}, + "document": "$$ROOT", + } + }, + ] + return pipeline + @override async def get_nearest_match( self, diff --git a/python/semantic_kernel/connectors/memory/azure_cosmosdb/utils.py b/python/semantic_kernel/connectors/memory/azure_cosmosdb/utils.py index 8c0cd782e1af..af818de5ff50 100644 --- a/python/semantic_kernel/connectors/memory/azure_cosmosdb/utils.py +++ b/python/semantic_kernel/connectors/memory/azure_cosmosdb/utils.py @@ -25,3 +25,5 @@ class CosmosDBVectorSearchType(str, Enum): """IVF vector index""" VECTOR_HNSW = "vector-hnsw" """HNSW vector index""" + VECTOR_DISKANN = "vector-diskann" + """DISKANN vector index""" diff --git a/python/tests/integration/memory/memory_stores/test_azure_cosmosdb_memory_store.py b/python/tests/integration/memory/memory_stores/test_azure_cosmosdb_memory_store.py index a26e1586bfee..e6286641e139 100644 --- a/python/tests/integration/memory/memory_stores/test_azure_cosmosdb_memory_store.py +++ b/python/tests/integration/memory/memory_stores/test_azure_cosmosdb_memory_store.py @@ -29,6 +29,7 @@ application_name = "PYTHON_SEMANTIC_KERNEL" cosmos_api = "mongo-vcore" index_name = "sk_test_vector_search_index" +index_name_vector_diskann = "sk_test_vector_search_index_diskann" vector_dimensions = 1536 num_lists = 1 similarity = CosmosDBSimilarityType.COS @@ -109,6 +110,9 @@ async def azurecosmosdb_memorystore() -> MemoryStoreBase: m=m, ef_construction=ef_construction, 
ef_search=ef_search, + max_degree=50, + l_build=40, + l_search=100, ) @@ -199,3 +203,120 @@ async def test_get_nearest_matches( assert all(result[i][0]._id in [memory_record1._id, memory_record2._id] for i in range(2)) await store.remove_batch("", [memory_record1._id, memory_record2._id, memory_record3._id]) + + +""" + Test cases for the similarity algorithm using vector-diskann +""" + + +async def azurecosmosdb_memorystore_vector_diskann() -> MemoryStoreBase: + return await AzureCosmosDBMemoryStore.create( + cosmos_connstr=cosmos_connstr, + application_name=application_name, + cosmos_api=cosmos_api, + database_name=database_name, + collection_name=collection_name, + index_name=index_name_vector_diskann, + vector_dimensions=vector_dimensions, + num_lists=num_lists, + similarity=similarity, + kind=CosmosDBVectorSearchType.VECTOR_DISKANN, + m=m, + ef_construction=ef_construction, + ef_search=ef_search, + max_degree=50, + l_build=40, + l_search=100, + ) + + +@pytest.mark.asyncio +@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set") +async def test_create_get_drop_exists_collection_vector_diskann(): + store = await azurecosmosdb_memorystore_vector_diskann() + test_collection = "test_collection" + + await store.create_collection(test_collection) + + collection_list = await store.get_collections() + assert test_collection in collection_list + + await store.delete_collection(test_collection) + + result = await store.does_collection_exist(test_collection) + assert result is False + + +@pytest.mark.asyncio +@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set") +async def test_upsert_and_get_and_remove_vector_diskann( + memory_record1: MemoryRecord, +): + store = await azurecosmosdb_memorystore_vector_diskann() + doc_id = await store.upsert("", memory_record1) + assert doc_id == memory_record1._id + + result = await store.get("", memory_record1._id, with_embedding=True) + + assert result is not None + 
assert result._id == memory_record1._id + assert all(result._embedding[i] == memory_record1._embedding[i] for i in range(len(result._embedding))) + + await store.remove("", memory_record1._id) + + +@pytest.mark.asyncio +@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set") +async def test_upsert_batch_and_get_batch_remove_batch_vector_diskann( + memory_record2: MemoryRecord, memory_record3: MemoryRecord +): + store = await azurecosmosdb_memorystore_vector_diskann() + doc_ids = await store.upsert_batch("", [memory_record2, memory_record3]) + assert len(doc_ids) == 2 + assert all(doc_id in [memory_record2._id, memory_record3._id] for doc_id in doc_ids) + + results = await store.get_batch("", [memory_record2._id, memory_record3._id], with_embeddings=True) + + assert len(results) == 2 + assert all(result._id in [memory_record2._id, memory_record3._id] for result in results) + + await store.remove_batch("", [memory_record2._id, memory_record3._id]) + + +@pytest.mark.asyncio +@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set") +async def test_get_nearest_match_vector_diskann(memory_record1: MemoryRecord, memory_record2: MemoryRecord): + store = await azurecosmosdb_memorystore_vector_diskann() + await store.upsert_batch("", [memory_record1, memory_record2]) + test_embedding = memory_record1.embedding.copy() + test_embedding[0] = test_embedding[0] + 0.1 + + result = await store.get_nearest_match( + collection_name, test_embedding, min_relevance_score=0.0, with_embedding=True + ) + + assert result is not None + assert result[0]._id == memory_record1._id + assert all(result[0]._embedding[i] == memory_record1._embedding[i] for i in range(len(result[0]._embedding))) + + await store.remove_batch("", [memory_record1._id, memory_record2._id]) + + +@pytest.mark.asyncio +@pytest.mark.skipif(skip_test, reason="Skipping test because AZCOSMOS_CONNSTR is not set") +async def 
test_get_nearest_matches_vector_diskann( + memory_record1: MemoryRecord, + memory_record2: MemoryRecord, + memory_record3: MemoryRecord, +): + store = await azurecosmosdb_memorystore_vector_diskann() + await store.upsert_batch("", [memory_record1, memory_record2, memory_record3]) + test_embedding = memory_record2.embedding.copy() + test_embedding[0] = test_embedding[4] + 0.1 + + result = await store.get_nearest_matches("", test_embedding, limit=2, min_relevance_score=0.0, with_embeddings=True) + assert len(result) == 2 + assert all(result[i][0]._id in [memory_record1._id, memory_record2._id] for i in range(2)) + + await store.remove_batch("", [memory_record1._id, memory_record2._id, memory_record3._id])