Python: Introducing vector search to the redis collections (#9664)

### Motivation and Context

Adds the vector search pieces to the two Redis collection types. Features vectorized search and vector text search. Also features a slight change to the storage format for Hashsets: the existing format was wrong because the index was not picking up those fields. This is fixed now, but if you have a Redis Hash Collection running, this change will break it. Closes #6837

### Description

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [ ] I didn't break anyone 😄
microsoft · Nov 14, 2024 · 962c448 · 962c448
1 parent 2c8b5c6
commit 962c448
Show file tree

Hide file tree

Showing 14 changed files with 429 additions and 80 deletions.
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -101,7 +101,8 @@ qdrant = [
 ]
 redis = [
     "redis[hiredis] ~= 5.0",
-    "types-redis ~= 4.6.0.20240425"
+    "types-redis ~= 4.6.0.20240425",
+    "redisvl >= 0.3.6",
 ]
 usearch = [
     "usearch ~= 2.9",
@@ -211,3 +212,4 @@ name = "semantic_kernel"
 requires = ["flit-core >= 3.9,<4.0"]
 build-backend = "flit_core.buildapi"
 
+
diff --git a/python/samples/concepts/memory/new_memory.py b/python/samples/concepts/memory/new_memory.py
@@ -10,28 +10,32 @@
 import numpy as np
 
 from semantic_kernel import Kernel
-from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding
-from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
-from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
-from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_collection import (
-    AzureCosmosDBNoSQLCollection,
+from semantic_kernel.connectors.ai.open_ai import (
+    AzureTextEmbedding,
+    OpenAIEmbeddingPromptExecutionSettings,
+    OpenAITextEmbedding,
 )
+from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
+from semantic_kernel.connectors.memory.azure_cosmos_db import AzureCosmosDBNoSQLCollection
 from semantic_kernel.connectors.memory.in_memory import InMemoryVectorCollection
-from semantic_kernel.connectors.memory.postgres.postgres_collection import PostgresCollection
+from semantic_kernel.connectors.memory.postgres import PostgresCollection
 from semantic_kernel.connectors.memory.qdrant import QdrantCollection
 from semantic_kernel.connectors.memory.redis import RedisHashsetCollection, RedisJsonCollection
-from semantic_kernel.connectors.memory.weaviate.weaviate_collection import WeaviateCollection
+from semantic_kernel.connectors.memory.weaviate import WeaviateCollection
 from semantic_kernel.data import (
+    DistanceFunction,
+    IndexKind,
+    VectorizedSearchMixin,
+    VectorSearchFilter,
+    VectorSearchOptions,
     VectorStoreRecordCollection,
     VectorStoreRecordDataField,
     VectorStoreRecordKeyField,
     VectorStoreRecordUtils,
     VectorStoreRecordVectorField,
+    VectorTextSearchMixin,
     vectorstoremodel,
 )
-from semantic_kernel.data.const import DistanceFunction, IndexKind
-from semantic_kernel.data.vector_search.vector_search_options import VectorSearchOptions
-from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
 
 
 def get_data_model_array(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
@@ -50,11 +54,12 @@ class DataModelArray:
                 deserialize_function=np.array,
             ),
         ] = None
-        other: str | None = None
         id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
         content: Annotated[
             str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
         ] = "content1"
+        title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title"
+        tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"
 
     return DataModelArray
 
@@ -73,11 +78,12 @@ class DataModelList:
                 property_type="float",
             ),
         ] = None
-        other: str | None = None
         id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
         content: Annotated[
             str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
         ] = "content1"
+        title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title"
+        tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"
 
     return DataModelList
 
@@ -121,7 +127,7 @@ class DataModelList:
         collection_name=collection_name,
         prefix_collection_name_to_key_names=True,
     ),
-    "redis_hashset": lambda: RedisHashsetCollection[DataModel](
+    "redis_hash": lambda: RedisHashsetCollection[DataModel](
         data_model_type=DataModel,
         collection_name=collection_name,
         prefix_collection_name_to_key_names=True,
@@ -146,6 +152,13 @@ class DataModelList:
 }
 
 
+def print_record(record):
+    print(f"  Found id: {record.id}")
+    print(f"    Content: {record.content}")
+    if record.vector is not None:
+        print(f"    Vector (first five): {record.vector[:5]}")
+
+
 async def main(collection: str, use_azure_openai: bool, embedding_model: str):
     print("-" * 30)
     kernel = Kernel()
@@ -159,44 +172,59 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
         print(f"Creating {collection} collection!")
         await record_collection.create_collection_if_not_exists()
 
-        record1 = DataModel(content="Semantic Kernel is awesome", id="e6103c03-487f-4d7d-9c23-4723651c17f4")
+        record1 = DataModel(
+            content="Semantic Kernel is awesome",
+            id="e6103c03-487f-4d7d-9c23-4723651c17f4",
+            title="Semantic Kernel Languages",
+            tag="general",
+        )
         record2 = DataModel(
             content="Semantic Kernel is available in dotnet, python and Java.",
             id="09caec77-f7e1-466a-bcec-f1d51c5b15be",
+            title="Semantic Kernel Languages",
+            tag="general",
         )
 
         print("Adding records!")
         records = await VectorStoreRecordUtils(kernel).add_vector_to_records(
             [record1, record2], data_model_type=DataModel
         )
+
         keys = await record_collection.upsert_batch(records)
         print(f"    Upserted {keys=}")
         print("Getting records!")
         results = await record_collection.get_batch([record1.id, record2.id])
         if results:
-            for result in results:
-                print(f"  Found id: {result.id}")
-                print(f"    Content: {result.content}")
-                if result.vector is not None:
-                    print(f"    Vector (first five): {result.vector[:5]}")
+            for record in results:
+                print_record(record)
         else:
             print("Nothing found...")
+        options = VectorSearchOptions(
+            vector_field_name="vector",
+            include_vectors=True,
+            filter=VectorSearchFilter.equal_to("tag", "general"),
+        )
+        if isinstance(record_collection, VectorTextSearchMixin):
+            print("-" * 30)
+            print("Using text search")
+            search_results = await record_collection.text_search("python", options)
+            if search_results.total_count == 0:
+                print("\nNothing found...\n")
+            else:
+                [print_record(result.record) async for result in search_results.results]
         if isinstance(record_collection, VectorizedSearchMixin):
             print("-" * 30)
-            print("Using vectorized search, the distance function is set to cosine_similarity.")
-            print("This means that the higher the score the more similar.")
+            print("Using vectorized search, for `python`")
+            print("The distance function is set to the default of the store.")
             search_results = await record_collection.vectorized_search(
-                vector=(await embedder.generate_raw_embeddings(["python"]))[0],
-                options=VectorSearchOptions(vector_field_name="vector", include_vectors=True),
+                vector=(await embedder.generate_raw_embeddings(["python"]))[0], options=options
             )
-            results = [record async for record in search_results.results]
-            for result in results:
-                print(f"  Found id: {result.record.id}")
-                print(f"    Content: {result.record.content}")
-                if result.record.vector is not None:
-                    print(f"    Vector (first five): {result.record.vector[:5]}")
-                print(f"  Score: {result.score:.4f}")
-                print("")
+            if search_results.total_count == 0:
+                print("\nNothing found...\n")
+            else:
+                async for result in search_results.results:
+                    print_record(result.record)
+                    print(f"  Score: {result.score:.4f}\n")
         print("-" * 30)
         print("Deleting collection!")
         await record_collection.delete_collection()

diff --git a/python/semantic_kernel/connectors/memory/azure_ai_search/const.py b/python/semantic_kernel/connectors/memory/azure_ai_search/const.py
@@ -18,9 +18,10 @@
 }
 
 DISTANCE_FUNCTION_MAP = {
-    DistanceFunction.COSINE_SIMILARITY: VectorSearchAlgorithmMetric.COSINE,
+    DistanceFunction.COSINE_DISTANCE: VectorSearchAlgorithmMetric.COSINE,
     DistanceFunction.DOT_PROD: VectorSearchAlgorithmMetric.DOT_PRODUCT,
     DistanceFunction.EUCLIDEAN_DISTANCE: VectorSearchAlgorithmMetric.EUCLIDEAN,
+    DistanceFunction.HAMMING: VectorSearchAlgorithmMetric.HAMMING,
     "default": VectorSearchAlgorithmMetric.COSINE,
 }
 

diff --git a/python/semantic_kernel/connectors/memory/azure_ai_search/utils.py b/python/semantic_kernel/connectors/memory/azure_ai_search/utils.py
@@ -145,12 +145,21 @@ def data_model_definition_to_azure_ai_search_index(
                     algorithm_configuration_name=algo_name,
                 )
             )
-            algo_class, algo_params = INDEX_ALGORITHM_MAP[field.index_kind or "default"]
+            try:
+                algo_class, algo_params = INDEX_ALGORITHM_MAP[field.index_kind or "default"]
+            except KeyError as e:
+                raise ServiceInitializationError(f"Error: {field.index_kind} not found in INDEX_ALGORITHM_MAP.") from e
+            try:
+                distance_metric = DISTANCE_FUNCTION_MAP[field.distance_function or "default"]
+            except KeyError as e:
+                raise ServiceInitializationError(
+                    f"Error: {field.distance_function} not found in DISTANCE_FUNCTION_MAP."
+                ) from e
             search_algos.append(
                 algo_class(
                     name=algo_name,
                     parameters=algo_params(
-                        metric=DISTANCE_FUNCTION_MAP[field.distance_function or "default"],
+                        metric=distance_metric,
                     ),
                 )
             )

diff --git a/python/semantic_kernel/connectors/memory/azure_cosmos_db/__init__.py b/python/semantic_kernel/connectors/memory/azure_cosmos_db/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_collection import (
+    AzureCosmosDBNoSQLCollection,
+)
+from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_composite_key import (
+    AzureCosmosDBNoSQLCompositeKey,
+)
+from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_settings import AzureCosmosDBNoSQLSettings
+from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_store import AzureCosmosDBNoSQLStore
+
+__all__ = [
+    "AzureCosmosDBNoSQLCollection",
+    "AzureCosmosDBNoSQLCompositeKey",
+    "AzureCosmosDBNoSQLSettings",
+    "AzureCosmosDBNoSQLStore",
+]
diff --git a/python/semantic_kernel/connectors/memory/redis/const.py b/python/semantic_kernel/connectors/memory/redis/const.py
@@ -4,6 +4,7 @@
 from enum import Enum
 
 from redis.commands.search.indexDefinition import IndexType
+from redisvl.schema import StorageType
 
 from semantic_kernel.data.const import DistanceFunction
 
@@ -18,6 +19,11 @@ class RedisCollectionTypes(str, Enum):
     RedisCollectionTypes.HASHSET: IndexType.HASH,
 }
 
+STORAGE_TYPE_MAP = {
+    RedisCollectionTypes.JSON: StorageType.JSON,
+    RedisCollectionTypes.HASHSET: StorageType.HASH,
+}
+
 DISTANCE_FUNCTION_MAP = {
     DistanceFunction.COSINE_SIMILARITY: "COSINE",
     DistanceFunction.DOT_PROD: "IP",