Skip to content

Commit

Permalink
Python: Introducing vector search to the redis collections (#9664)
Browse files Browse the repository at this point in the history
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users, providing the following
information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->
Adds the vector search pieces to the two Redis collection types.
Features vectorized search and vector text search.

Also includes a slight change to the storage format for Hashsets: the
existing format was wrong because the index was not picking up those
fields. This is fixed now, but if you already have a Redis Hash
Collection running, this change will break it.

Closes #6837 

### Description

<!-- Describe your changes, the overall approach, the underlying design.
These notes will help understanding how your code works. Thanks! -->

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [ ] I didn't break anyone 😄
  • Loading branch information
eavanvalkenburg authored Nov 14, 2024
1 parent 2c8b5c6 commit 962c448
Show file tree
Hide file tree
Showing 14 changed files with 429 additions and 80 deletions.
4 changes: 3 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ qdrant = [
]
redis = [
"redis[hiredis] ~= 5.0",
"types-redis ~= 4.6.0.20240425"
"types-redis ~= 4.6.0.20240425",
"redisvl >= 0.3.6",
]
usearch = [
"usearch ~= 2.9",
Expand Down Expand Up @@ -211,3 +212,4 @@ name = "semantic_kernel"
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


90 changes: 59 additions & 31 deletions python/samples/concepts/memory/new_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,32 @@
import numpy as np

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIEmbeddingPromptExecutionSettings, OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_collection import (
AzureCosmosDBNoSQLCollection,
from semantic_kernel.connectors.ai.open_ai import (
AzureTextEmbedding,
OpenAIEmbeddingPromptExecutionSettings,
OpenAITextEmbedding,
)
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
from semantic_kernel.connectors.memory.azure_cosmos_db import AzureCosmosDBNoSQLCollection
from semantic_kernel.connectors.memory.in_memory import InMemoryVectorCollection
from semantic_kernel.connectors.memory.postgres.postgres_collection import PostgresCollection
from semantic_kernel.connectors.memory.postgres import PostgresCollection
from semantic_kernel.connectors.memory.qdrant import QdrantCollection
from semantic_kernel.connectors.memory.redis import RedisHashsetCollection, RedisJsonCollection
from semantic_kernel.connectors.memory.weaviate.weaviate_collection import WeaviateCollection
from semantic_kernel.connectors.memory.weaviate import WeaviateCollection
from semantic_kernel.data import (
DistanceFunction,
IndexKind,
VectorizedSearchMixin,
VectorSearchFilter,
VectorSearchOptions,
VectorStoreRecordCollection,
VectorStoreRecordDataField,
VectorStoreRecordKeyField,
VectorStoreRecordUtils,
VectorStoreRecordVectorField,
VectorTextSearchMixin,
vectorstoremodel,
)
from semantic_kernel.data.const import DistanceFunction, IndexKind
from semantic_kernel.data.vector_search.vector_search_options import VectorSearchOptions
from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin


def get_data_model_array(index_kind: IndexKind, distance_function: DistanceFunction) -> type:
Expand All @@ -50,11 +54,12 @@ class DataModelArray:
deserialize_function=np.array,
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"
title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title"
tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"

return DataModelArray

Expand All @@ -73,11 +78,12 @@ class DataModelList:
property_type="float",
),
] = None
other: str | None = None
id: Annotated[str, VectorStoreRecordKeyField()] = field(default_factory=lambda: str(uuid4()))
content: Annotated[
str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector", property_type="str")
] = "content1"
title: Annotated[str, VectorStoreRecordDataField(property_type="str", is_full_text_searchable=True)] = "title"
tag: Annotated[str, VectorStoreRecordDataField(property_type="str", is_filterable=True)] = "tag"

return DataModelList

Expand Down Expand Up @@ -121,7 +127,7 @@ class DataModelList:
collection_name=collection_name,
prefix_collection_name_to_key_names=True,
),
"redis_hashset": lambda: RedisHashsetCollection[DataModel](
"redis_hash": lambda: RedisHashsetCollection[DataModel](
data_model_type=DataModel,
collection_name=collection_name,
prefix_collection_name_to_key_names=True,
Expand All @@ -146,6 +152,13 @@ class DataModelList:
}


def print_record(record) -> None:
    """Print the id, content and, when set, the first five vector values of a record.

    Args:
        record: Any object exposing ``id``, ``content`` and ``vector`` attributes.
    """
    print(f" Found id: {record.id}")
    print(f" Content: {record.content}")
    # The vector may be None on a record; only slice it when it is present.
    if record.vector is not None:
        print(f" Vector (first five): {record.vector[:5]}")


async def main(collection: str, use_azure_openai: bool, embedding_model: str):
print("-" * 30)
kernel = Kernel()
Expand All @@ -159,44 +172,59 @@ async def main(collection: str, use_azure_openai: bool, embedding_model: str):
print(f"Creating {collection} collection!")
await record_collection.create_collection_if_not_exists()

record1 = DataModel(content="Semantic Kernel is awesome", id="e6103c03-487f-4d7d-9c23-4723651c17f4")
record1 = DataModel(
content="Semantic Kernel is awesome",
id="e6103c03-487f-4d7d-9c23-4723651c17f4",
title="Semantic Kernel Languages",
tag="general",
)
record2 = DataModel(
content="Semantic Kernel is available in dotnet, python and Java.",
id="09caec77-f7e1-466a-bcec-f1d51c5b15be",
title="Semantic Kernel Languages",
tag="general",
)

print("Adding records!")
records = await VectorStoreRecordUtils(kernel).add_vector_to_records(
[record1, record2], data_model_type=DataModel
)

keys = await record_collection.upsert_batch(records)
print(f" Upserted {keys=}")
print("Getting records!")
results = await record_collection.get_batch([record1.id, record2.id])
if results:
for result in results:
print(f" Found id: {result.id}")
print(f" Content: {result.content}")
if result.vector is not None:
print(f" Vector (first five): {result.vector[:5]}")
for record in results:
print_record(record)
else:
print("Nothing found...")
options = VectorSearchOptions(
vector_field_name="vector",
include_vectors=True,
filter=VectorSearchFilter.equal_to("tag", "general"),
)
if isinstance(record_collection, VectorTextSearchMixin):
print("-" * 30)
print("Using text search")
search_results = await record_collection.text_search("python", options)
if search_results.total_count == 0:
print("\nNothing found...\n")
else:
[print_record(result.record) async for result in search_results.results]
if isinstance(record_collection, VectorizedSearchMixin):
print("-" * 30)
print("Using vectorized search, the distance function is set to cosine_similarity.")
print("This means that the higher the score the more similar.")
print("Using vectorized search, for `python`")
print("The distance function is set to the default of the store.")
search_results = await record_collection.vectorized_search(
vector=(await embedder.generate_raw_embeddings(["python"]))[0],
options=VectorSearchOptions(vector_field_name="vector", include_vectors=True),
vector=(await embedder.generate_raw_embeddings(["python"]))[0], options=options
)
results = [record async for record in search_results.results]
for result in results:
print(f" Found id: {result.record.id}")
print(f" Content: {result.record.content}")
if result.record.vector is not None:
print(f" Vector (first five): {result.record.vector[:5]}")
print(f" Score: {result.score:.4f}")
print("")
if search_results.total_count == 0:
print("\nNothing found...\n")
else:
async for result in search_results.results:
print_record(result.record)
print(f" Score: {result.score:.4f}\n")
print("-" * 30)
print("Deleting collection!")
await record_collection.delete_collection()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
}

DISTANCE_FUNCTION_MAP = {
DistanceFunction.COSINE_SIMILARITY: VectorSearchAlgorithmMetric.COSINE,
DistanceFunction.COSINE_DISTANCE: VectorSearchAlgorithmMetric.COSINE,
DistanceFunction.DOT_PROD: VectorSearchAlgorithmMetric.DOT_PRODUCT,
DistanceFunction.EUCLIDEAN_DISTANCE: VectorSearchAlgorithmMetric.EUCLIDEAN,
DistanceFunction.HAMMING: VectorSearchAlgorithmMetric.HAMMING,
"default": VectorSearchAlgorithmMetric.COSINE,
}

Expand Down
13 changes: 11 additions & 2 deletions python/semantic_kernel/connectors/memory/azure_ai_search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,21 @@ def data_model_definition_to_azure_ai_search_index(
algorithm_configuration_name=algo_name,
)
)
algo_class, algo_params = INDEX_ALGORITHM_MAP[field.index_kind or "default"]
try:
algo_class, algo_params = INDEX_ALGORITHM_MAP[field.index_kind or "default"]
except KeyError as e:
raise ServiceInitializationError(f"Error: {field.index_kind} not found in INDEX_ALGORITHM_MAP.") from e
try:
distance_metric = DISTANCE_FUNCTION_MAP[field.distance_function or "default"]
except KeyError as e:
raise ServiceInitializationError(
f"Error: {field.distance_function} not found in DISTANCE_FUNCTION_MAP."
) from e
search_algos.append(
algo_class(
name=algo_name,
parameters=algo_params(
metric=DISTANCE_FUNCTION_MAP[field.distance_function or "default"],
metric=distance_metric,
),
)
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_collection import (
AzureCosmosDBNoSQLCollection,
)
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_composite_key import (
AzureCosmosDBNoSQLCompositeKey,
)
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_settings import AzureCosmosDBNoSQLSettings
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_store import AzureCosmosDBNoSQLStore

__all__ = [
"AzureCosmosDBNoSQLCollection",
"AzureCosmosDBNoSQLCompositeKey",
"AzureCosmosDBNoSQLSettings",
"AzureCosmosDBNoSQLStore",
]
6 changes: 6 additions & 0 deletions python/semantic_kernel/connectors/memory/redis/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from enum import Enum

from redis.commands.search.indexDefinition import IndexType
from redisvl.schema import StorageType

from semantic_kernel.data.const import DistanceFunction

Expand All @@ -18,6 +19,11 @@ class RedisCollectionTypes(str, Enum):
RedisCollectionTypes.HASHSET: IndexType.HASH,
}

# Maps each Redis collection type to the corresponding redisvl storage type.
STORAGE_TYPE_MAP = {
RedisCollectionTypes.JSON: StorageType.JSON,
RedisCollectionTypes.HASHSET: StorageType.HASH,
}

DISTANCE_FUNCTION_MAP = {
DistanceFunction.COSINE_SIMILARITY: "COSINE",
DistanceFunction.DOT_PROD: "IP",
Expand Down
Loading

0 comments on commit 962c448

Please sign in to comment.