11 changes: 11 additions & 0 deletions comps/dataprep/src/integrations/redis.py
@@ -52,6 +52,10 @@
TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600))
SEARCH_BATCH_SIZE = int(os.getenv("SEARCH_BATCH_SIZE", 10))

# Vector Schema Configuration
DEFAULT_VECTOR_SCHEMA = {"algorithm": "HNSW", "m": 16, "ef_construction": 200}
VECTOR_SCHEMA = os.getenv("VECTOR_SCHEMA", json.dumps(DEFAULT_VECTOR_SCHEMA))

# Redis Connection Information
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
@@ -200,6 +204,12 @@ async def ingest_chunks_to_redis(file_name: str, chunks: List, embedder, index_n

# if data will be saved to a different index name than the default one
ingest_index_name = index_name if index_name else INDEX_NAME
# Parse vector schema
try:
vector_schema = json.loads(VECTOR_SCHEMA)
except json.JSONDecodeError as e:
logger.error(f"Invalid VECTOR_SCHEMA format: {e}")
vector_schema = DEFAULT_VECTOR_SCHEMA

file_ids = []
for i in range(0, num_chunks, batch_size):
@@ -214,6 +224,7 @@ async def ingest_chunks_to_redis(file_name: str, chunks: List, embedder, index_n
embedding=embedder,
index_name=ingest_index_name,
redis_url=REDIS_URL,
vector_schema=vector_schema,
)
if logflag:
logger.info(f"[ redis ingest chunks ] keys: {keys}")
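For context, a minimal sketch of how the new `VECTOR_SCHEMA` knob can be exercised. The `FLAT` override below is purely illustrative and not part of this PR; it only demonstrates the same parse-with-fallback pattern that `ingest_chunks_to_redis` now uses.

```python
import json
import os

# Hypothetical override: switch the dataprep index from the HNSW default to a FLAT index.
os.environ["VECTOR_SCHEMA"] = json.dumps({"algorithm": "FLAT"})

DEFAULT_VECTOR_SCHEMA = {"algorithm": "HNSW", "m": 16, "ef_construction": 200}
VECTOR_SCHEMA = os.getenv("VECTOR_SCHEMA", json.dumps(DEFAULT_VECTOR_SCHEMA))

# Same parse-with-fallback pattern as in ingest_chunks_to_redis():
try:
    vector_schema = json.loads(VECTOR_SCHEMA)
except json.JSONDecodeError:
    vector_schema = DEFAULT_VECTOR_SCHEMA

print(vector_schema)  # {'algorithm': 'FLAT'}
```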
2 changes: 2 additions & 0 deletions comps/retrievers/src/integrations/config.py
@@ -46,7 +46,9 @@ def get_boolean_env_var(var_name, default_value=False):
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1")
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
BRIDGE_TOWER_EMBEDDING = os.getenv("BRIDGE_TOWER_EMBEDDING", False)

HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
ENABLE_SCHEMA = get_boolean_env_var("ENABLE_SCHEMA", False)

# OpenAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
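For reference, a minimal sketch of how a boolean flag such as `ENABLE_SCHEMA` is typically read. The actual `get_boolean_env_var` implementation lives earlier in `config.py` and is not shown in this diff, so the parsing rules below are an assumption.

```python
import os

def get_boolean_env_var(var_name, default_value=False):
    # Assumed behavior: common truthy strings count as True, everything else as False.
    value = os.getenv(var_name, str(default_value))
    return value.strip().lower() in ("true", "1", "t", "yes", "y")

os.environ["ENABLE_SCHEMA"] = "true"  # e.g. set in the shell or a compose file
ENABLE_SCHEMA = get_boolean_env_var("ENABLE_SCHEMA", False)
print(ENABLE_SCHEMA)  # True
```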
7 changes: 7 additions & 0 deletions comps/retrievers/src/integrations/redis.py
@@ -26,6 +26,7 @@
from .config import (
BRIDGE_TOWER_EMBEDDING,
EMBED_MODEL,
ENABLE_SCHEMA,
HF_TOKEN,
INDEX_NAME,
INDEX_SCHEMA,
@@ -100,7 +101,13 @@ async def _initialize_client(self, index_name=INDEX_NAME) -> Redis:
client = Redis(
embedding=self.embeddings, index_name=index_name, index_schema=INDEX_SCHEMA, redis_url=REDIS_URL
)
elif ENABLE_SCHEMA:
logger.info(f"generate redis instance with index_schema:{INDEX_SCHEMA}")
client = Redis(
embedding=self.embeddings, index_name=index_name, index_schema=INDEX_SCHEMA, redis_url=REDIS_URL
)
else:
logger.info(f"generate redis instance with index_name:{INDEX_NAME}")
client = Redis(embedding=self.embeddings, index_name=index_name, redis_url=REDIS_URL)
return client
except Exception as e:
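Putting the pieces together, a hedged usage sketch of the new retriever code path: with `ENABLE_SCHEMA=true` and `INDEX_SCHEMA` pointing at the new YAML file, `_initialize_client` builds the Redis vector store with the custom HNSW schema. This assumes `Redis` here is LangChain's Redis vector store, as the surrounding call suggests; the model name, index name, schema path, and URL below are examples, not project defaults.

```python
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Redis

# Example embedding model; any LangChain embeddings instance would do.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Mirrors the ENABLE_SCHEMA branch: pass the schema file via index_schema.
client = Redis(
    embedding=embeddings,
    index_name="rag-redis",  # example index name
    index_schema="comps/retrievers/src/integrations/redis_schema_hnsw.yml",
    redis_url="redis://localhost:6379",
)
```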
9 changes: 9 additions & 0 deletions comps/retrievers/src/integrations/redis_schema_hnsw.yml
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vector:
- name: content_vector
algorithm: HNSW
datatype: FLOAT32
dims: 768
distance_metric: COSINE
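For comparison, the same schema expressed as a Python dict, shown only to make the structure explicit (a sketch; the PR itself adds just the YAML file, which LangChain's Redis vector store can load by path).

```python
# Dict equivalent of redis_schema_hnsw.yml.
index_schema = {
    "vector": [
        {
            "name": "content_vector",
            "algorithm": "HNSW",
            "datatype": "FLOAT32",
            "dims": 768,
            "distance_metric": "COSINE",
        }
    ]
}
```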