Bump to redisvl 0.1.0 (#26)

* redo embeddings and cleanup * update redisvl to 0.1.0 * update readmes
redis-developer · Feb 9, 2024 · cbff994 · cbff994
1 parent 1e585f2
commit cbff994
Show file tree

Hide file tree

Showing 22 changed files with 380 additions and 2,722 deletions.
diff --git a/.env.template b/.env.template
@@ -2,4 +2,5 @@ DEPLOYMENT=dev
 REDIS_HOST=redis
 REDIS_PORT=6379
 OPENAI_API_KEY=YOUR-OPENAI-API-KEY
-COHERE_API_KEY=YOUR-COHERE-API-KEY
+COHERE_API_KEY=YOUR-COHERE-API-KEY
+TOKENIZERS_PARALLELISM=False
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,9 @@
 arxiv-metadata-oai-snapshot.json
+arxiv-papers-1000.json
 *.DS_STORE
 *.log
 .env
 .ipynb_checkpoints
 *.pkl
 .venv
+venv
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
     <br />
     <br />
 <div display="inline-block">
-    <a href="https://docsearch.redisventures.com"><b>Hosted Demo</b></a>&nbsp;&nbsp;&nbsp;
+    <a href="https://docsearch.redisvl.com"><b>Hosted Demo</b></a>&nbsp;&nbsp;&nbsp;
     <a href="https://github.com/RedisVentures/redis-arXiv-search"><b>Code</b></a>&nbsp;&nbsp;&nbsp;
     <a href="https://datasciencedojo.com/blog/ai-powered-document-search/"><b>Blog Post</b></a>&nbsp;&nbsp;&nbsp;
     <a href="https://redis.io/docs/interact/search-and-query/advanced-concepts/vectors/"><b>Redis Vector Search Documentation</b></a>&nbsp;&nbsp;&nbsp;
@@ -14,7 +14,7 @@
 </div>
 
 # 🔎 Redis arXiv Search
-*This repository is the official codebase for the arxiv paper search app hosted at: **https://docsearch.redisventures.com***
+*This repository is the official codebase for the arxiv paper search app hosted at: **https://docsearch.redisvl.com***
 
 [Redis](https://redis.com) is a highly performant, production-ready vector database, which can be used for many types of applications. Here we showcase Redis vector search applied to a document retrieval use case. Read more about AI-powered search in [the technical blog post](https://datasciencedojo.com/blog/ai-powered-document-search/) published by our partners, *[Data Science Dojo](https://datasciencedojo.com)*.
 
@@ -30,7 +30,7 @@ The arXiv papers dataset was sourced from the the following [Kaggle link](https:
 This app was built as a Single Page Application (SPA) with the following components:
 
 - **[Redis Stack](https://redis.io/docs/stack/)** for vector database
-- **[RedisVL](https://redisvl.com)** for vector db client
+- **[RedisVL](https://redisvl.com)** for Python vector db client
 - **[FastAPI](https://fastapi.tiangolo.com/)** for Python API
 - **[Pydantic](https://pydantic-docs.helpmanual.io/)** for schema and validation
 - **[React](https://reactjs.org/)** (with Typescript)
@@ -49,7 +49,7 @@ Embeddings represent the semantic properies of the raw text and enable vector si
 | ------------- |-------------| ----- |
 | HuggingFace      | `sentence-transformers/all-mpnet-base-v2` | Yes |
 | OpenAI      | `text-embedding-ada-002`      |   Yes |
-| Cohere | `small`      |    Yes |
+| Cohere | `embed-multilingual-v3.0`      |    Yes |
 
 **Interested in a different embedding provider?** Feel free to open a PR and make a suggested addition.
 
@@ -77,7 +77,7 @@ Embeddings represent the semantic properies of the raw text and enable vector si
     - **[Redis Stack](#redis-stack-docker)** runs Redis as a local docker container.
     - **[Redis Cloud](#redis-cloud)** will manage a Redis database on your behalf in the cloud.
 
-### Redis Stack Docker
+### Redis Stack Docker (Local)
 Using Redis Stack locally doesn't require any additional steps. However, it will consume more resources on your machine and have performance limitations.
 
 Use the provided `docker-compose` file for running the application locally:
@@ -99,14 +99,12 @@ $ docker compose -f docker-local-redis.yml up
 
 
 ## Customizing (optional)
-- **Customize Data**: You can use the Jupyter Notebooks in the [`data/`](data/README.md) directory to create paper embeddings and metadata. The pickled dataframes will end up stored in the `data/` directory and used when creating your own container.
-
-- **Customize Code**: You can use the `./build.sh` script to build your own docker image based on the application source code changes.
-
-- **Kubernetes??**: If you want to use K8s instead of Docker Compose, we have some [resources to help you get started](k8s/README.md).
+- You can use the provided Jupyter Notebook in the [`data/`](data/README.md) directory to create paper embeddings and metadata. The output JSON files will end up stored in the `data/` directory and used when creating your own container.
+- Use the `./build.sh` script to build your own docker image based on the application source code and dataset changes.
+- If you want to use K8s instead of Docker Compose, we have some [resources to help you get started](k8s/README.md).
 
 ### React Dev Environment
-It's typically easier to write front end code in an interactive environment, testing changes in realtime.
+It's typically easier to build the front end in an interactive environment, testing changes in real time.
 
 1. Deploy the app using steps above.
 2. Install packages (you may need to use `npm` to install `yarn`)

diff --git a/backend/arxivsearch/api/routes.py b/backend/arxivsearch/api/routes.py
@@ -1,13 +1,14 @@
 import asyncio
+import os
 import numpy as np
+import logging
+from typing import List, Dict, Any
 
-from typing import List, Dict, Any, Union
 from fastapi import APIRouter
-
-from redis.commands.search.document import Document
-from redis.commands.search.result import Result
+from redis.asyncio import Redis
 
 from redisvl.index import AsyncSearchIndex
+from redisvl.schema import IndexSchema
 from redisvl.query import VectorQuery, FilterQuery, CountQuery
 from redisvl.query.filter import Tag, FilterExpression
 
@@ -19,13 +20,22 @@
 )
 
 
-paper_router = r = APIRouter()
-print("Loading embeddings providers", flush=True)
+logger = logging.getLogger(__name__)
+
+# Initialize the API router
+router = APIRouter()
+
+# Initialize embeddings providers
 embeddings = Embeddings()
-paper_vector_field_name = "vector"
 
+# Preload Redis connection details
+client = Redis.from_url(config.REDIS_URL)
 
-def process_paper(paper: Union[Document, Dict[str, Any]]) -> Dict[str, Any]:
+# Preload index schema
+schema = IndexSchema.from_yaml(os.path.join("./schema", "index.yaml"))
+
+
+def process_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
     """
     Process paper data and calculate similarity score.
 
@@ -35,8 +45,6 @@ def process_paper(paper: Union[Document, Dict[str, Any]]) -> Dict[str, Any]:
     Returns:
         dict: Processed paper data with similarity score.
     """
-    if isinstance(paper, Document):
-        paper = paper.__dict__
     if 'vector_distance' in paper:
         paper['similarity_score'] = 1 - float(paper['vector_distance'])
     return paper
@@ -63,7 +71,7 @@ def build_filter_expression(years: List[int], categories: List[str]) -> FilterEx
     return year_filter & category_filter
 
 
-def prepare_response(total: int, results: Union[List[Dict[str, Any]], Result]) -> Dict[str, Any]:
+def prepare_response(total: int, results: List[Dict[str, Any]]) -> Dict[str, Any]:
     """
     Extract and process papers from search results.
 
@@ -74,23 +82,21 @@ def prepare_response(total: int, results: Union[List[Dict[str, Any]], Result]) -
     Args:
         total (int): The hypothetical count of papers present in the db that
             match the filters.
-        results (Union[List[Dict[str, Any]], Result]): The iterable containing
+        results (List[Dict[str, Any]]): The iterable containing
             raw paper data.
 
     Returns:
         dict: A dictionary with 'total' count and a list of 'papers', where
             each paper is a processed dict.
     """
-    # extract papers from VSS results
-    if isinstance(results, Result):
-        results = results.docs
+    logger.info("Preparing paper response")
     return {
         'total': total,
         'papers': [process_paper(paper) for paper in results]
     }
 
 
-@r.get("/", response_model=Dict)
+@router.get("/", response_model=Dict)
 async def get_papers(
     limit: int = 20,
     skip: int = 0,
@@ -112,26 +118,30 @@ async def get_papers(
     Returns:
         dict: Dictionary containing total count and list of papers.
     """
-    # Connect to index
-    index = await AsyncSearchIndex.from_existing(
-        name=config.DEFAULT_PROVIDER,
-        redis_url=config.REDIS_URL
-    )
-    # Build query
+    # Attach to index
+    index = AsyncSearchIndex(schema, client)
+
+    # Build queries
     filter_expression = build_filter_expression(
         years.split(","),
         categories.split(",")
     )
-    filter_query = FilterQuery(return_fields=[], filter_expression=filter_expression)
-    # Execute search
-    result_papers = await index.search(
-        # TODO - expose rvl pagination support to query API
-        filter_query.query.paging(skip, limit)
+    filter_query = FilterQuery(
+        return_fields=[],
+        filter_expression=filter_expression
     )
-    return prepare_response(result_papers.total, result_papers)
+    filter_query.set_paging(skip, limit)
+    count_query = CountQuery(filter_expression)
+    # Execute searches
+    total_count, result_papers = await asyncio.gather(
+        index.query(count_query),
+        index.query(filter_query)
+    )
+    result_papers = await index.query(filter_query)
+    return prepare_response(total_count, result_papers)
 
 
-@r.post("/vectorsearch/paper", response_model=Dict)
+@router.post("/vectorsearch/paper", response_model=Dict)
 async def find_papers_by_paper(similarity_request: PaperSimilarityRequest):
     """
     Find and return papers similar to a given paper based on vector
@@ -145,16 +155,13 @@ async def find_papers_by_paper(similarity_request: PaperSimilarityRequest):
     Returns:
         dict: Dictionary containing total count and list of similar papers.
     """
-    # Connect to index
-    index = await AsyncSearchIndex.from_existing(
-        name=similarity_request.provider,
-        redis_url=config.REDIS_URL
-    )
-    # Fetch paper key and the vector from the HASH, cast to numpy array
-    paper_key = index.key(similarity_request.paper_id)
+    # Attach to index
+    index = AsyncSearchIndex(schema, client)
+
+    # Fetch paper vector from the HASH, cast to numpy array
+    paper = await index.fetch(similarity_request.paper_id)
     paper_vector = np.frombuffer(
-        await index.client.hget(paper_key, paper_vector_field_name),
-        dtype=np.float32
+        paper[similarity_request.provider], dtype=np.float32
     )
     # Build filter expression
     filter_expression = build_filter_expression(
@@ -164,13 +171,13 @@ async def find_papers_by_paper(similarity_request: PaperSimilarityRequest):
     # Create queries
     paper_similarity_query = VectorQuery(
         vector=paper_vector,
-        vector_field_name=paper_vector_field_name,
+        vector_field_name=similarity_request.provider,
         num_results=similarity_request.number_of_results,
         return_fields=config.RETURN_FIELDS,
         filter_expression=filter_expression
     )
     count_query = CountQuery(filter_expression)
-    # Execute search
+    # Execute searches
     total_count, result_papers = await asyncio.gather(
         index.query(count_query),
         index.query(paper_similarity_query)
@@ -179,7 +186,7 @@ async def find_papers_by_paper(similarity_request: PaperSimilarityRequest):
     return prepare_response(total_count, result_papers)
 
 
-@r.post("/vectorsearch/text", response_model=Dict)
+@router.post("/vectorsearch/text", response_model=Dict)
 async def find_papers_by_text(similarity_request: UserTextSimilarityRequest):
     """
     Find and return papers similar to user-provided text based on
@@ -194,33 +201,30 @@ async def find_papers_by_text(similarity_request: UserTextSimilarityRequest):
         dict: Dictionary containing total count and list of similar papers.
     """
     # Attach to index
-    index = await AsyncSearchIndex.from_existing(
-        name=similarity_request.provider,
-        redis_url=config.REDIS_URL
-    )
+    index = AsyncSearchIndex(schema, client)
+
     # Build filter expression
     filter_expression = build_filter_expression(
         similarity_request.years,
         similarity_request.categories
     )
     # Check available paper count and create vector from user text
     count_query = CountQuery(filter_expression)
-    query_vector, total_count = await asyncio.gather(
-        embeddings.get(
-            provider=similarity_request.provider,
-            text=similarity_request.user_text
-        ),
-        index.query(count_query)
+    query_vector = await embeddings.get(
+        provider=similarity_request.provider,
+        text=similarity_request.user_text
     )
     # Assemble vector query
     paper_similarity_query = VectorQuery(
         vector=query_vector,
-        vector_field_name=paper_vector_field_name,
+        vector_field_name=similarity_request.provider,
         num_results=similarity_request.number_of_results,
         return_fields=config.RETURN_FIELDS,
         filter_expression=filter_expression
     )
-    # Perform Vector Search
-    result_papers = await index.query(paper_similarity_query)
-    # Get Paper records of those results
+    # Execute searches
+    total_count, result_papers = await asyncio.gather(
+        index.query(count_query),
+        index.query(paper_similarity_query)
+    )    # Get Paper records of those results
     return prepare_response(total_count, result_papers)
diff --git a/backend/arxivsearch/config.py b/backend/arxivsearch/config.py
@@ -7,11 +7,10 @@
 API_V1_STR = "/api/v1"
 
 # Configuration
+DEFAULT_DATASET = os.environ.get("DEFAULT_DATASET", "arxiv-papers-1000.json")
 DATA_LOCATION = os.environ.get("DATA_LOCATION", "../../data")
 DEPLOYMENT_ENV = os.environ.get("DEPLOYMENT", "dev")
-DISTANCE_METRIC = os.environ.get("DISTANCE_METRIC", "COSINE")
 WRITE_CONCURRENCY = os.environ.get("WRITE_CONCURRENCY", 150)
-INDEX_TYPE = os.environ.get("VECSIM_INDEX_TYPE", "HNSW")
 RETURN_FIELDS = [
     "paper_id",
     "authors",
@@ -33,7 +32,5 @@
 # Model Providers
 DEFAULT_PROVIDER = "huggingface"
 SENTENCE_TRANSFORMER_MODEL = os.environ.get("SENTENCE_TRANSFORMER_MODEL", "sentence-transformers/all-mpnet-base-v2")
-OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 OPENAI_EMBEDDING_MODEL = os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002")
-COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
-COHERE_EMBEDDING_MODEL = os.environ.get("COHERE_EMBEDDING_MODEL", "small")
+COHERE_EMBEDDING_MODEL = os.environ.get("COHERE_EMBEDDING_MODEL", "embed-multilingual-v3.0")
diff --git a/backend/arxivsearch/embeddings/__init__.py → backend/arxivsearch/embeddings.py b/backend/arxivsearch/embeddings/__init__.py → backend/arxivsearch/embeddings.py
@@ -1,10 +1,13 @@
 import re
 import string
 
-from redisvl.vectorize.text import OpenAITextVectorizer, HFTextVectorizer
+from redisvl.utils.vectorize import (
+    CohereTextVectorizer,
+    OpenAITextVectorizer,
+    HFTextVectorizer
+)
 
 from arxivsearch import config
-from arxivsearch.embeddings.providers import CohereProvider
 from arxivsearch.schema import Provider
 
 
@@ -38,15 +41,15 @@ def preprocess_text(text: str) -> str:
 class Embeddings:
 
     def __init__(self):
-        # Initialize embedding providers if relevant
         self.hf_vectorizer = HFTextVectorizer(
             model=config.SENTENCE_TRANSFORMER_MODEL
         )
         self.oai_vectorizer = OpenAITextVectorizer(
-            model=config.OPENAI_EMBEDDING_MODEL,
-            api_config={"api_key": config.OPENAI_API_KEY}
+            model=config.OPENAI_EMBEDDING_MODEL
+        )
+        self.co_vectorizer = CohereTextVectorizer(
+            model=config.COHERE_EMBEDDING_MODEL
         )
-        self.co_vectorizer = CohereProvider()
 
     async def get(self, provider: str, text: str):
         """
@@ -56,7 +59,6 @@ async def get(self, provider: str, text: str):
             provider (str): Specified provider to use
             text (str): Text to embed.
         """
-
         if provider == Provider.huggingface.value:
             # Use HuggingFace Sentence Transformer
             return self.hf_vectorizer.embed(
@@ -70,9 +72,8 @@ async def get(self, provider: str, text: str):
                 preprocess=preprocess_text
             )
         elif provider == Provider.cohere.value:
-            return await self.co_vectorizer.embed_query(
+            return self.co_vectorizer.embed(
                 text,
+                input_type="search_query",
                 preprocess=preprocess_text
             )
-
-
diff --git a/backend/arxivsearch/embeddings/providers/__init__.py b/backend/arxivsearch/embeddings/providers/__init__.py