From 8c0ba48000dda79be7d8c48da18ff997752902d1 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:41:41 +0300 Subject: [PATCH 01/27] develop the process of storing the function_schemas field for the route_layer add function on Pinecone --- semantic_router/index/base.py | 1 + semantic_router/index/pinecone.py | 19 ++++++++++++++++--- semantic_router/layer.py | 7 ++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index d0f12ac6..d25d41dc 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -26,6 +26,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], + function_schemas: List[Dict[str, Any]], ): """ Add embeddings to the index. diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index b4f6033f..0b84ad4c 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -22,6 +22,7 @@ class PineconeRecord(BaseModel): values: List[float] route: str utterance: str + function_schema: str def __init__(self, **data): super().__init__(**data) @@ -34,7 +35,11 @@ def to_dict(self): return { "id": self.id, "values": self.values, - "metadata": {"sr_route": self.route, "sr_utterance": self.utterance}, + "metadata": { + "sr_route": self.route, + "sr_utterance": self.utterance, + "sr_function_schemas": self.function_schema, + }, } @@ -305,6 +310,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], + function_schemas: List[Dict[str, Any]] = "", batch_size: int = 100, ): """Add vectors to Pinecone in batches.""" @@ -313,8 +319,15 @@ def add( self.index = self._init_index(force_create=True) vectors_to_upsert = [ - PineconeRecord(values=vector, route=route, utterance=utterance).to_dict() - for vector, route, utterance in zip(embeddings, routes, utterances) + PineconeRecord( + values=vector, + route=route, + utterance=utterance, + function_schema=str(function_schema), + ).to_dict() + for vector, route, utterance, function_schema in zip( + embeddings, routes, utterances, function_schemas + ) ] for i in range(0, len(vectors_to_upsert), batch_size): diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 6b548fc0..2285a7d9 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -430,12 +430,17 @@ def add(self, route: Route): if route.score_threshold is None: route.score_threshold = self.score_threshold - # add routes to the index self.index.add( embeddings=embeds, routes=[route.name] * len(route.utterances), utterances=route.utterances, + function_schemas=( + route.function_schemas * len(route.utterances) + if route.function_schemas + else [""] * len(route.utterances) + ), ) + self.routes.append(route) def list_route_names(self) -> List[str]: From 51d8b5ae503187fe70307dee44dafa42dc8743dd Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:09:42 +0300 Subject: [PATCH 02/27] update the indexes to add function_schemas --- semantic_router/index/base.py | 2 +- semantic_router/index/local.py | 2 ++ semantic_router/index/pinecone.py | 2 +- semantic_router/index/postgres.py | 6 +++++- semantic_router/index/qdrant.py | 1 + semantic_router/layer.py | 20 ++++++++++++++++---- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index d25d41dc..3d391083 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -26,7 +26,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], - function_schemas: List[Dict[str, Any]], + function_schemas: List[Dict[str, Any]] = None, # type: ignore ): """ Add embeddings to the index. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index 7150b267..802455cb 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -5,6 +5,7 @@ from semantic_router.index.base import BaseIndex from semantic_router.linear import similarity_matrix, top_scores from semantic_router.utils.logger import logger +from typing import Any class LocalIndex(BaseIndex): @@ -26,6 +27,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], + function_schemas: List[Dict[str, Any]] = None, # type: ignore ): embeds = np.array(embeddings) # type: ignore routes_arr = np.array(routes) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 0b84ad4c..da11ec6e 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -310,7 +310,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] = "", + function_schemas: List[Dict[str, Any]] = None, # type: ignore batch_size: int = 100, ): """Add vectors to Pinecone in batches.""" diff --git a/semantic_router/index/postgres.py b/semantic_router/index/postgres.py index 4c971d4d..9fbac62f 100644 --- a/semantic_router/index/postgres.py +++ b/semantic_router/index/postgres.py @@ -254,7 +254,11 @@ def _check_embeddings_dimensions(self) -> bool: raise ValueError("No comment found for the 'vector' column.") def add( - self, embeddings: List[List[float]], routes: List[str], utterances: List[Any] + self, + embeddings: List[List[float]], + routes: List[str], + utterances: List[Any], + function_schemas: List[Dict[str, Any]] = None, # type: ignore ) -> None: """ Adds vectors to the index. diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index c1a5e28b..0fc6aa52 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -175,6 +175,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], + function_schemas: List[Dict[str, Any]] = None, # type: ignore batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, ): self.dimensions = self.dimensions or len(embeddings[0]) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 2285a7d9..bae67544 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -437,7 +437,7 @@ def add(self, route: Route): function_schemas=( route.function_schemas * len(route.utterances) if route.function_schemas - else [""] * len(route.utterances) + else [""] * len(route.utterances) # type: ignore ), ) @@ -482,7 +482,9 @@ def _refresh_routes(self): def _add_routes(self, routes: List[Route]): # create embeddings for all routes - route_names, all_utterances = self._extract_routes_details(routes) + route_names, all_utterances, function_schemas = self._extract_routes_details( + routes + ) embedded_utterances = self.encoder(all_utterances) # create route array # add everything to the index @@ -490,11 +492,14 @@ def _add_routes(self, routes: List[Route]): embeddings=embedded_utterances, routes=route_names, utterances=all_utterances, + function_schemas=function_schemas, ) def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting - local_route_names, local_utterances = self._extract_routes_details(routes) + local_route_names, local_utterances, local_function_schemas = ( + self._extract_routes_details(routes) + ) routes_to_add, routes_to_delete, layer_routes_dict = self.index._sync_index( local_route_names=local_route_names, local_utterances=local_utterances, @@ -522,6 +527,7 @@ def _add_and_sync_routes(self, routes: List[Route]): embeddings=embedded_utterances_to_add, routes=route_names_to_add, utterances=all_utterances_to_add, + function_schemas=local_function_schemas, ) self._set_layer_routes(layer_routes) @@ -529,7 +535,13 @@ def _add_and_sync_routes(self, routes: List[Route]): def _extract_routes_details(self, routes: List[Route]) -> Tuple: route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] - return route_names, utterances + function_schemas = [ + function_schema if function_schema is not None else "" + for route in routes + if route.function_schemas is not None + for function_schema in route.function_schemas + ] + return route_names, utterances, function_schemas def _encode(self, text: str) -> Any: """Given some text, encode it.""" From 88d32d42426681c99859ce8625c492bdfe25f88d Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:30:22 +0300 Subject: [PATCH 03/27] develop the process to store function_schemas in the vectorDB during the initialization of route_layer --- semantic_router/index/base.py | 6 ++- semantic_router/index/local.py | 6 ++- semantic_router/index/pinecone.py | 6 ++- semantic_router/index/qdrant.py | 6 ++- semantic_router/layer.py | 63 ++++++++++++++++++++----------- 5 files changed, 60 insertions(+), 27 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 3d391083..d07045e7 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -99,7 +99,11 @@ def delete_index(self): raise NotImplementedError("This method should be implemented by subclasses.") def _sync_index( - self, local_route_names: List[str], local_utterances: List[str], dimensions: int + self, + local_route_names: List[str], + local_utterances: List[str], + dimensions: int, + local_function_schemas: List[str] = None, # type: ignore ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index 802455cb..65446f46 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -49,7 +49,11 @@ def _remove_and_sync(self, routes_to_delete: dict): logger.warning("Sync remove is not implemented for LocalIndex.") def _sync_index( - self, local_route_names: List[str], local_utterances: List[str], dimensions: int + self, + local_route_names: List[str], + local_utterances: List[str], + dimensions: int, + local_function_schemas: List[str] = None, # type: ignore ): if self.sync is not None: logger.error("Sync remove is not implemented for LocalIndex.") diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index da11ec6e..7686993c 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -207,7 +207,11 @@ async def _init_async_index(self, force_create: bool = False): self.host = index_stats["host"] if index_stats else None def _sync_index( - self, local_route_names: List[str], local_utterances: List[str], dimensions: int + self, + local_route_names: List[str], + local_utterances: List[str], + dimensions: int, + local_function_schemas: List[str] = None, # type: ignore ): if self.index is None: self.dimensions = self.dimensions or dimensions diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index 0fc6aa52..5801c6de 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -165,7 +165,11 @@ def _remove_and_sync(self, routes_to_delete: dict): logger.error("Sync remove is not implemented for QdrantIndex.") def _sync_index( - self, local_route_names: List[str], local_utterances: List[str], dimensions: int + self, + local_route_names: List[str], + local_utterances: List[str], + dimensions: int, + local_function_schemas: List[str] = None, # type: ignore ): if self.sync is not None: logger.error("Sync remove is not implemented for QdrantIndex.") diff --git a/semantic_router/layer.py b/semantic_router/layer.py index bae67544..c16700bf 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -180,7 +180,7 @@ def __init__( self, encoder: Optional[BaseEncoder] = None, llm: Optional[BaseLLM] = None, - routes: Optional[List[Route]] = None, + routes: List[Route] = [], index: Optional[BaseIndex] = None, # type: ignore top_k: int = 5, aggregation: str = "sum", @@ -195,7 +195,7 @@ def __init__( else: self.encoder = encoder self.llm = llm - self.routes: List[Route] = routes if routes is not None else [] + self.routes = routes if self.encoder.score_threshold is None: raise ValueError( "No score threshold provided for encoder. Please set the score threshold " @@ -216,15 +216,13 @@ def __init__( for route in self.routes: if route.score_threshold is None: route.score_threshold = self.score_threshold + + if self.routes: + self._add_routes(routes=self.routes) + # if routes list has been passed, we initialize index now if self.index.sync: - # initialize index now - if len(self.routes) > 0: - self._add_and_sync_routes(routes=self.routes) - else: - self._add_and_sync_routes(routes=[]) - elif len(self.routes) > 0: - self._add_routes(routes=self.routes) + self._add_and_sync_routes(routes=self.routes) def check_for_matching_routes(self, top_class: str) -> Optional[Route]: matching_routes = [route for route in self.routes if route.name == top_class] @@ -482,32 +480,52 @@ def _refresh_routes(self): def _add_routes(self, routes: List[Route]): # create embeddings for all routes - route_names, all_utterances, function_schemas = self._extract_routes_details( - routes - ) - embedded_utterances = self.encoder(all_utterances) + # route_names, all_utterances, function_schemas = self._extract_routes_details( + # routes + # ) + # embedded_utterances = self.encoder(all_utterances) # create route array # add everything to the index - self.index.add( - embeddings=embedded_utterances, - routes=route_names, - utterances=all_utterances, - function_schemas=function_schemas, - ) + if routes: + for route in routes: + logger.info(f"Adding `{route.name}` route") + embeddings = self.encoder(route.utterances) + if route.score_threshold is None: + route.score_threshold = self.score_threshold + + try: + self.index.add( + embeddings=embeddings, + routes=[route.name] * len(route.utterances), + utterances=route.utterances, + function_schemas=( + route.function_schemas * len(route.utterances) + if route.function_schemas + else [""] * len(route.utterances) # type: ignore + ), + ) + except Exception as e: + logger.error(f"index error: {e}") + raise Exception(f"index error: {e}") from e def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting local_route_names, local_utterances, local_function_schemas = ( self._extract_routes_details(routes) ) + routes_to_add, routes_to_delete, layer_routes_dict = self.index._sync_index( local_route_names=local_route_names, local_utterances=local_utterances, + local_function_schemas=local_function_schemas, dimensions=len(self.encoder(["dummy"])[0]), ) layer_routes = [ - Route(name=route, utterances=layer_routes_dict[route]) + Route( + name=route, + utterances=layer_routes_dict[route], + ) for route in layer_routes_dict.keys() ] @@ -536,10 +554,9 @@ def _extract_routes_details(self, routes: List[Route]) -> Tuple: route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] function_schemas = [ - function_schema if function_schema is not None else "" + route.function_schemas if route.function_schemas is not None else "" for route in routes - if route.function_schemas is not None - for function_schema in route.function_schemas + for _ in route.utterances ] return route_names, utterances, function_schemas From 30f9bf8c992bed3f7e6264bcd6737d2053971ea0 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:47:03 +0300 Subject: [PATCH 04/27] update the exception --- semantic_router/layer.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index c16700bf..6e34af8e 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -479,20 +479,12 @@ def _refresh_routes(self): self.routes.append(route) def _add_routes(self, routes: List[Route]): - # create embeddings for all routes - # route_names, all_utterances, function_schemas = self._extract_routes_details( - # routes - # ) - # embedded_utterances = self.encoder(all_utterances) - # create route array - # add everything to the index if routes: for route in routes: logger.info(f"Adding `{route.name}` route") embeddings = self.encoder(route.utterances) if route.score_threshold is None: route.score_threshold = self.score_threshold - try: self.index.add( embeddings=embeddings, @@ -505,8 +497,10 @@ def _add_routes(self, routes: List[Route]): ), ) except Exception as e: - logger.error(f"index error: {e}") - raise Exception(f"index error: {e}") from e + logger.error( + f"Failed to add route `{route.name}` to the index: {e}" + ) + raise Exception(f"Indexing error for route `{route.name}`") from e def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting From 1fefc5a0f8e0d58ce794f9b1861cc37fca6f221d Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Mon, 26 Aug 2024 14:36:23 +0300 Subject: [PATCH 05/27] update the object definitions --- semantic_router/index/base.py | 4 ++-- semantic_router/index/local.py | 4 ++-- semantic_router/index/pinecone.py | 10 ++++++---- semantic_router/index/postgres.py | 2 +- semantic_router/index/qdrant.py | 4 ++-- semantic_router/layer.py | 3 ++- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 26335ce5..8ef48967 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -26,7 +26,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], - function_schemas: List[Dict[str, Any]] = None, # type: ignore + function_schemas: List[Dict[str, Any]] | None = None, ): """ Add embeddings to the index. @@ -114,7 +114,7 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] = None, # type: ignore + local_function_schemas: List[str] | None = None, ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index d57f563a..7bc12bba 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -27,7 +27,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] = None, # type: ignore + function_schemas: List[Dict[str, Any]] | None = None, ): embeds = np.array(embeddings) # type: ignore routes_arr = np.array(routes) @@ -53,7 +53,7 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] = None, # type: ignore + local_function_schemas: List[str] | None = None, ): if self.sync is not None: logger.error("Sync remove is not implemented for LocalIndex.") diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index a1008833..5b88ba57 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -3,6 +3,8 @@ import hashlib import os import time +import json + from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np @@ -211,7 +213,7 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] = None, # type: ignore + local_function_schemas: List[str] | None = None, ): if self.index is None: self.dimensions = self.dimensions or dimensions @@ -314,7 +316,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] = None, # type: ignore + function_schemas: List[Dict[str, Any]] | None = None, batch_size: int = 100, ): """Add vectors to Pinecone in batches.""" @@ -327,10 +329,10 @@ def add( values=vector, route=route, utterance=utterance, - function_schema=str(function_schema), + function_schema=json.dumps(function_schema), ).to_dict() for vector, route, utterance, function_schema in zip( - embeddings, routes, utterances, function_schemas + embeddings, routes, utterances, function_schemas # type: ignore ) ] diff --git a/semantic_router/index/postgres.py b/semantic_router/index/postgres.py index 963f7513..b4e133fe 100644 --- a/semantic_router/index/postgres.py +++ b/semantic_router/index/postgres.py @@ -259,7 +259,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], - function_schemas: List[Dict[str, Any]] = None, # type: ignore + function_schemas: List[Dict[str, Any]] | None = None, ) -> None: """ Adds vectors to the index. diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index 425518ff..0b414cff 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -169,7 +169,7 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] = None, # type: ignore + local_function_schemas: List[str] | None = None, ): if self.sync is not None: logger.error("Sync remove is not implemented for QdrantIndex.") @@ -179,7 +179,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] = None, # type: ignore + function_schemas: List[Dict[str, Any]] | None = None, batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, ): self.dimensions = self.dimensions or len(embeddings[0]) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 6e34af8e..a77c5844 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -428,6 +428,7 @@ def add(self, route: Route): if route.score_threshold is None: route.score_threshold = self.score_threshold + # add routes to the index self.index.add( embeddings=embeds, routes=[route.name] * len(route.utterances), @@ -435,7 +436,7 @@ def add(self, route: Route): function_schemas=( route.function_schemas * len(route.utterances) if route.function_schemas - else [""] * len(route.utterances) # type: ignore + else [{}] * len(route.utterances) ), ) From 77d323bad7ba520abe2b4030308f4dbf56ae3a8f Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Mon, 26 Aug 2024 23:52:39 +0300 Subject: [PATCH 06/27] Develop the process to create and add the function_schema field for routes with sync="local" --- semantic_router/index/base.py | 2 +- semantic_router/index/pinecone.py | 131 ++++++++++++++++++++---------- semantic_router/layer.py | 31 ++++--- 3 files changed, 109 insertions(+), 55 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 8ef48967..73467887 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -114,7 +114,7 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] | None = None, + local_function_schemas: List[Dict[str, Any]], ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 5b88ba57..70833c4a 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -5,7 +5,7 @@ import time import json -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union, Tuple import numpy as np import requests @@ -213,89 +213,127 @@ def _sync_index( local_route_names: List[str], local_utterances: List[str], dimensions: int, - local_function_schemas: List[str] | None = None, - ): + local_function_schemas: List[Dict[str, Any]], + ) -> Tuple: + if self.index is None: self.dimensions = self.dimensions or dimensions self.index = self._init_index(force_create=True) remote_routes = self.get_routes() - remote_dict: dict = {route: set() for route, _ in remote_routes} - for route, utterance in remote_routes: - remote_dict[route].add(utterance) + remote_dict = { + route: {"utterances": set(), "function_schemas": set()} + for route, _, _ in remote_routes + } + + for route, utterance, function_schema in remote_routes: + remote_dict[route]["utterances"].add(utterance) + remote_dict[route]["function_schemas"].add(function_schema) - local_dict: dict = {route: set() for route in local_route_names} - for route, utterance in zip(local_route_names, local_utterances): - local_dict[route].add(utterance) + local_dict = { + route: {"utterances": set(), "function_schemas": set()} + for route in local_route_names + } - all_routes = set(remote_dict.keys()).union(local_dict.keys()) + for route, utterance, function_schema in zip( + local_route_names, local_utterances, local_function_schemas + ): + local_dict[route]["utterances"].add(utterance) + local_dict[route]["function_schemas"].add(json.dumps(function_schema)) + all_routes = set(remote_dict.keys()).union(local_dict.keys()) routes_to_add = [] routes_to_delete = [] layer_routes = {} for route in all_routes: - local_utterances = local_dict.get(route, set()) - remote_utterances = remote_dict.get(route, set()) + local_utterances_set = local_dict.get(route, {"utterances": set()})[ + "utterances" + ] + remote_utterances_set = remote_dict.get(route, {"utterances": set()})[ + "utterances" + ] + local_function_schemas_set = local_dict.get( + route, {"function_schemas": set()} + )["function_schemas"] - if not local_utterances and not remote_utterances: + remote_function_schemas_set = remote_dict.get( + route, {"function_schemas": set()} + )["function_schemas"] + + if not local_utterances_set and not remote_utterances_set: continue + utterances_to_include: set = set() + if self.sync == "error": - if local_utterances != remote_utterances: + if local_utterances_set != remote_utterances_set: raise ValueError( f"Synchronization error: Differences found in route '{route}'" ) - utterances_to_include: set = set() - if local_utterances: - layer_routes[route] = list(local_utterances) + if local_utterances_set: + layer_routes[route] = {"utterances": list(local_utterances_set)} + elif self.sync == "remote": - utterances_to_include = set() - if remote_utterances: - layer_routes[route] = list(remote_utterances) + if remote_utterances_set: + layer_routes[route] = {"utterances": list(remote_utterances_set)} + elif self.sync == "local": - utterances_to_include = local_utterances - remote_utterances + utterances_to_include = local_utterances_set - remote_utterances_set routes_to_delete.extend( [ (route, utterance) - for utterance in remote_utterances - if utterance not in local_utterances + for utterance in remote_utterances_set + if utterance not in local_utterances_set ] ) - if local_utterances: - layer_routes[route] = list(local_utterances) + layer_routes[route] = {} + if local_utterances_set: + layer_routes[route]["utterances"] = list(local_utterances_set) + if local_function_schemas_set: + layer_routes[route]["function_schemas"] = list( + local_function_schemas_set + ) + elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: utterances_to_include = set(local_utterances) if local_utterances: - layer_routes[route] = list(local_utterances) + layer_routes[route] = {"utterances": list(local_utterances)} else: - utterances_to_include = set() - if remote_utterances: - layer_routes[route] = list(remote_utterances) + if remote_utterances_set: + layer_routes[route] = { + "utterances": list(remote_utterances_set) + } + elif self.sync == "merge-force-local": if route in local_dict: - utterances_to_include = local_utterances - remote_utterances + utterances_to_include = local_utterances_set - remote_utterances_set routes_to_delete.extend( [ (route, utterance) - for utterance in remote_utterances - if utterance not in local_utterances + for utterance in remote_utterances_set + if utterance not in local_utterances_set ] ) - if local_utterances: - layer_routes[route] = local_utterances + if local_utterances_set: + layer_routes[route] = {"utterances": list(local_utterances_set)} else: - utterances_to_include = set() - if remote_utterances: - layer_routes[route] = list(remote_utterances) + if remote_utterances_set: + layer_routes[route] = { + "utterances": list(remote_utterances_set) + } + elif self.sync == "merge": - utterances_to_include = local_utterances - remote_utterances - if local_utterances or remote_utterances: - layer_routes[route] = list( - remote_utterances.union(local_utterances) - ) + utterances_to_include = local_utterances_set - remote_utterances_set + if local_utterances_set or remote_utterances_set: + layer_routes[route] = { + "utterances": list( + remote_utterances_set.union(local_utterances_set) + ) + } + else: raise ValueError("Invalid sync mode specified") @@ -437,7 +475,14 @@ def get_routes(self) -> List[Tuple]: """ # Get all records _, metadata = self._get_all(include_metadata=True) - route_tuples = [(x["sr_route"], x["sr_utterance"]) for x in metadata] + route_tuples = [ + ( + route_objects["sr_route"], + route_objects["sr_utterance"], + route_objects["function_schemas"], + ) + for route_objects in metadata + ] return route_tuples def delete(self, route_name: str): diff --git a/semantic_router/layer.py b/semantic_router/layer.py index a77c5844..07a39ea8 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -217,13 +217,13 @@ def __init__( if route.score_threshold is None: route.score_threshold = self.score_threshold - if self.routes: - self._add_routes(routes=self.routes) - # if routes list has been passed, we initialize index now if self.index.sync: self._add_and_sync_routes(routes=self.routes) + if self.routes: + self._add_routes(routes=self.routes) + def check_for_matching_routes(self, top_class: str) -> Optional[Route]: matching_routes = [route for route in self.routes if route.name == top_class] if not matching_routes: @@ -516,13 +516,20 @@ def _add_and_sync_routes(self, routes: List[Route]): dimensions=len(self.encoder(["dummy"])[0]), ) - layer_routes = [ - Route( - name=route, - utterances=layer_routes_dict[route], + layer_routes = [] + for route in layer_routes_dict.keys(): + route_data = layer_routes_dict[route] + logger.info( + f"route_data[function_schemas][0]: {route_data["function_schemas"][0]}" ) - for route in layer_routes_dict.keys() - ] + if not route_data["function_schemas"][0]: + layer_routes.append( + Route( + name=route, + utterances=route_data["utterances"], + function_schemas=None, + ) + ) data_to_delete: dict = {} for route, utterance in routes_to_delete: @@ -545,11 +552,13 @@ def _add_and_sync_routes(self, routes: List[Route]): self._set_layer_routes(layer_routes) - def _extract_routes_details(self, routes: List[Route]) -> Tuple: + def _extract_routes_details( + self, routes: List[Route] + ) -> Tuple[list[str], list[str], List[Dict[str, Any]]]: route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] function_schemas = [ - route.function_schemas if route.function_schemas is not None else "" + route.function_schemas[0] if route.function_schemas is not None else [] for route in routes for _ in route.utterances ] From ab9dc774eceb50f2fd102ce7fe685427460abf39 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 02:39:09 +0300 Subject: [PATCH 07/27] fix the local sync --- semantic_router/index/pinecone.py | 27 ++++++++++++++------------- semantic_router/layer.py | 29 +++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 70833c4a..f4d003f6 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -223,24 +223,25 @@ def _sync_index( remote_routes = self.get_routes() remote_dict = { - route: {"utterances": set(), "function_schemas": set()} + route: {"utterances": set(), "function_schemas": {}} for route, _, _ in remote_routes } for route, utterance, function_schema in remote_routes: remote_dict[route]["utterances"].add(utterance) - remote_dict[route]["function_schemas"].add(function_schema) + remote_dict[route]["function_schemas"].update(function_schema) local_dict = { - route: {"utterances": set(), "function_schemas": set()} + route: {"utterances": set(), "function_schemas": {}} for route in local_route_names } for route, utterance, function_schema in zip( local_route_names, local_utterances, local_function_schemas ): + logger.info(f"function_schema: {function_schema}") local_dict[route]["utterances"].add(utterance) - local_dict[route]["function_schemas"].add(json.dumps(function_schema)) + local_dict[route]["function_schemas"].update(function_schema) all_routes = set(remote_dict.keys()).union(local_dict.keys()) routes_to_add = [] @@ -254,12 +255,12 @@ def _sync_index( remote_utterances_set = remote_dict.get(route, {"utterances": set()})[ "utterances" ] - local_function_schemas_set = local_dict.get( - route, {"function_schemas": set()} - )["function_schemas"] + local_function_schemas_dict = local_dict.get(route, {}).get( + "function_schemas", {} + ) - remote_function_schemas_set = remote_dict.get( - route, {"function_schemas": set()} + remote_function_schemas_dict = remote_dict.get( + route, {"function_schemas": {}} )["function_schemas"] if not local_utterances_set and not remote_utterances_set: @@ -291,10 +292,10 @@ def _sync_index( layer_routes[route] = {} if local_utterances_set: layer_routes[route]["utterances"] = list(local_utterances_set) - if local_function_schemas_set: - layer_routes[route]["function_schemas"] = list( - local_function_schemas_set - ) + if local_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = local_function_schemas_dict elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 07a39ea8..a7f6c1c0 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -226,6 +226,8 @@ def __init__( def check_for_matching_routes(self, top_class: str) -> Optional[Route]: matching_routes = [route for route in self.routes if route.name == top_class] + logger.info(f"matching_routes: {matching_routes}") + logger.info(f"self.routes: {self.routes}") if not matching_routes: logger.error( f"No route found with name {top_class}. Check to see if any Routes " @@ -516,20 +518,29 @@ def _add_and_sync_routes(self, routes: List[Route]): dimensions=len(self.encoder(["dummy"])[0]), ) - layer_routes = [] + layer_routes: List[Route] = [] + logger.info(f"layer_routes_dict: {layer_routes_dict}") for route in layer_routes_dict.keys(): - route_data = layer_routes_dict[route] - logger.info( - f"route_data[function_schemas][0]: {route_data["function_schemas"][0]}" - ) - if not route_data["function_schemas"][0]: + logger.info(f"route name: {route}") + + route_ = layer_routes_dict[route] + function_schemas = route_.get("function_schemas", None) + if not function_schemas: layer_routes.append( Route( name=route, - utterances=route_data["utterances"], + utterances=route_["utterances"], function_schemas=None, ) ) + else: + layer_routes.append( + Route( + name=route, + utterances=route_["utterances"], + function_schemas=[function_schemas], + ) + ) data_to_delete: dict = {} for route, utterance in routes_to_delete: @@ -550,6 +561,8 @@ def _add_and_sync_routes(self, routes: List[Route]): function_schemas=local_function_schemas, ) + logger.info(f"layer_routes: {layer_routes}") + self._set_layer_routes(layer_routes) def _extract_routes_details( @@ -558,7 +571,7 @@ def _extract_routes_details( route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] function_schemas = [ - route.function_schemas[0] if route.function_schemas is not None else [] + route.function_schemas[0] if route.function_schemas is not None else {} for route in routes for _ in route.utterances ] From 199ee02f26ef3b9d1b54fa3c946af7b1476599e8 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:47:40 +0300 Subject: [PATCH 08/27] develop the sync cases for "remote" and "merge-force-remote" --- semantic_router/index/pinecone.py | 28 ++++++++++++++++++++++++---- semantic_router/layer.py | 9 ++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index f4d003f6..e13d1131 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -227,9 +227,18 @@ def _sync_index( for route, _, _ in remote_routes } + logger.info(f"remote_routes: {remote_routes}") + for route, utterance, function_schema in remote_routes: + logger.info(f"function_schema remote: {function_schema}") remote_dict[route]["utterances"].add(utterance) - remote_dict[route]["function_schemas"].update(function_schema) + + if not function_schema: + logger.info(f"function_schema remote is empty for {route}") + remote_dict[route]["function_schemas"].update({}) + else: + logger.info(f"function_schema remote is not empty for {route}") + remote_dict[route]["function_schemas"].update(function_schema) local_dict = { route: {"utterances": set(), "function_schemas": {}} @@ -239,7 +248,7 @@ def _sync_index( for route, utterance, function_schema in zip( local_route_names, local_utterances, local_function_schemas ): - logger.info(f"function_schema: {function_schema}") + logger.info(f"function_schema local: {function_schema}") local_dict[route]["utterances"].add(utterance) local_dict[route]["function_schemas"].update(function_schema) @@ -279,6 +288,10 @@ def _sync_index( elif self.sync == "remote": if remote_utterances_set: layer_routes[route] = {"utterances": list(remote_utterances_set)} + if remote_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = remote_function_schemas_dict elif self.sync == "local": utterances_to_include = local_utterances_set - remote_utterances_set @@ -296,17 +309,24 @@ def _sync_index( layer_routes[route][ "function_schemas" ] = local_function_schemas_dict - elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: utterances_to_include = set(local_utterances) if local_utterances: layer_routes[route] = {"utterances": list(local_utterances)} + if local_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = local_function_schemas_dict else: if remote_utterances_set: layer_routes[route] = { "utterances": list(remote_utterances_set) } + if remote_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = remote_function_schemas_dict elif self.sync == "merge-force-local": if route in local_dict: @@ -480,7 +500,7 @@ def get_routes(self) -> List[Tuple]: ( route_objects["sr_route"], route_objects["sr_utterance"], - route_objects["function_schemas"], + json.loads(route_objects["sr_function_schemas"]) if route_objects["sr_function_schemas"] else {}, ) for route_objects in metadata ] diff --git a/semantic_router/layer.py b/semantic_router/layer.py index a7f6c1c0..fee3b665 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -522,14 +522,13 @@ def _add_and_sync_routes(self, routes: List[Route]): logger.info(f"layer_routes_dict: {layer_routes_dict}") for route in layer_routes_dict.keys(): logger.info(f"route name: {route}") - - route_ = layer_routes_dict[route] - function_schemas = route_.get("function_schemas", None) + route_dict = layer_routes_dict[route] + function_schemas = route_dict.get("function_schemas", None) if not function_schemas: layer_routes.append( Route( name=route, - utterances=route_["utterances"], + utterances=route_dict["utterances"], function_schemas=None, ) ) @@ -537,7 +536,7 @@ def _add_and_sync_routes(self, routes: List[Route]): layer_routes.append( Route( name=route, - utterances=route_["utterances"], + utterances=route_dict["utterances"], function_schemas=[function_schemas], ) ) From cc94806d59a3111c9c5ce59bfb13778462f73e70 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:48:43 +0300 Subject: [PATCH 09/27] lint --- semantic_router/index/pinecone.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index e13d1131..1016a7b5 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -500,7 +500,11 @@ def get_routes(self) -> List[Tuple]: ( route_objects["sr_route"], route_objects["sr_utterance"], - json.loads(route_objects["sr_function_schemas"]) if route_objects["sr_function_schemas"] else {}, + ( + json.loads(route_objects["sr_function_schemas"]) + if route_objects["sr_function_schemas"] + else {} + ), ) for route_objects in metadata ] From 2b7854ad9352ec962484cf4deae994178adda56c Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 09:56:13 +0300 Subject: [PATCH 10/27] develop the case "merge-force-local" --- .gitignore | 1 + semantic_router/index/base.py | 6 +++--- semantic_router/index/pinecone.py | 12 +++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index f571c929..094c61f4 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ node_modules package-lock.json package.json test.ipynb +test_sync.ipynb ``` # docs diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 73467887..9d824661 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -122,9 +122,9 @@ def _sync_index( - "error": Raise an error if local and remote are not synchronized. - "remote": Take remote as the source of truth and update local to align. - "local": Take local as the source of truth and update remote to align. - - "merge-force-remote": Merge both local and remote taking only remote routes utterances when a route with same route name is present both locally and remotely. - - "merge-force-local": Merge both local and remote taking only local routes utterances when a route with same route name is present both locally and remotely. - - "merge": Merge both local and remote, merging also local and remote utterances when a route with same route name is present both locally and remotely. + - "merge-force-remote": Merge both local and remote taking only remote routes features when a route with same route name is present both locally and remotely. + - "merge-force-local": Merge both local and remote taking only local routes features when a route with same route name is present both locally and remotely. + - "merge": Merge both local and remote, merging also local and remote features when a route with same route name is present both locally and remotely. This method should be implemented by subclasses. """ diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 1016a7b5..cb2f58ca 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -292,7 +292,6 @@ def _sync_index( layer_routes[route][ "function_schemas" ] = remote_function_schemas_dict - elif self.sync == "local": utterances_to_include = local_utterances_set - remote_utterances_set routes_to_delete.extend( @@ -304,7 +303,7 @@ def _sync_index( ) layer_routes[route] = {} if local_utterances_set: - layer_routes[route]["utterances"] = list(local_utterances_set) + layer_routes[route] = {"utterances": list(local_utterances_set)} if local_function_schemas_dict: layer_routes[route][ "function_schemas" @@ -340,12 +339,19 @@ def _sync_index( ) if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} + if local_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = local_function_schemas_dict else: if remote_utterances_set: layer_routes[route] = { "utterances": list(remote_utterances_set) } - + if remote_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = remote_function_schemas_dict elif self.sync == "merge": utterances_to_include = local_utterances_set - remote_utterances_set if local_utterances_set or remote_utterances_set: From da93be388e07a67760ad07207d9097d5a996cded Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:01:24 +0300 Subject: [PATCH 11/27] update the "merge" case --- semantic_router/index/pinecone.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index cb2f58ca..bcc75c62 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -361,6 +361,12 @@ def _sync_index( ) } + if local_function_schemas_dict or remote_function_schemas_dict: + layer_routes[route]["function_schemas"] = { + **remote_function_schemas_dict, + **local_function_schemas_dict, + } + else: raise ValueError("Invalid sync mode specified") From 775cb117857de582a7364837d5be40bf96d6b1ff Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 11:11:42 +0300 Subject: [PATCH 12/27] develop the sync case for "error" --- semantic_router/index/pinecone.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index bcc75c62..31c1bf1b 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -227,8 +227,6 @@ def _sync_index( for route, _, _ in remote_routes } - logger.info(f"remote_routes: {remote_routes}") - for route, utterance, function_schema in remote_routes: logger.info(f"function_schema remote: {function_schema}") remote_dict[route]["utterances"].add(utterance) @@ -278,13 +276,16 @@ def _sync_index( utterances_to_include: set = set() if self.sync == "error": - if local_utterances_set != remote_utterances_set: + if (local_utterances_set != remote_utterances_set) or (local_function_schemas_dict != remote_function_schemas_dict): raise ValueError( f"Synchronization error: Differences found in route '{route}'" ) if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} - + if local_function_schemas_dict: + layer_routes[route][ + "function_schemas" + ] = local_function_schemas_dict elif self.sync == "remote": if remote_utterances_set: layer_routes[route] = {"utterances": list(remote_utterances_set)} From f230223021de7e94fd6002b1dfbdb4ae187136e7 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 12:06:33 +0300 Subject: [PATCH 13/27] various optimizations for remote and local routes --- semantic_router/index/base.py | 2 +- semantic_router/index/local.py | 2 +- semantic_router/index/pinecone.py | 107 ++++++++++++++++-------------- semantic_router/index/qdrant.py | 2 +- semantic_router/layer.py | 38 ++++------- 5 files changed, 74 insertions(+), 77 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 9d824661..750e5d87 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -113,8 +113,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - dimensions: int, local_function_schemas: List[Dict[str, Any]], + dimensions: int, ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index 7bc12bba..be4b48dd 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -52,8 +52,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], + local_function_schemas: List[Dict[str, Any]], dimensions: int, - local_function_schemas: List[str] | None = None, ): if self.sync is not None: logger.error("Sync remove is not implemented for LocalIndex.") diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 31c1bf1b..246f0851 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -212,8 +212,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - dimensions: int, local_function_schemas: List[Dict[str, Any]], + dimensions: int, ) -> Tuple: if self.index is None: @@ -222,23 +222,20 @@ def _sync_index( remote_routes = self.get_routes() - remote_dict = { + remote_dict: Dict[str, Dict[str, Union[set, Dict]]] = { route: {"utterances": set(), "function_schemas": {}} for route, _, _ in remote_routes } for route, utterance, function_schema in remote_routes: - logger.info(f"function_schema remote: {function_schema}") - remote_dict[route]["utterances"].add(utterance) + remote_dict[route]["utterances"].add(utterance) # type: ignore - if not function_schema: - logger.info(f"function_schema remote is empty for {route}") - remote_dict[route]["function_schemas"].update({}) - else: - logger.info(f"function_schema remote is not empty for {route}") - remote_dict[route]["function_schemas"].update(function_schema) + logger.info( + f"function_schema remote is {'empty' if not function_schema else 'not empty'} for {route}" + ) + remote_dict[route]["function_schemas"].update(function_schema or {}) - local_dict = { + local_dict: Dict[str, Dict[str, Union[set, Dict]]] = { route: {"utterances": set(), "function_schemas": {}} for route in local_route_names } @@ -246,14 +243,13 @@ def _sync_index( for route, utterance, function_schema in zip( local_route_names, local_utterances, local_function_schemas ): - logger.info(f"function_schema local: {function_schema}") - local_dict[route]["utterances"].add(utterance) + local_dict[route]["utterances"].add(utterance) # type: ignore local_dict[route]["function_schemas"].update(function_schema) all_routes = set(remote_dict.keys()).union(local_dict.keys()) routes_to_add = [] routes_to_delete = [] - layer_routes = {} + layer_routes: Dict[str, Dict[str, Union[List[str], Dict]]] = {} for route in all_routes: local_utterances_set = local_dict.get(route, {"utterances": set()})[ @@ -276,25 +272,27 @@ def _sync_index( utterances_to_include: set = set() if self.sync == "error": - if (local_utterances_set != remote_utterances_set) or (local_function_schemas_dict != remote_function_schemas_dict): + if (local_utterances_set != remote_utterances_set) or ( + local_function_schemas_dict != remote_function_schemas_dict + ): raise ValueError( f"Synchronization error: Differences found in route '{route}'" ) if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} - if local_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = local_function_schemas_dict + if isinstance(local_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **local_function_schemas_dict + } elif self.sync == "remote": if remote_utterances_set: layer_routes[route] = {"utterances": list(remote_utterances_set)} - if remote_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = remote_function_schemas_dict + if isinstance(remote_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **remote_function_schemas_dict + } elif self.sync == "local": - utterances_to_include = local_utterances_set - remote_utterances_set + utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore routes_to_delete.extend( [ (route, utterance) @@ -305,32 +303,32 @@ def _sync_index( layer_routes[route] = {} if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} - if local_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = local_function_schemas_dict + if isinstance(local_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **local_function_schemas_dict + } elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: utterances_to_include = set(local_utterances) if local_utterances: layer_routes[route] = {"utterances": list(local_utterances)} - if local_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = local_function_schemas_dict + if isinstance(local_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **local_function_schemas_dict + } else: if remote_utterances_set: layer_routes[route] = { "utterances": list(remote_utterances_set) } - if remote_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = remote_function_schemas_dict + if isinstance(remote_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **remote_function_schemas_dict + } elif self.sync == "merge-force-local": if route in local_dict: - utterances_to_include = local_utterances_set - remote_utterances_set + utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore routes_to_delete.extend( [ (route, utterance) @@ -340,32 +338,41 @@ def _sync_index( ) if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} - if local_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = local_function_schemas_dict + if isinstance(local_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **local_function_schemas_dict + } else: if remote_utterances_set: layer_routes[route] = { "utterances": list(remote_utterances_set) } - if remote_function_schemas_dict: - layer_routes[route][ - "function_schemas" - ] = remote_function_schemas_dict + if isinstance(remote_function_schemas_dict, dict): + layer_routes[route]["function_schemas"] = { + **remote_function_schemas_dict + } elif self.sync == "merge": - utterances_to_include = local_utterances_set - remote_utterances_set + utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore if local_utterances_set or remote_utterances_set: layer_routes[route] = { "utterances": list( - remote_utterances_set.union(local_utterances_set) + remote_utterances_set.union(local_utterances_set) # type: ignore ) } if local_function_schemas_dict or remote_function_schemas_dict: - layer_routes[route]["function_schemas"] = { - **remote_function_schemas_dict, - **local_function_schemas_dict, + # Ensure both are dictionaries before merging + layer_routes[route]["function_schemas"] = { # type: ignore + **( + remote_function_schemas_dict + if isinstance(remote_function_schemas_dict, dict) + else {} + ), + **( + local_function_schemas_dict + if isinstance(local_function_schemas_dict, dict) + else {} + ), } else: diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index 0b414cff..11a0a076 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -168,8 +168,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], + local_function_schemas: List[Dict[str, Any]], dimensions: int, - local_function_schemas: List[str] | None = None, ): if self.sync is not None: logger.error("Sync remove is not implemented for QdrantIndex.") diff --git a/semantic_router/layer.py b/semantic_router/layer.py index fee3b665..6a4c4cdf 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -225,16 +225,16 @@ def __init__( self._add_routes(routes=self.routes) def check_for_matching_routes(self, top_class: str) -> Optional[Route]: - matching_routes = [route for route in self.routes if route.name == top_class] - logger.info(f"matching_routes: {matching_routes}") - logger.info(f"self.routes: {self.routes}") - if not matching_routes: + # Use next with a generator expression for optimization + matching_route = next( + (route for route in self.routes if route.name == top_class), None + ) + if matching_route is None: logger.error( f"No route found with name {top_class}. Check to see if any Routes " "have been defined." ) - return None - return matching_routes[0] + return matching_route def __call__( self, @@ -496,7 +496,7 @@ def _add_routes(self, routes: List[Route]): function_schemas=( route.function_schemas * len(route.utterances) if route.function_schemas - else [""] * len(route.utterances) # type: ignore + else [{}] * len(route.utterances) ), ) except Exception as e: @@ -519,27 +519,17 @@ def _add_and_sync_routes(self, routes: List[Route]): ) layer_routes: List[Route] = [] - logger.info(f"layer_routes_dict: {layer_routes_dict}") + for route in layer_routes_dict.keys(): - logger.info(f"route name: {route}") route_dict = layer_routes_dict[route] function_schemas = route_dict.get("function_schemas", None) - if not function_schemas: - layer_routes.append( - Route( - name=route, - utterances=route_dict["utterances"], - function_schemas=None, - ) - ) - else: - layer_routes.append( - Route( - name=route, - utterances=route_dict["utterances"], - function_schemas=[function_schemas], - ) + layer_routes.append( + Route( + name=route, + utterances=route_dict["utterances"], + function_schemas=[function_schemas] if function_schemas else None, ) + ) data_to_delete: dict = {} for route, utterance in routes_to_delete: From 41d1c15b80f64175ef9f77869d3dcadf4dbc80b2 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:26:11 +0300 Subject: [PATCH 14/27] optimize the '_add_routes' function --- semantic_router/layer.py | 60 +++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 6a4c4cdf..aa4a8ef6 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -482,28 +482,44 @@ def _refresh_routes(self): self.routes.append(route) def _add_routes(self, routes: List[Route]): - if routes: - for route in routes: - logger.info(f"Adding `{route.name}` route") - embeddings = self.encoder(route.utterances) - if route.score_threshold is None: - route.score_threshold = self.score_threshold - try: - self.index.add( - embeddings=embeddings, - routes=[route.name] * len(route.utterances), - utterances=route.utterances, - function_schemas=( - route.function_schemas * len(route.utterances) - if route.function_schemas - else [{}] * len(route.utterances) - ), - ) - except Exception as e: - logger.error( - f"Failed to add route `{route.name}` to the index: {e}" - ) - raise Exception(f"Indexing error for route `{route.name}`") from e + if not routes: + logger.warning("No routes provided to add.") + return + + route_names = [] + all_embeddings = [] + all_utterances = [] + all_function_schemas = [] + + for route in routes: + logger.info(f"Adding `{route.name}` route") + route_embeddings = self.encoder(route.utterances) + + # Set score_threshold if not already set + route.score_threshold = route.score_threshold or self.score_threshold + + # Prepare data for batch insertion + route_names.extend([route.name] * len(route.utterances)) + all_embeddings.extend(route_embeddings) + all_utterances.extend(route.utterances) + all_function_schemas.extend( + route.function_schemas * len(route.utterances) + if route.function_schemas + else [{}] * len(route.utterances) + ) + + try: + # Batch insertion into the index + self.index.add( + embeddings=all_embeddings, + routes=route_names, + utterances=all_utterances, + function_schemas=all_function_schemas, + ) + except Exception as e: + logger.error(f"Failed to add routes to the index: {e}") + raise Exception("Indexing error occurred") from e + def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting From 4286b5461ff9c9f2d1a2d271b5db932a816f3a82 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:29:07 +0300 Subject: [PATCH 15/27] lint --- semantic_router/layer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index aa4a8ef6..4f478ea0 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -488,16 +488,16 @@ def _add_routes(self, routes: List[Route]): route_names = [] all_embeddings = [] - all_utterances = [] + all_utterances: List[str] = [] all_function_schemas = [] for route in routes: logger.info(f"Adding `{route.name}` route") route_embeddings = self.encoder(route.utterances) - + # Set score_threshold if not already set route.score_threshold = route.score_threshold or self.score_threshold - + # Prepare data for batch insertion route_names.extend([route.name] * len(route.utterances)) all_embeddings.extend(route_embeddings) @@ -520,7 +520,6 @@ def _add_routes(self, routes: List[Route]): logger.error(f"Failed to add routes to the index: {e}") raise Exception("Indexing error occurred") from e - def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting local_route_names, local_utterances, local_function_schemas = ( From 3402afd35a9abd8348e1f2dac2837440602ed1ed Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Wed, 28 Aug 2024 19:16:57 +0300 Subject: [PATCH 16/27] update the test layer --- tests/unit/test_layer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 05979cd2..4566401d 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -87,7 +87,7 @@ def cohere_encoder(mocker): @pytest.fixture def openai_encoder(mocker): mocker.patch.object(OpenAIEncoder, "__call__", side_effect=mock_encoder_call) - return OpenAIEncoder(name="text-embedding-3-small", openai_api_key="test_api_key") + return OpenAIEncoder(name="text-embedding-ada-002", openai_api_key="test_api_key") @pytest.fixture @@ -155,8 +155,8 @@ def test_initialization(self, openai_encoder, routes, index_cls): route_layer = RouteLayer( encoder=openai_encoder, routes=routes, top_k=10, index=index_cls() ) - assert openai_encoder.score_threshold == 0.3 - assert route_layer.score_threshold == 0.3 + assert openai_encoder.score_threshold == 0.82 + assert route_layer.score_threshold == 0.82 assert route_layer.top_k == 10 assert len(route_layer.index) if route_layer.index is not None else 0 == 5 assert ( @@ -172,7 +172,7 @@ def test_initialization_different_encoders( assert cohere_encoder.score_threshold == 0.3 assert route_layer_cohere.score_threshold == 0.3 route_layer_openai = RouteLayer(encoder=openai_encoder, index=index_cls()) - assert route_layer_openai.score_threshold == 0.3 + assert route_layer_openai.score_threshold == 0.82 def test_initialization_no_encoder(self, openai_encoder, index_cls): os.environ["OPENAI_API_KEY"] = "test_api_key" @@ -189,8 +189,8 @@ def test_initialization_dynamic_route( route_layer_openai = RouteLayer( encoder=openai_encoder, routes=dynamic_routes, index=index_cls() ) - assert openai_encoder.score_threshold == 0.3 - assert route_layer_openai.score_threshold == 0.3 + assert openai_encoder.score_threshold == 0.82 + assert route_layer_openai.score_threshold == 0.82 def test_add_route(self, openai_encoder, index_cls): route_layer = RouteLayer(encoder=openai_encoder, index=index_cls()) @@ -542,7 +542,7 @@ def test_get_thresholds(self, openai_encoder, routes, index_cls): route_layer = RouteLayer( encoder=openai_encoder, routes=routes, index=index_cls() ) - assert route_layer.get_thresholds() == {"Route 1": 0.3, "Route 2": 0.3} + assert route_layer.get_thresholds() == {"Route 1": 0.82, "Route 2": 0.82} def test_with_multiple_routes_passing_threshold( self, openai_encoder, routes, index_cls From cdbd95e843044ed7547ebbc3c5541eed7ded9461 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Wed, 28 Aug 2024 19:41:53 +0300 Subject: [PATCH 17/27] fix the test_layer --- tests/unit/test_layer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_layer.py b/tests/unit/test_layer.py index 4566401d..05979cd2 100644 --- a/tests/unit/test_layer.py +++ b/tests/unit/test_layer.py @@ -87,7 +87,7 @@ def cohere_encoder(mocker): @pytest.fixture def openai_encoder(mocker): mocker.patch.object(OpenAIEncoder, "__call__", side_effect=mock_encoder_call) - return OpenAIEncoder(name="text-embedding-ada-002", openai_api_key="test_api_key") + return OpenAIEncoder(name="text-embedding-3-small", openai_api_key="test_api_key") @pytest.fixture @@ -155,8 +155,8 @@ def test_initialization(self, openai_encoder, routes, index_cls): route_layer = RouteLayer( encoder=openai_encoder, routes=routes, top_k=10, index=index_cls() ) - assert openai_encoder.score_threshold == 0.82 - assert route_layer.score_threshold == 0.82 + assert openai_encoder.score_threshold == 0.3 + assert route_layer.score_threshold == 0.3 assert route_layer.top_k == 10 assert len(route_layer.index) if route_layer.index is not None else 0 == 5 assert ( @@ -172,7 +172,7 @@ def test_initialization_different_encoders( assert cohere_encoder.score_threshold == 0.3 assert route_layer_cohere.score_threshold == 0.3 route_layer_openai = RouteLayer(encoder=openai_encoder, index=index_cls()) - assert route_layer_openai.score_threshold == 0.82 + assert route_layer_openai.score_threshold == 0.3 def test_initialization_no_encoder(self, openai_encoder, index_cls): os.environ["OPENAI_API_KEY"] = "test_api_key" @@ -189,8 +189,8 @@ def test_initialization_dynamic_route( route_layer_openai = RouteLayer( encoder=openai_encoder, routes=dynamic_routes, index=index_cls() ) - assert openai_encoder.score_threshold == 0.82 - assert route_layer_openai.score_threshold == 0.82 + assert openai_encoder.score_threshold == 0.3 + assert route_layer_openai.score_threshold == 0.3 def test_add_route(self, openai_encoder, index_cls): route_layer = RouteLayer(encoder=openai_encoder, index=index_cls()) @@ -542,7 +542,7 @@ def test_get_thresholds(self, openai_encoder, routes, index_cls): route_layer = RouteLayer( encoder=openai_encoder, routes=routes, index=index_cls() ) - assert route_layer.get_thresholds() == {"Route 1": 0.82, "Route 2": 0.82} + assert route_layer.get_thresholds() == {"Route 1": 0.3, "Route 2": 0.3} def test_with_multiple_routes_passing_threshold( self, openai_encoder, routes, index_cls From 0729eb9b3bfdceff0e1d87844451733be2bc5506 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Wed, 28 Aug 2024 22:17:02 +0200 Subject: [PATCH 18/27] fix: python 3.9 backwards compatibility --- semantic_router/index/base.py | 2 +- semantic_router/index/local.py | 2 +- semantic_router/index/pinecone.py | 2 +- semantic_router/index/postgres.py | 2 +- semantic_router/index/qdrant.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 750e5d87..44bbecf9 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -26,7 +26,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], - function_schemas: List[Dict[str, Any]] | None = None, + function_schemas: Optional[List[Dict[str, Any]]] = None, ): """ Add embeddings to the index. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index be4b48dd..e979c40b 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -27,7 +27,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] | None = None, + function_schemas: Optional[List[Dict[str, Any]]] = None, ): embeds = np.array(embeddings) # type: ignore routes_arr = np.array(routes) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 26e0b40e..875a488c 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -399,7 +399,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] | None = None, + function_schemas: Optional[List[Dict[str, Any]]] = None, batch_size: int = 100, ): """Add vectors to Pinecone in batches.""" diff --git a/semantic_router/index/postgres.py b/semantic_router/index/postgres.py index b4e133fe..0d18381f 100644 --- a/semantic_router/index/postgres.py +++ b/semantic_router/index/postgres.py @@ -259,7 +259,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[Any], - function_schemas: List[Dict[str, Any]] | None = None, + function_schemas: Optional[List[Dict[str, Any]]] = None, ) -> None: """ Adds vectors to the index. diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index 11a0a076..ec809577 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -179,7 +179,7 @@ def add( embeddings: List[List[float]], routes: List[str], utterances: List[str], - function_schemas: List[Dict[str, Any]] | None = None, + function_schemas: Optional[List[Dict[str, Any]]] = None, batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, ): self.dimensions = self.dimensions or len(embeddings[0]) From cdf43b705c671d18e1ce4ad1ddfbb902dd89d737 Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:19:11 +0300 Subject: [PATCH 19/27] fix the "merge-force-remote" issue --- semantic_router/index/base.py | 2 +- semantic_router/index/local.py | 2 +- semantic_router/index/pinecone.py | 8 ++++---- semantic_router/index/qdrant.py | 2 +- semantic_router/layer.py | 10 ++++------ 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 44bbecf9..08c6ad6c 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -113,8 +113,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - local_function_schemas: List[Dict[str, Any]], dimensions: int, + local_function_schemas: List[Dict[str, Any]], ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index e979c40b..68a859b5 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -52,8 +52,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - local_function_schemas: List[Dict[str, Any]], dimensions: int, + local_function_schemas: List[Dict[str, Any]], ): if self.sync is not None: logger.error("Sync remove is not implemented for LocalIndex.") diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 875a488c..3893acff 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -216,8 +216,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - local_function_schemas: List[Dict[str, Any]], dimensions: int, + local_function_schemas: List[Dict[str, Any]], ) -> Tuple: if self.index is None: @@ -313,9 +313,9 @@ def _sync_index( } elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: - utterances_to_include = set(local_utterances) - if local_utterances: - layer_routes[route] = {"utterances": list(local_utterances)} + utterances_to_include = local_utterances_set + if local_utterances_set: + layer_routes[route] = {"utterances": list(local_utterances_set)} if isinstance(local_function_schemas_dict, dict): layer_routes[route]["function_schemas"] = { **local_function_schemas_dict diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index ec809577..180bb3b3 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -168,8 +168,8 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - local_function_schemas: List[Dict[str, Any]], dimensions: int, + local_function_schemas: List[Dict[str, Any]], ): if self.sync is not None: logger.error("Sync remove is not implemented for QdrantIndex.") diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 4f478ea0..f5c270ca 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -180,7 +180,7 @@ def __init__( self, encoder: Optional[BaseEncoder] = None, llm: Optional[BaseLLM] = None, - routes: List[Route] = [], + routes: Optional[List[Route]] = None, index: Optional[BaseIndex] = None, # type: ignore top_k: int = 5, aggregation: str = "sum", @@ -195,7 +195,7 @@ def __init__( else: self.encoder = encoder self.llm = llm - self.routes = routes + self.routes = routes if routes else [] if self.encoder.score_threshold is None: raise ValueError( "No score threshold provided for encoder. Please set the score threshold " @@ -216,12 +216,10 @@ def __init__( for route in self.routes: if route.score_threshold is None: route.score_threshold = self.score_threshold - # if routes list has been passed, we initialize index now if self.index.sync: self._add_and_sync_routes(routes=self.routes) - - if self.routes: + elif self.routes: self._add_routes(routes=self.routes) def check_for_matching_routes(self, top_class: str) -> Optional[Route]: @@ -529,8 +527,8 @@ def _add_and_sync_routes(self, routes: List[Route]): routes_to_add, routes_to_delete, layer_routes_dict = self.index._sync_index( local_route_names=local_route_names, local_utterances=local_utterances, - local_function_schemas=local_function_schemas, dimensions=len(self.encoder(["dummy"])[0]), + local_function_schemas=local_function_schemas, ) layer_routes: List[Route] = [] From 2796e679566b4ddfa7dde669e59b4ffa2b9caa0f Mon Sep 17 00:00:00 2001 From: tolgadevAI <164843802+tolgadevAI@users.noreply.github.com> Date: Thu, 29 Aug 2024 15:26:10 +0300 Subject: [PATCH 20/27] fix the local_utterances_set --- semantic_router/index/pinecone.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 3893acff..d387e8ab 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -313,7 +313,11 @@ def _sync_index( } elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: - utterances_to_include = local_utterances_set + utterances_to_include = ( + local_utterances_set + if isinstance(local_utterances_set, set) + else set() + ) if local_utterances_set: layer_routes[route] = {"utterances": list(local_utterances_set)} if isinstance(local_function_schemas_dict, dict): From cf684085a8ed8cede387145e3e1cb137e39ff823 Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:06:46 +0200 Subject: [PATCH 21/27] Implemented custom metadata. Still to solve linting issues --- semantic_router/index/base.py | 4 +- semantic_router/index/local.py | 4 +- semantic_router/index/pinecone.py | 279 +++++++++++++++--------------- semantic_router/index/postgres.py | 3 +- semantic_router/index/qdrant.py | 6 +- semantic_router/layer.py | 136 +++++++-------- semantic_router/route.py | 1 + 7 files changed, 219 insertions(+), 214 deletions(-) diff --git a/semantic_router/index/base.py b/semantic_router/index/base.py index 08c6ad6c..5ddb586e 100644 --- a/semantic_router/index/base.py +++ b/semantic_router/index/base.py @@ -27,6 +27,7 @@ def add( routes: List[str], utterances: List[Any], function_schemas: Optional[List[Dict[str, Any]]] = None, + metadata_list: List[Dict[str, Any]] = [], ): """ Add embeddings to the index. @@ -113,8 +114,9 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - dimensions: int, local_function_schemas: List[Dict[str, Any]], + local_metadata: List[Dict[str, Any]], + dimensions: int, ): """ Synchronize the local index with the remote index based on the specified mode. diff --git a/semantic_router/index/local.py b/semantic_router/index/local.py index 68a859b5..09e23ffc 100644 --- a/semantic_router/index/local.py +++ b/semantic_router/index/local.py @@ -28,6 +28,7 @@ def add( routes: List[str], utterances: List[str], function_schemas: Optional[List[Dict[str, Any]]] = None, + metadata_list: List[Dict[str, Any]] = [], ): embeds = np.array(embeddings) # type: ignore routes_arr = np.array(routes) @@ -52,8 +53,9 @@ def _sync_index( self, local_route_names: List[str], local_utterances: List[str], - dimensions: int, local_function_schemas: List[Dict[str, Any]], + local_metadata: List[Dict[str, Any]], + dimensions: int, ): if self.sync is not None: logger.error("Sync remove is not implemented for LocalIndex.") diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 6f405841..0231f53f 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -24,6 +24,7 @@ class PineconeRecord(BaseModel): route: str utterance: str function_schema: str + metadata: Dict[str, Any] = {} # Additional metadata dictionary def __init__(self, **data): super().__init__(**data) @@ -31,16 +32,19 @@ def __init__(self, **data): # Use SHA-256 for a more secure hash utterance_id = hashlib.sha256(self.utterance.encode()).hexdigest() self.id = f"{clean_route}#{utterance_id}" + self.metadata.update( + { + "sr_route": self.route, + "sr_utterance": self.utterance, + "sr_function_schema": self.function_schema, + } + ) def to_dict(self): return { "id": self.id, "values": self.values, - "metadata": { - "sr_route": self.route, - "sr_utterance": self.utterance, - "sr_function_schemas": self.function_schema, - }, + "metadata": self.metadata, } @@ -214,179 +218,162 @@ async def _init_async_index(self, force_create: bool = False): def _sync_index( self, local_route_names: List[str], - local_utterances: List[str], + local_utterances_list: List[str], + local_function_schemas_list: List[Dict[str, Any]], + local_metadata_list: List[Dict[str, Any]], dimensions: int, - local_function_schemas: List[Dict[str, Any]], - ) -> Tuple: - + ) -> Tuple[List,List,Dict]: if self.index is None: self.dimensions = self.dimensions or dimensions self.index = self._init_index(force_create=True) remote_routes = self.get_routes() - remote_dict: Dict[str, Dict[str, Union[set, Dict]]] = { - route: {"utterances": set(), "function_schemas": {}} - for route, _, _ in remote_routes - } - - for route, utterance, function_schema in remote_routes: - remote_dict[route]["utterances"].add(utterance) # type: ignore - - logger.info( - f"function_schema remote is {'empty' if not function_schema else 'not empty'} for {route}" - ) - remote_dict[route]["function_schemas"].update(function_schema or {}) - - local_dict: Dict[str, Dict[str, Union[set, Dict]]] = { - route: {"utterances": set(), "function_schemas": {}} - for route in local_route_names + # Create remote dictionary for storing utterances and metadata + remote_dict: Dict[str, Dict[str, Any]] = { + route: {"utterances": set(), "function_schemas": function_schemas, "metadata": metadata} + for route, utterance, function_schemas, metadata in remote_routes } + for route, utterance, function_schemas, metadata in remote_routes: + remote_dict[route]["utterances"].add(utterance) - for route, utterance, function_schema in zip( - local_route_names, local_utterances, local_function_schemas + # Create local dictionary for storing utterances and metadata + local_dict: Dict[str, Dict[str, Any]] = {} + for route, utterance, function_schemas, metadata in zip( + local_route_names, local_utterances_list, local_function_schemas_list, local_metadata_list ): - local_dict[route]["utterances"].add(utterance) # type: ignore - local_dict[route]["function_schemas"].update(function_schema) + if route not in local_dict: + local_dict[route] = {"utterances": set(), "function_schemas": function_schemas, "metadata": metadata} + local_dict[route]["utterances"].add(utterance) + local_dict[route]["function_schemas"] = function_schemas + local_dict[route]["metadata"] = metadata all_routes = set(remote_dict.keys()).union(local_dict.keys()) + routes_to_add = [] routes_to_delete = [] - layer_routes: Dict[str, Dict[str, Union[List[str], Dict]]] = {} + layer_routes = {} for route in all_routes: - local_utterances_set = local_dict.get(route, {"utterances": set()})[ - "utterances" - ] - remote_utterances_set = remote_dict.get(route, {"utterances": set()})[ - "utterances" - ] - local_function_schemas_dict = local_dict.get(route, {}).get( - "function_schemas", {} - ) - - remote_function_schemas_dict = remote_dict.get( - route, {"function_schemas": {}} - )["function_schemas"] + local_utterances = local_dict.get(route, {}).get("utterances", set()) + remote_utterances = remote_dict.get(route, {}).get("utterances", set()) + local_function_schemas = local_dict.get(route, {}).get("function_schemas", {}) + remote_function_schemas = remote_dict.get(route, {}).get("function_schemas", {}) + local_metadata = local_dict.get(route, {}).get("metadata", {}) + remote_metadata = remote_dict.get(route, {}).get("metadata", {}) - if not local_utterances_set and not remote_utterances_set: - continue + utterances_to_include = set() - utterances_to_include: set = set() + metadata_changed = local_metadata != remote_metadata + function_schema_changed = local_function_schemas != remote_function_schemas if self.sync == "error": - if (local_utterances_set != remote_utterances_set) or ( - local_function_schemas_dict != remote_function_schemas_dict + if ( + local_utterances != remote_utterances + or local_function_schemas != remote_function_schemas + or local_metadata != remote_metadata ): raise ValueError( f"Synchronization error: Differences found in route '{route}'" ) - if local_utterances_set: - layer_routes[route] = {"utterances": list(local_utterances_set)} - if isinstance(local_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **local_function_schemas_dict + + if local_utterances: + layer_routes[route] = { + "utterances": list(local_utterances), + "function_schemas": local_function_schemas, + "metadata": local_metadata, } + elif self.sync == "remote": - if remote_utterances_set: - layer_routes[route] = {"utterances": list(remote_utterances_set)} - if isinstance(remote_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **remote_function_schemas_dict + if remote_utterances: + layer_routes[route] = { + "utterances": list(remote_utterances), + "function_schemas": remote_function_schemas, + "metadata": remote_metadata, } + elif self.sync == "local": - utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore + utterances_to_include = local_utterances - remote_utterances routes_to_delete.extend( [ (route, utterance) - for utterance in remote_utterances_set - if utterance not in local_utterances_set + for utterance in remote_utterances + if utterance not in local_utterances ] ) - layer_routes[route] = {} - if local_utterances_set: - layer_routes[route] = {"utterances": list(local_utterances_set)} - if isinstance(local_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **local_function_schemas_dict + if local_utterances: + layer_routes[route] = { + "utterances": list(local_utterances), + "function_schemas": local_function_schemas, + "metadata": local_metadata, } + elif self.sync == "merge-force-remote": if route in local_dict and route not in remote_dict: - utterances_to_include = ( - local_utterances_set - if isinstance(local_utterances_set, set) - else set() - ) - if local_utterances_set: - layer_routes[route] = {"utterances": list(local_utterances_set)} - if isinstance(local_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **local_function_schemas_dict + utterances_to_include = local_utterances + if local_utterances: + layer_routes[route] = { + "utterances": list(local_utterances), + "function_schemas": local_function_schemas, + "metadata": local_metadata, } else: - if remote_utterances_set: + if remote_utterances: layer_routes[route] = { - "utterances": list(remote_utterances_set) - } - if isinstance(remote_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **remote_function_schemas_dict + "utterances": list(remote_utterances), + "function_schemas": remote_function_schemas, + "metadata": remote_metadata, } elif self.sync == "merge-force-local": if route in local_dict: - utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore + utterances_to_include = local_utterances - remote_utterances routes_to_delete.extend( [ (route, utterance) - for utterance in remote_utterances_set - if utterance not in local_utterances_set + for utterance in remote_utterances + if utterance not in local_utterances ] ) - if local_utterances_set: - layer_routes[route] = {"utterances": list(local_utterances_set)} - if isinstance(local_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **local_function_schemas_dict + if local_utterances: + layer_routes[route] = { + "utterances": list(local_utterances), + "function_schemas": local_function_schemas, + "metadata": local_metadata, } else: - if remote_utterances_set: + if remote_utterances: layer_routes[route] = { - "utterances": list(remote_utterances_set) - } - if isinstance(remote_function_schemas_dict, dict): - layer_routes[route]["function_schemas"] = { - **remote_function_schemas_dict + "utterances": list(remote_utterances), + "function_schemas": remote_function_schemas, + "metadata": remote_metadata, } + elif self.sync == "merge": - utterances_to_include = local_utterances_set - remote_utterances_set # type: ignore - if local_utterances_set or remote_utterances_set: + utterances_to_include = local_utterances - remote_utterances + if local_utterances or remote_utterances: + # Here metadata are merged, with local metadata taking precedence for same keys + merged_metadata = {**remote_metadata, **local_metadata} + merged_function_schemas = {**remote_function_schemas, **local_function_schemas} layer_routes[route] = { - "utterances": list( - remote_utterances_set.union(local_utterances_set) # type: ignore - ) - } - - if local_function_schemas_dict or remote_function_schemas_dict: - # Ensure both are dictionaries before merging - layer_routes[route]["function_schemas"] = { # type: ignore - **( - remote_function_schemas_dict - if isinstance(remote_function_schemas_dict, dict) - else {} - ), - **( - local_function_schemas_dict - if isinstance(local_function_schemas_dict, dict) - else {} - ), + "utterances": list(remote_utterances.union(local_utterances)), + "function_schemas": merged_function_schemas, + "metadata": merged_metadata, } else: raise ValueError("Invalid sync mode specified") - for utterance in utterances_to_include: - routes_to_add.append((route, utterance)) + # Add utterances if metadata has changed or if there are new utterances + if (metadata_changed or function_schema_changed) and self.sync in ["local", "merge-force-local"]: + for utterance in local_utterances: + routes_to_add.append((route, utterance, local_function_schemas, local_metadata)) + if (metadata_changed or function_schema_changed) and self.sync == "merge": + for utterance in local_utterances: + routes_to_add.append((route, utterance, merged_function_schemas, merged_metadata)) + elif utterances_to_include: + for utterance in utterances_to_include: + routes_to_add.append((route, utterance, local_function_schemas, local_metadata)) return routes_to_add, routes_to_delete, layer_routes @@ -403,6 +390,7 @@ def add( routes: List[str], utterances: List[str], function_schemas: Optional[List[Dict[str, Any]]] = None, + metadata_list: List[Dict[str, Any]] = [], batch_size: int = 100, ): """Add vectors to Pinecone in batches.""" @@ -416,9 +404,10 @@ def add( route=route, utterance=utterance, function_schema=json.dumps(function_schema), + metadata=metadata, ).to_dict() - for vector, route, utterance, function_schema in zip( - embeddings, routes, utterances, function_schemas # type: ignore + for vector, route, utterance, function_schema, metadata in zip( + embeddings, routes, utterances, function_schemas, metadata_list # type: ignore ) ] @@ -484,26 +473,30 @@ def _get_all(self, prefix: Optional[str] = None, include_metadata: bool = False) def get_routes(self) -> List[Tuple]: """ - Gets a list of route and utterance objects currently stored in the index. + Gets a list of route and utterance objects currently stored in the index, including additional metadata. Returns: - List[Tuple]: A list of (route_name, utterance) objects. + List[Tuple]: A list of tuples, each containing route, utterance, function schema and additional metadata. """ - # Get all records _, metadata = self._get_all(include_metadata=True) route_tuples = [ ( - route_objects["sr_route"], - route_objects["sr_utterance"], + data.get("sr_route", ""), + data.get("sr_utterance", ""), ( - json.loads(route_objects["sr_function_schemas"]) - if route_objects["sr_function_schemas"] + json.loads(data["sr_function_schema"]) + if data.get("sr_function_schema", "") else {} ), + { + key: value + for key, value in data.items() + if key not in ["sr_route", "sr_utterance", "sr_function_schema"] + }, ) - for route_objects in metadata + for data in metadata ] - return route_tuples + return route_tuples # type: ignore def delete(self, route_name: str): route_vec_ids = self._get_route_ids(route_name=route_name) @@ -765,16 +758,32 @@ async def _async_fetch_metadata(self, vector_id: str) -> dict: response_data.get("vectors", {}).get(vector_id, {}).get("metadata", {}) ) - async def _async_get_routes(self) -> list[tuple]: + async def _async_get_routes(self) -> List[Tuple]: """ - Gets a list of route and utterance objects currently stored in the index. + Asynchronously gets a list of route and utterance objects currently stored in the index, including additional metadata. Returns: - List[Tuple]: A list of (route_name, utterance) objects. + List[Tuple]: A list of tuples, each containing route, utterance, function schema and additional metadata. """ _, metadata = await self._async_get_all(include_metadata=True) - route_tuples = [(x["sr_route"], x["sr_utterance"]) for x in metadata] - return route_tuples + route_info = [ + ( + data.get("sr_route", ""), + data.get("sr_utterance", ""), + ( + json.loads(data["sr_function_schema"]) + if data["sr_function_schema"] + else {} + ), + { + key: value + for key, value in data.items() + if key not in ["sr_route", "sr_utterance", "sr_function_schema"] + }, + ) + for data in metadata + ] + return route_info # type: ignore def __len__(self): return self.index.describe_index_stats()["total_vector_count"] diff --git a/semantic_router/index/postgres.py b/semantic_router/index/postgres.py index 0d18381f..ff63ec09 100644 --- a/semantic_router/index/postgres.py +++ b/semantic_router/index/postgres.py @@ -258,8 +258,9 @@ def add( self, embeddings: List[List[float]], routes: List[str], - utterances: List[Any], + utterances: List[str], function_schemas: Optional[List[Dict[str, Any]]] = None, + metadata_list: List[Dict[str, Any]] = [], ) -> None: """ Adds vectors to the index. diff --git a/semantic_router/index/qdrant.py b/semantic_router/index/qdrant.py index 180bb3b3..b372c49c 100644 --- a/semantic_router/index/qdrant.py +++ b/semantic_router/index/qdrant.py @@ -167,9 +167,10 @@ def _remove_and_sync(self, routes_to_delete: dict): def _sync_index( self, local_route_names: List[str], - local_utterances: List[str], - dimensions: int, + local_utterances_list: List[str], local_function_schemas: List[Dict[str, Any]], + local_metadata_list: List[Dict[str, Any]], + dimensions: int, ): if self.sync is not None: logger.error("Sync remove is not implemented for QdrantIndex.") @@ -180,6 +181,7 @@ def add( routes: List[str], utterances: List[str], function_schemas: Optional[List[Dict[str, Any]]] = None, + metadata_list: List[Dict[str, Any]] = [], batch_size: int = DEFAULT_UPLOAD_BATCH_SIZE, ): self.dimensions = self.dimensions or len(embeddings[0]) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index f5c270ca..7baa27c8 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -218,12 +218,15 @@ def __init__( route.score_threshold = self.score_threshold # if routes list has been passed, we initialize index now if self.index.sync: - self._add_and_sync_routes(routes=self.routes) - elif self.routes: + # initialize index now + if len(self.routes) > 0: + self._add_and_sync_routes(routes=self.routes) + else: + self._add_and_sync_routes(routes=[]) + elif len(self.routes) > 0: self._add_routes(routes=self.routes) def check_for_matching_routes(self, top_class: str) -> Optional[Route]: - # Use next with a generator expression for optimization matching_route = next( (route for route in self.routes if route.name == top_class), None ) @@ -232,6 +235,7 @@ def check_for_matching_routes(self, top_class: str) -> Optional[Route]: f"No route found with name {top_class}. Check to see if any Routes " "have been defined." ) + return None return matching_route def __call__( @@ -388,14 +392,6 @@ def _check_threshold(self, scores: List[float], route: Optional[Route]) -> bool: ) return self._pass_threshold(scores, threshold) - def _set_layer_routes(self, new_routes: List[Route]): - """ - Set and override the current routes with a new list of routes. - - :param new_routes: List of Route objects to set as the current routes. - """ - self.routes = new_routes - def __str__(self): return ( f"RouteLayer(encoder={self.encoder}, " @@ -421,16 +417,9 @@ def from_config(cls, config: LayerConfig, index: Optional[BaseIndex] = None): return cls(encoder=encoder, routes=config.routes, index=index) def add(self, route: Route): - logger.info(f"Adding `{route.name}` route") - # create embeddings - embeds = self.encoder(route.utterances) - # if route has no score_threshold, use default - if route.score_threshold is None: - route.score_threshold = self.score_threshold - - # add routes to the index + embedded_utterances = self.encoder(route.utterances) self.index.add( - embeddings=embeds, + embeddings=embedded_utterances, routes=[route.name] * len(route.utterances), utterances=route.utterances, function_schemas=( @@ -438,6 +427,7 @@ def add(self, route: Route): if route.function_schemas else [{}] * len(route.utterances) ), + metadata_list=[route.metadata] * len(route.utterances), ) self.routes.append(route) @@ -483,36 +473,18 @@ def _add_routes(self, routes: List[Route]): if not routes: logger.warning("No routes provided to add.") return - - route_names = [] - all_embeddings = [] - all_utterances: List[str] = [] - all_function_schemas = [] - - for route in routes: - logger.info(f"Adding `{route.name}` route") - route_embeddings = self.encoder(route.utterances) - - # Set score_threshold if not already set - route.score_threshold = route.score_threshold or self.score_threshold - - # Prepare data for batch insertion - route_names.extend([route.name] * len(route.utterances)) - all_embeddings.extend(route_embeddings) - all_utterances.extend(route.utterances) - all_function_schemas.extend( - route.function_schemas * len(route.utterances) - if route.function_schemas - else [{}] * len(route.utterances) - ) - + # create embeddings for all routes + route_names, all_utterances, all_metadata = self._extract_routes_details( + routes, include_metadata=True + ) + embedded_utterances = self.encoder(all_utterances) try: # Batch insertion into the index self.index.add( - embeddings=all_embeddings, + embeddings=embedded_utterances, routes=route_names, utterances=all_utterances, - function_schemas=all_function_schemas, + metadata_list=all_metadata, ) except Exception as e: logger.error(f"Failed to add routes to the index: {e}") @@ -520,55 +492,63 @@ def _add_routes(self, routes: List[Route]): def _add_and_sync_routes(self, routes: List[Route]): # create embeddings for all routes and sync at startup with remote ones based on sync setting - local_route_names, local_utterances, local_function_schemas = ( - self._extract_routes_details(routes) + local_route_names, local_utterances, local_function_schemas, local_metadata = ( + self._extract_routes_details(routes, include_metadata=True) ) routes_to_add, routes_to_delete, layer_routes_dict = self.index._sync_index( - local_route_names=local_route_names, - local_utterances=local_utterances, + local_route_names, + local_utterances, + local_function_schemas, + local_metadata, dimensions=len(self.encoder(["dummy"])[0]), - local_function_schemas=local_function_schemas, ) - layer_routes: List[Route] = [] + logger.info(f"Routes to add: {routes_to_add}") + logger.info(f"Routes to delete: {routes_to_delete}") + logger.info(f"Layer routes: {layer_routes_dict}") - for route in layer_routes_dict.keys(): - route_dict = layer_routes_dict[route] - function_schemas = route_dict.get("function_schemas", None) - layer_routes.append( - Route( - name=route, - utterances=route_dict["utterances"], - function_schemas=[function_schemas] if function_schemas else None, - ) - ) - - data_to_delete: dict = {} + data_to_delete = {} # type: ignore for route, utterance in routes_to_delete: data_to_delete.setdefault(route, []).append(utterance) self.index._remove_and_sync(data_to_delete) - all_utterances_to_add = [utt for _, utt in routes_to_add] + # Prepare data for addition + if routes_to_add: + ( + route_names_to_add, + all_utterances_to_add, + function_schemas_to_add, + metadata_to_add, + ) = map(list, zip(*routes_to_add)) + else: + ( + route_names_to_add, + all_utterances_to_add, + function_schemas_to_add, + metadata_to_add, + ) = ([], [], [], []) + embedded_utterances_to_add = ( self.encoder(all_utterances_to_add) if all_utterances_to_add else [] ) - route_names_to_add = [route for route, _, in routes_to_add] - self.index.add( embeddings=embedded_utterances_to_add, routes=route_names_to_add, utterances=all_utterances_to_add, - function_schemas=local_function_schemas, + function_schemas=function_schemas_to_add, + metadata_list=metadata_to_add, ) - logger.info(f"layer_routes: {layer_routes}") - - self._set_layer_routes(layer_routes) + # Update local route layer state + self.routes = [ + Route(name=route, utterances=data.get("utterances", []), function_schemas=[data.get("function_schemas", None)], metadata=data.get("metadata", {})) + for route, data in layer_routes_dict.items() + ] def _extract_routes_details( - self, routes: List[Route] + self, routes: List[Route], include_metadata: bool = False ) -> Tuple[list[str], list[str], List[Dict[str, Any]]]: route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] @@ -577,6 +557,10 @@ def _extract_routes_details( for route in routes for _ in route.utterances ] + + if include_metadata: + metadata = [route.metadata for route in routes for _ in route.utterances] + return route_names, utterances, function_schemas, metadata return route_names, utterances, function_schemas def _encode(self, text: str) -> Any: @@ -771,11 +755,15 @@ def fit( remote_routes = self.index.get_routes() # TODO Enhance by retrieving directly the vectors instead of embedding all utterances again - routes = [route_tuple[0] for route_tuple in remote_routes] - utterances = [route_tuple[1] for route_tuple in remote_routes] + routes, utterances, metadata = map(list, zip(*remote_routes)) embeddings = self.encoder(utterances) self.index = LocalIndex() - self.index.add(embeddings=embeddings, routes=routes, utterances=utterances) + self.index.add( + embeddings=embeddings, + routes=routes, + utterances=utterances, + metadata_list=metadata, + ) # convert inputs into array Xq: List[List[float]] = [] diff --git a/semantic_router/route.py b/semantic_router/route.py index 3fc3f040..41fd0bf2 100644 --- a/semantic_router/route.py +++ b/semantic_router/route.py @@ -50,6 +50,7 @@ class Route(BaseModel): function_schemas: Optional[List[Dict[str, Any]]] = None llm: Optional[BaseLLM] = None score_threshold: Optional[float] = None + metadata: Optional[Dict[str, Any]] = {} class Config: arbitrary_types_allowed = True From 092c620c635cdb25e2bba3ee2094abcdd58f8414 Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:08:31 +0200 Subject: [PATCH 22/27] Formatting --- semantic_router/index/pinecone.py | 55 +++++++++++++++++++++++-------- semantic_router/layer.py | 7 +++- 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/semantic_router/index/pinecone.py b/semantic_router/index/pinecone.py index 0231f53f..174fb49a 100644 --- a/semantic_router/index/pinecone.py +++ b/semantic_router/index/pinecone.py @@ -222,7 +222,7 @@ def _sync_index( local_function_schemas_list: List[Dict[str, Any]], local_metadata_list: List[Dict[str, Any]], dimensions: int, - ) -> Tuple[List,List,Dict]: + ) -> Tuple[List, List, Dict]: if self.index is None: self.dimensions = self.dimensions or dimensions self.index = self._init_index(force_create=True) @@ -231,7 +231,11 @@ def _sync_index( # Create remote dictionary for storing utterances and metadata remote_dict: Dict[str, Dict[str, Any]] = { - route: {"utterances": set(), "function_schemas": function_schemas, "metadata": metadata} + route: { + "utterances": set(), + "function_schemas": function_schemas, + "metadata": metadata, + } for route, utterance, function_schemas, metadata in remote_routes } for route, utterance, function_schemas, metadata in remote_routes: @@ -240,10 +244,17 @@ def _sync_index( # Create local dictionary for storing utterances and metadata local_dict: Dict[str, Dict[str, Any]] = {} for route, utterance, function_schemas, metadata in zip( - local_route_names, local_utterances_list, local_function_schemas_list, local_metadata_list + local_route_names, + local_utterances_list, + local_function_schemas_list, + local_metadata_list, ): if route not in local_dict: - local_dict[route] = {"utterances": set(), "function_schemas": function_schemas, "metadata": metadata} + local_dict[route] = { + "utterances": set(), + "function_schemas": function_schemas, + "metadata": metadata, + } local_dict[route]["utterances"].add(utterance) local_dict[route]["function_schemas"] = function_schemas local_dict[route]["metadata"] = metadata @@ -257,8 +268,12 @@ def _sync_index( for route in all_routes: local_utterances = local_dict.get(route, {}).get("utterances", set()) remote_utterances = remote_dict.get(route, {}).get("utterances", set()) - local_function_schemas = local_dict.get(route, {}).get("function_schemas", {}) - remote_function_schemas = remote_dict.get(route, {}).get("function_schemas", {}) + local_function_schemas = local_dict.get(route, {}).get( + "function_schemas", {} + ) + remote_function_schemas = remote_dict.get(route, {}).get( + "function_schemas", {} + ) local_metadata = local_dict.get(route, {}).get("metadata", {}) remote_metadata = remote_dict.get(route, {}).get("metadata", {}) @@ -354,7 +369,10 @@ def _sync_index( if local_utterances or remote_utterances: # Here metadata are merged, with local metadata taking precedence for same keys merged_metadata = {**remote_metadata, **local_metadata} - merged_function_schemas = {**remote_function_schemas, **local_function_schemas} + merged_function_schemas = { + **remote_function_schemas, + **local_function_schemas, + } layer_routes[route] = { "utterances": list(remote_utterances.union(local_utterances)), "function_schemas": merged_function_schemas, @@ -365,15 +383,24 @@ def _sync_index( raise ValueError("Invalid sync mode specified") # Add utterances if metadata has changed or if there are new utterances - if (metadata_changed or function_schema_changed) and self.sync in ["local", "merge-force-local"]: + if (metadata_changed or function_schema_changed) and self.sync in [ + "local", + "merge-force-local", + ]: for utterance in local_utterances: - routes_to_add.append((route, utterance, local_function_schemas, local_metadata)) + routes_to_add.append( + (route, utterance, local_function_schemas, local_metadata) + ) if (metadata_changed or function_schema_changed) and self.sync == "merge": for utterance in local_utterances: - routes_to_add.append((route, utterance, merged_function_schemas, merged_metadata)) + routes_to_add.append( + (route, utterance, merged_function_schemas, merged_metadata) + ) elif utterances_to_include: for utterance in utterances_to_include: - routes_to_add.append((route, utterance, local_function_schemas, local_metadata)) + routes_to_add.append( + (route, utterance, local_function_schemas, local_metadata) + ) return routes_to_add, routes_to_delete, layer_routes @@ -407,7 +434,7 @@ def add( metadata=metadata, ).to_dict() for vector, route, utterance, function_schema, metadata in zip( - embeddings, routes, utterances, function_schemas, metadata_list # type: ignore + embeddings, routes, utterances, function_schemas, metadata_list # type: ignore ) ] @@ -496,7 +523,7 @@ def get_routes(self) -> List[Tuple]: ) for data in metadata ] - return route_tuples # type: ignore + return route_tuples # type: ignore def delete(self, route_name: str): route_vec_ids = self._get_route_ids(route_name=route_name) @@ -783,7 +810,7 @@ async def _async_get_routes(self) -> List[Tuple]: ) for data in metadata ] - return route_info # type: ignore + return route_info # type: ignore def __len__(self): return self.index.describe_index_stats()["total_vector_count"] diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 7baa27c8..7e385dc4 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -543,7 +543,12 @@ def _add_and_sync_routes(self, routes: List[Route]): # Update local route layer state self.routes = [ - Route(name=route, utterances=data.get("utterances", []), function_schemas=[data.get("function_schemas", None)], metadata=data.get("metadata", {})) + Route( + name=route, + utterances=data.get("utterances", []), + function_schemas=[data.get("function_schemas", None)], + metadata=data.get("metadata", {}), + ) for route, data in layer_routes_dict.items() ] From 6bd33a8ee9cd4e56af074b8d10f7aa11328dad1f Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:12:44 +0200 Subject: [PATCH 23/27] Linting --- semantic_router/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 7e385dc4..c868d29c 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -427,7 +427,7 @@ def add(self, route: Route): if route.function_schemas else [{}] * len(route.utterances) ), - metadata_list=[route.metadata] * len(route.utterances), + metadata_list=[route.metadata if route.metadata else {}] * len(route.utterances), ) self.routes.append(route) @@ -554,7 +554,7 @@ def _add_and_sync_routes(self, routes: List[Route]): def _extract_routes_details( self, routes: List[Route], include_metadata: bool = False - ) -> Tuple[list[str], list[str], List[Dict[str, Any]]]: + ) -> Tuple: route_names = [route.name for route in routes for _ in route.utterances] utterances = [utterance for route in routes for utterance in route.utterances] function_schemas = [ From 16891cb017b67cb19cc85f015d0f8388566037b9 Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:13:47 +0200 Subject: [PATCH 24/27] Formatting --- semantic_router/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index c868d29c..1a0dbf18 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -427,7 +427,8 @@ def add(self, route: Route): if route.function_schemas else [{}] * len(route.utterances) ), - metadata_list=[route.metadata if route.metadata else {}] * len(route.utterances), + metadata_list=[route.metadata if route.metadata else {}] + * len(route.utterances), ) self.routes.append(route) From 53e6e3190f9538562e2ebcdeee8e37d759804106 Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:24:27 +0200 Subject: [PATCH 25/27] Minor bugfix --- semantic_router/layer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 1a0dbf18..32c57a9f 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -475,7 +475,7 @@ def _add_routes(self, routes: List[Route]): logger.warning("No routes provided to add.") return # create embeddings for all routes - route_names, all_utterances, all_metadata = self._extract_routes_details( + route_names, all_utterances, all_function_schemas, all_metadata = self._extract_routes_details( routes, include_metadata=True ) embedded_utterances = self.encoder(all_utterances) @@ -485,6 +485,7 @@ def _add_routes(self, routes: List[Route]): embeddings=embedded_utterances, routes=route_names, utterances=all_utterances, + function_schemas=all_function_schemas, metadata_list=all_metadata, ) except Exception as e: From 1b8c6966b127ac6805083fe36f7462b06a5836ac Mon Sep 17 00:00:00 2001 From: Vits Date: Fri, 30 Aug 2024 18:26:25 +0200 Subject: [PATCH 26/27] Formatting --- semantic_router/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index 32c57a9f..0bf7d99f 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -475,8 +475,8 @@ def _add_routes(self, routes: List[Route]): logger.warning("No routes provided to add.") return # create embeddings for all routes - route_names, all_utterances, all_function_schemas, all_metadata = self._extract_routes_details( - routes, include_metadata=True + route_names, all_utterances, all_function_schemas, all_metadata = ( + self._extract_routes_details(routes, include_metadata=True) ) embedded_utterances = self.encoder(all_utterances) try: From 5a64c953f1c2e96d1301b5a377fb2794cd525c3f Mon Sep 17 00:00:00 2001 From: Vits Date: Mon, 2 Sep 2024 10:31:15 +0200 Subject: [PATCH 27/27] Fixed failing test "test_to_dict" --- tests/unit/test_route.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_route.py b/tests/unit/test_route.py index b21ffd87..fe202181 100644 --- a/tests/unit/test_route.py +++ b/tests/unit/test_route.py @@ -127,6 +127,7 @@ def test_to_dict(self): "function_schemas": None, "llm": None, "score_threshold": None, + "metadata": {}, } assert route.to_dict() == expected_dict