From f7159066f523323f0b6837de1ff5fc494d2d506a Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 13 Mar 2024 15:29:15 -0500 Subject: [PATCH 01/12] initial commit for parent-doc --- ai_ta_backend/service/retrieval_service.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index af425218..266f4a01 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -17,6 +17,7 @@ from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost +from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding class RetrievalService: @@ -69,6 +70,10 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) + # add parent doc retrieval here + parent_docs = context_parent_doc_padding(found_docs, search_query, course_name) + print(f"Number of final docs after context padding: {len(parent_docs)}") + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + # type: ignore @@ -76,7 +81,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = valid_docs = [] num_tokens = 0 - for doc in found_docs: + for doc in parent_docs: doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore From 46d5a5e61c0082be904c1adf89063bdc963e7457 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 13 Mar 2024 16:28:08 -0500 Subject: [PATCH 02/12] minor changes --- ai_ta_backend/service/export_service.py | 3 +++ ai_ta_backend/utils/context_parent_doc_padding.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index 6eb889c2..9d12c8c4 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -34,6 +34,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''): """ response = self.sql.getDocumentsBetweenDates(course_name, from_date, to_date, 'documents') + print("response count: ", response.count) # add a condition to route to direct download or s3 download if response.count > 1000: # call background task to upload to s3 @@ -108,6 +109,8 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path): course_name (str): The name of the course. 
s3_path (str): The S3 path where the file will be uploaded. """ + print("Exporting data in background...") + print(response) total_doc_count = response.count first_id = response.data[0]['id'] print("total_doc_count: ", total_doc_count) diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index fc0ba19c..1ce559d5 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -3,11 +3,14 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from multiprocessing import Manager +from ai_ta_backend.database.sql import SQLDatabase -DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] + +# DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], # supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore +SQL_DB = SQLDatabase def context_parent_doc_padding(found_docs, search_query, course_name): """ From 8dba0b1a3d6eec5b2f1453f8cc0510586f4be961 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 13 Mar 2024 17:43:47 -0500 Subject: [PATCH 03/12] added context padding in getTopContexts() --- ai_ta_backend/database/sql.py | 6 +-- ai_ta_backend/service/retrieval_service.py | 7 ++-- .../utils/context_parent_doc_padding.py | 38 +++++++++++-------- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index a9819657..b0ffb455 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -7,7 +7,7 @@ class SQLDatabase: @inject - def __init__(self, db_url: str): + def __init__(self): # Create a Supabase client self.supabase_client = supabase.create_client( # type: ignore supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) @@ -18,11 +18,11 @@ def getAllMaterialsForCourse(self, course_name: str): 'course_name', course_name).execute() def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str): - return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq( + return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, readable_filename, base_url, url, contexts").eq( 's3_path', s3_path).eq('course_name', course_name).execute() def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): - return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq( + return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, readable_filename, base_url, url, contexts").eq( key, value).eq('course_name', course_name).execute() def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 266f4a01..95d8d6aa 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -71,6 +71,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) # add parent doc retrieval here + print(f"Number of docs retrieved: {len(found_docs)}") parent_docs = context_parent_doc_padding(found_docs, search_query, course_name) print(f"Number of final docs after context padding: 
{len(parent_docs)}") @@ -82,11 +83,11 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = valid_docs = [] num_tokens = 0 for doc in parent_docs: - doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" + doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore print( - f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {doc.metadata['readable_filename']}" + f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {doc['readable_filename']}" ) if token_counter + num_tokens <= token_limit: token_counter += num_tokens @@ -114,7 +115,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = }, ) - return self.format_for_json(valid_docs) + return self.format_for_json_mqr(valid_docs) except Exception as e: # return full traceback to front end # err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index 1ce559d5..892cddc9 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -10,7 +10,7 @@ # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], # supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore -SQL_DB = SQLDatabase +SQL_DB = SQLDatabase() def context_parent_doc_padding(found_docs, search_query, course_name): """ @@ -18,7 +18,7 @@ def context_parent_doc_padding(found_docs, search_query, course_name): """ print("inside main context padding") start_time = time.monotonic() - + with Manager() as manager: qdrant_contexts = manager.list() supabase_contexts = manager.list() @@ -44,50 +44,54 @@ def context_parent_doc_padding(found_docs, search_query, course_name): def qdrant_context_processing(doc, course_name, result_contexts): """ Re-factor QDRANT objects into Supabase objects and append to result_docs - """ + """ context_dict = { 'text': doc.page_content, 'embedding': '', 'pagenumber': doc.metadata['pagenumber'], 'readable_filename': doc.metadata['readable_filename'], 'course_name': course_name, - 's3_path': doc.metadata['s3_path'], - 'base_url': doc.metadata['base_url'] + 's3_path': doc.metadata['s3_path'] } if 'url' in doc.metadata.keys(): context_dict['url'] = doc.metadata['url'] else: context_dict['url'] = '' + if 'base_url' in doc.metadata.keys(): + context_dict['base_url'] = doc.metadata['url'] + else: + context_dict['base_url'] = '' + result_contexts.append(context_dict) - return result_contexts + #return result_contexts def supabase_context_padding(doc, course_name, result_docs): """ Does context padding for given doc. 
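Looks up the parent document in Supabase (by url, falling back to s3_path) and appends surrounding chunks to result_docs: chunk_index plus/minus 3 when chunk indices exist, otherwise all chunks on the same page, otherwise the chunk itself reshaped as a Supabase-style dict.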
""" - + # query by url or s3_path if 'url' in doc.metadata.keys() and doc.metadata['url']: parent_doc_id = doc.metadata['url'] - response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', - course_name).eq('url', parent_doc_id).execute() - + # response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', + # course_name).eq('url', parent_doc_id).execute() + response = SQL_DB.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) else: parent_doc_id = doc.metadata['s3_path'] - response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', - course_name).eq('s3_path', - parent_doc_id).execute() - + # response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', + # course_name).eq('s3_path', + # parent_doc_id).execute() + response = SQL_DB.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) data = response.data - + if len(data) > 0: # do the padding filename = data[0]['readable_filename'] contexts = data[0]['contexts'] #print("no of contexts within the og doc: ", len(contexts)) - + if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): #print("inside chunk index") # pad contexts by chunk index + 3 and - 3 @@ -135,3 +139,5 @@ def supabase_context_padding(doc, course_name, result_docs): context_dict['url'] = '' result_docs.append(context_dict) + + From af30a812b1bbed8561402aa4dff7f5607fadd142 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 18 Mar 2024 17:22:25 -0500 Subject: [PATCH 04/12] updated and switched parent doc to custom process pool --- ai_ta_backend/executors/process_pool_executor.py | 7 +++++++ ai_ta_backend/utils/context_parent_doc_padding.py | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/executors/process_pool_executor.py b/ai_ta_backend/executors/process_pool_executor.py index 81b4860c..b981d613 100644 --- a/ai_ta_backend/executors/process_pool_executor.py +++ b/ai_ta_backend/executors/process_pool_executor.py @@ -29,3 +29,10 @@ def submit(self, fn, *args, **kwargs): def map(self, fn, *iterables, timeout=None, chunksize=1): return self.executor.map(fn, *iterables, timeout=timeout, chunksize=chunksize) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.executor.shutdown(wait=True) + diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index 892cddc9..ed1d4988 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -4,6 +4,7 @@ from functools import partial from multiprocessing import Manager from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.executors.process_pool_executor import ProcessPoolExecutorAdapter # DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] @@ -12,12 +13,14 @@ SQL_DB = SQLDatabase() + def context_parent_doc_padding(found_docs, search_query, course_name): """ Takes top N contexts acquired from QRANT similarity search and pads them """ print("inside main context padding") start_time = time.monotonic() + #executor = ProcessPoolExecutorAdapter(max_workers=10) with Manager() as manager: qdrant_contexts = manager.list() @@ -25,7 +28,11 @@ def context_parent_doc_padding(found_docs, search_query, course_name): partial_func1 = partial(qdrant_context_processing, course_name=course_name, result_contexts=qdrant_contexts) partial_func2 = partial(supabase_context_padding, 
course_name=course_name, result_docs=supabase_contexts) - with ProcessPoolExecutor() as executor: + # with ProcessPoolExecutor() as executor: + # executor.map(partial_func1, found_docs[5:]) + # executor.map(partial_func2, found_docs[:5]) + + with ProcessPoolExecutorAdapter() as executor: executor.map(partial_func1, found_docs[5:]) executor.map(partial_func2, found_docs[:5]) From 9b9594170056685d2a05eed3d73009dcf3cb4c3f Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 18 Mar 2024 17:24:33 -0500 Subject: [PATCH 05/12] moved sql db object inside supabase padding fn --- ai_ta_backend/utils/context_parent_doc_padding.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index ed1d4988..163dab51 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -1,6 +1,6 @@ -import os +#import os import time -from concurrent.futures import ProcessPoolExecutor +#from concurrent.futures import ProcessPoolExecutor from functools import partial from multiprocessing import Manager from ai_ta_backend.database.sql import SQLDatabase @@ -11,8 +11,6 @@ # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], # supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore -SQL_DB = SQLDatabase() - def context_parent_doc_padding(found_docs, search_query, course_name): """ @@ -77,8 +75,9 @@ def qdrant_context_processing(doc, course_name, result_contexts): def supabase_context_padding(doc, course_name, result_docs): """ Does context padding for given doc. - """ - + """ + SQL_DB = SQLDatabase() + # query by url or s3_path if 'url' in doc.metadata.keys() and doc.metadata['url']: parent_doc_id = doc.metadata['url'] From e36910e9971406c1612f3b31ae59be46efa01cc5 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 19 Mar 2024 15:47:14 -0500 Subject: [PATCH 06/12] testing changes --- ai_ta_backend/service/retrieval_service.py | 140 +++++++++++++++++- .../utils/context_parent_doc_padding.py | 1 - 2 files changed, 137 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 95d8d6aa..9e8daf7a 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -3,6 +3,8 @@ import time import traceback from typing import Dict, List, Union +from functools import partial +from multiprocessing import Manager import openai from injector import inject @@ -17,7 +19,9 @@ from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost -from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding +#from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding +from ai_ta_backend.executors.process_pool_executor import ProcessPoolExecutorAdapter + class RetrievalService: @@ -27,13 +31,14 @@ class RetrievalService: @inject def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, posthog: PosthogService, - sentry: SentryService, nomicService: NomicService): + sentry: SentryService, nomicService: NomicService, process_executor: ProcessPoolExecutorAdapter): self.vdb = vdb self.sqlDb = sqlDb self.aws = aws self.sentry = sentry self.posthog = posthog self.nomicService = nomicService + self.process_executor = 
process_executor openai.api_key = os.environ["OPENAI_API_KEY"] @@ -72,7 +77,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = # add parent doc retrieval here print(f"Number of docs retrieved: {len(found_docs)}") - parent_docs = context_parent_doc_padding(found_docs, search_query, course_name) + parent_docs = self.context_parent_doc_padding(found_docs, course_name) print(f"Number of final docs after context padding: {len(parent_docs)}") pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" @@ -422,3 +427,132 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: ] return contexts + + def context_parent_doc_padding(self, found_docs, course_name): + """ + Takes top N contexts acquired from QRANT similarity search and pads them + """ + print("inside main context padding") + start_time = time.monotonic() + + + with Manager() as manager: + qdrant_contexts = manager.list() + supabase_contexts = manager.list() + partial_func1 = partial(self.qdrant_context_processing, course_name=course_name, result_contexts=qdrant_contexts) + partial_func2 = partial(self.supabase_context_padding, course_name=course_name, result_docs=supabase_contexts) + + # with ProcessPoolExecutor() as executor: + # executor.map(partial_func1, found_docs[5:]) + # executor.map(partial_func2, found_docs[:5]) + try: + with self.process_executor as executor: + futures1 = executor.map(partial_func1, found_docs[5:]) + futures2 = executor.map(partial_func2, found_docs[:5]) + for future in futures2: + try: + print(f"future1: {future.result()}") + except Exception as e: + print(f"Error in future1: {e}") + except Exception as e: + print("Error in process_pool_executor", e) + + supabase_contexts_no_duplicates = [] + for context in supabase_contexts: + if context not in supabase_contexts_no_duplicates: + supabase_contexts_no_duplicates.append(context) + + result_contexts = supabase_contexts_no_duplicates + list(qdrant_contexts) + + print(f"⏰ Context padding runtime: {(time.monotonic() - start_time):.2f} seconds") + + return result_contexts + + def qdrant_context_processing(self, doc, course_name, result_contexts): + """ + Re-factor QDRANT objects into Supabase objects and append to result_docs + """ + print("inside qdrant_context_processing") + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'] + } + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + if 'base_url' in doc.metadata.keys(): + context_dict['base_url'] = doc.metadata['url'] + else: + context_dict['base_url'] = '' + + result_contexts.append(context_dict) + + def supabase_context_padding(self, doc, course_name, result_docs): + """ + Fetches surrounding contexts from supabase and appends them to retrieved contexts. 
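+ Uses the injected self.sqlDb client (getMaterialsForCourseAndKeyAndValue / getMaterialsForCourseAndS3Path) rather than opening its own Supabase connection.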
+ """ + print("inside supabase_context_padding") + # query by url or s3_path + if 'url' in doc.metadata.keys() and doc.metadata['url']: + parent_doc_id = doc.metadata['url'] + response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) + else: + parent_doc_id = doc.metadata['s3_path'] + response = self.sqlDb.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) + data = response.data + + if len(data) > 0: + # do the padding + filename = data[0]['readable_filename'] + contexts = data[0]['contexts'] + + if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): + # pad contexts by chunk index + 3 and - 3 + target_chunk_index = doc.metadata['chunk_index'] + for context in contexts: + curr_chunk_index = context['chunk_index'] + if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) + + elif doc.metadata['pagenumber'] != '': + # pad contexts belonging to same page number + pagenumber = doc.metadata['pagenumber'] + + for context in contexts: + if int(context['pagenumber']) == pagenumber: + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) + + else: + # refactor as a Supabase object and append + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'], + 'base_url': doc.metadata['base_url'] + } + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + result_docs.append(context_dict) + diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index 163dab51..a9fefd16 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -69,7 +69,6 @@ def qdrant_context_processing(doc, course_name, result_contexts): context_dict['base_url'] = '' result_contexts.append(context_dict) - #return result_contexts def supabase_context_padding(doc, course_name, result_docs): From 9efa91ecf8760b6bad1ee47940cf55eb151eb9e6 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 20 Mar 2024 15:40:18 -0500 Subject: [PATCH 07/12] added lock --- ai_ta_backend/service/retrieval_service.py | 167 +++++++++++---------- 1 file changed, 86 insertions(+), 81 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 9e8daf7a..296ed908 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -5,6 +5,8 @@ from typing import Dict, List, Union from functools import partial from multiprocessing import Manager +from multiprocessing import Lock + import openai from injector import inject @@ -431,7 +433,7 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: def context_parent_doc_padding(self, found_docs, course_name): """ Takes top N contexts acquired from QRANT similarity search and pads them - """ + """ print("inside main context padding") 
start_time = time.monotonic() @@ -447,8 +449,9 @@ def context_parent_doc_padding(self, found_docs, course_name): # executor.map(partial_func2, found_docs[:5]) try: with self.process_executor as executor: - futures1 = executor.map(partial_func1, found_docs[5:]) - futures2 = executor.map(partial_func2, found_docs[:5]) + lock = Lock() + futures1 = executor.map(partial_func1, found_docs[5:], [lock] * len(found_docs[5:])) + futures2 = executor.map(partial_func2, found_docs[:5], [lock] * len(found_docs[:5])) for future in futures2: try: print(f"future1: {future.result()}") @@ -456,7 +459,7 @@ def context_parent_doc_padding(self, found_docs, course_name): print(f"Error in future1: {e}") except Exception as e: print("Error in process_pool_executor", e) - + supabase_contexts_no_duplicates = [] for context in supabase_contexts: if context not in supabase_contexts_no_duplicates: @@ -468,91 +471,93 @@ def context_parent_doc_padding(self, found_docs, course_name): return result_contexts - def qdrant_context_processing(self, doc, course_name, result_contexts): + def qdrant_context_processing(self, doc, course_name, result_contexts, lock): """ Re-factor QDRANT objects into Supabase objects and append to result_docs """ - print("inside qdrant_context_processing") - context_dict = { - 'text': doc.page_content, - 'embedding': '', - 'pagenumber': doc.metadata['pagenumber'], - 'readable_filename': doc.metadata['readable_filename'], - 'course_name': course_name, - 's3_path': doc.metadata['s3_path'] - } - if 'url' in doc.metadata.keys(): - context_dict['url'] = doc.metadata['url'] - else: - context_dict['url'] = '' - - if 'base_url' in doc.metadata.keys(): - context_dict['base_url'] = doc.metadata['url'] - else: - context_dict['base_url'] = '' - - result_contexts.append(context_dict) + with lock: + print("inside qdrant_context_processing") + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'] + } + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + if 'base_url' in doc.metadata.keys(): + context_dict['base_url'] = doc.metadata['url'] + else: + context_dict['base_url'] = '' + + result_contexts.append(context_dict) - def supabase_context_padding(self, doc, course_name, result_docs): + def supabase_context_padding(self, doc, course_name, result_docs, lock): """ Fetches surrounding contexts from supabase and appends them to retrieved contexts. 
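The whole body runs under the multiprocessing Lock passed in from context_parent_doc_padding, which is meant to serialise the Supabase lookups and result_docs appends across pool workers.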
""" - print("inside supabase_context_padding") - # query by url or s3_path - if 'url' in doc.metadata.keys() and doc.metadata['url']: - parent_doc_id = doc.metadata['url'] - response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) - else: - parent_doc_id = doc.metadata['s3_path'] - response = self.sqlDb.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) - data = response.data - - if len(data) > 0: - # do the padding - filename = data[0]['readable_filename'] - contexts = data[0]['contexts'] + with lock: + print("inside supabase_context_padding") + # query by url or s3_path + if 'url' in doc.metadata.keys() and doc.metadata['url']: + parent_doc_id = doc.metadata['url'] + response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) + else: + parent_doc_id = doc.metadata['s3_path'] + response = self.sqlDb.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) + data = response.data - if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): - # pad contexts by chunk index + 3 and - 3 - target_chunk_index = doc.metadata['chunk_index'] - for context in contexts: - curr_chunk_index = context['chunk_index'] - if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): - context['readable_filename'] = filename - context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] - result_docs.append(context) - - elif doc.metadata['pagenumber'] != '': - # pad contexts belonging to same page number - pagenumber = doc.metadata['pagenumber'] - - for context in contexts: - if int(context['pagenumber']) == pagenumber: - context['readable_filename'] = filename - context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] - result_docs.append(context) + if len(data) > 0: + # do the padding + filename = data[0]['readable_filename'] + contexts = data[0]['contexts'] + + if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): + # pad contexts by chunk index + 3 and - 3 + target_chunk_index = doc.metadata['chunk_index'] + for context in contexts: + curr_chunk_index = context['chunk_index'] + if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) + + elif doc.metadata['pagenumber'] != '': + # pad contexts belonging to same page number + pagenumber = doc.metadata['pagenumber'] + + for context in contexts: + if int(context['pagenumber']) == pagenumber: + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) - else: - # refactor as a Supabase object and append - context_dict = { - 'text': doc.page_content, - 'embedding': '', - 'pagenumber': doc.metadata['pagenumber'], - 'readable_filename': doc.metadata['readable_filename'], - 'course_name': course_name, - 's3_path': doc.metadata['s3_path'], - 'base_url': doc.metadata['base_url'] - } - if 'url' in doc.metadata.keys(): - context_dict['url'] = 
doc.metadata['url'] else: - context_dict['url'] = '' - - result_docs.append(context_dict) + # refactor as a Supabase object and append + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'], + 'base_url': doc.metadata['base_url'] + } + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + result_docs.append(context_dict) From b766deebc193f2ff6c165b596873c018a89e157d Mon Sep 17 00:00:00 2001 From: star-nox Date: Fri, 19 Apr 2024 12:10:10 -0500 Subject: [PATCH 08/12] moved context padding to retreival_service.py --- ai_ta_backend/service/retrieval_service.py | 241 ++++++++++++++++++++- 1 file changed, 235 insertions(+), 6 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index d6cde2eb..30059c0b 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -21,10 +21,10 @@ from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost -from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding -#from ai_ta_backend.executors.process_pool_executor import ProcessPoolExecutorAdapter - - +#from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding +from ai_ta_backend.executors.process_pool_executor import ProcessPoolExecutorAdapter +from functools import partial +from multiprocessing import Manager class RetrievalService: """ @@ -33,13 +33,14 @@ class RetrievalService: @inject def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, posthog: PosthogService, - sentry: SentryService, nomicService: NomicService): + sentry: SentryService, nomicService: NomicService, executor: ProcessPoolExecutorAdapter): self.vdb = vdb self.sqlDb = sqlDb self.aws = aws self.sentry = sentry self.posthog = posthog self.nomicService = nomicService + self.executor = executor openai.api_key = os.environ["OPENAI_API_KEY"] @@ -86,7 +87,7 @@ def getTopContexts(self, # add parent doc retrieval here print(f"Number of docs retrieved: {len(found_docs)}") - parent_docs = context_parent_doc_padding(found_docs, course_name) + parent_docs = self.context_parent_doc_padding(found_docs, course_name) print(f"Number of final docs after context padding: {len(parent_docs)}") pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" @@ -439,5 +440,233 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: ] return contexts + + def context_parent_doc_padding(self, found_docs, course_name): + """ + Takes top N contexts acquired from QRANT similarity search and pads them + """ + print("inside main context padding") + start_time = time.monotonic() + + with Manager() as manager: + qdrant_contexts = manager.list() + supabase_contexts = manager.list() + partial_func1 = partial(qdrant_context_processing, course_name=course_name, result_contexts=qdrant_contexts) + partial_func2 = partial(supabase_context_padding, course_name=course_name, result_docs=supabase_contexts) + + with self.executor as executor: + executor.map(partial_func1, found_docs[5:]) + executor.map(partial_func2, found_docs[:5]) + + # with self.executor as executor: + # executor.map(lambda doc: self.qdrant_context_processing(doc, course_name=course_name, result_contexts=[]), found_docs[5:]) + # executor.map(lambda doc: self.supabase_context_padding(doc, course_name=course_name, result_docs=[]), found_docs[:5]) + + + supabase_contexts_no_duplicates = [] + for context in supabase_contexts: + if context not in supabase_contexts_no_duplicates: + supabase_contexts_no_duplicates.append(context) + + result_contexts = supabase_contexts_no_duplicates + list(qdrant_contexts) + #print("len of supabase contexts: ", len(supabase_contexts_no_duplicates)) + + print(f"⏰ Context padding runtime: {(time.monotonic() - start_time):.2f} seconds") + + return result_contexts + + # def qdrant_context_processing(self, doc, course_name, result_contexts): + # """ + # Re-factor QDRANT objects into Supabase objects and append to result_docs + # """ + # print("inside qdrant context processing") + # context_dict = { + # 'text': doc.page_content, + # 'embedding': '', + # 'pagenumber': doc.metadata['pagenumber'], + # 'readable_filename': doc.metadata['readable_filename'], + # 'course_name': course_name, + # 's3_path': doc.metadata['s3_path'] + # } + + # if 'url' in doc.metadata.keys(): + # context_dict['url'] = doc.metadata['url'] + # else: + # context_dict['url'] = '' + + # if 'base_url' in doc.metadata.keys(): + # context_dict['base_url'] = doc.metadata['url'] + # else: + # context_dict['base_url'] = '' + + # result_contexts.append(context_dict) + + # def supabase_context_padding(self, doc, course_name, result_docs): + # """ + # Does context padding for given doc. 
+ # """ + # print("inside supabase context padding") + # SQL_DB = SQLDatabase() + + # # query by url or s3_path + # if 'url' in doc.metadata.keys() and doc.metadata['url']: + # parent_doc_id = doc.metadata['url'] + # response = SQL_DB.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) + # else: + # parent_doc_id = doc.metadata['s3_path'] + # response = SQL_DB.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) + + # data = response.data + + # if len(data) > 0: + # # do the padding + # filename = data[0]['readable_filename'] + # contexts = data[0]['contexts'] + # #print("no of contexts within the og doc: ", len(contexts)) + + # if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): + # #print("inside chunk index") + # # pad contexts by chunk index + 3 and - 3 + # target_chunk_index = doc.metadata['chunk_index'] + # for context in contexts: + # curr_chunk_index = context['chunk_index'] + # if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): + # context['readable_filename'] = filename + # context['course_name'] = course_name + # context['s3_path'] = data[0]['s3_path'] + # context['url'] = data[0]['url'] + # context['base_url'] = data[0]['base_url'] + # result_docs.append(context) + + # elif doc.metadata['pagenumber'] != '': + # #print("inside page number") + # # pad contexts belonging to same page number + # pagenumber = doc.metadata['pagenumber'] + + # for context in contexts: + # # pad contexts belonging to same page number + # if int(context['pagenumber']) == pagenumber: + # context['readable_filename'] = filename + # context['course_name'] = course_name + # context['s3_path'] = data[0]['s3_path'] + # context['url'] = data[0]['url'] + # context['base_url'] = data[0]['base_url'] + # result_docs.append(context) + + # else: + # #print("inside else") + # # refactor as a Supabase object and append + # context_dict = { + # 'text': doc.page_content, + # 'embedding': '', + # 'pagenumber': doc.metadata['pagenumber'], + # 'readable_filename': doc.metadata['readable_filename'], + # 'course_name': course_name, + # 's3_path': doc.metadata['s3_path'], + # 'base_url': doc.metadata['base_url'] + # } + # if 'url' in doc.metadata.keys(): + # context_dict['url'] = doc.metadata['url'] + # else: + # context_dict['url'] = '' + + # result_docs.append(context_dict) + +def qdrant_context_processing(doc, course_name, result_contexts): + """ + Re-factor QDRANT objects into Supabase objects and append to result_docs + """ + #print("inside qdrant context processing") + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'] + } + + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + if 'base_url' in doc.metadata.keys(): + context_dict['base_url'] = doc.metadata['url'] + else: + context_dict['base_url'] = '' + + result_contexts.append(context_dict) + +def supabase_context_padding(doc, course_name, result_docs): + """ + Does context padding for given doc. 
+ """ + #print("inside supabase context padding") + SQL_DB = SQLDatabase() + + # query by url or s3_path + if 'url' in doc.metadata.keys() and doc.metadata['url']: + parent_doc_id = doc.metadata['url'] + response = SQL_DB.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) + else: + parent_doc_id = doc.metadata['s3_path'] + response = SQL_DB.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) + + data = response.data + + if len(data) > 0: + # do the padding + filename = data[0]['readable_filename'] + contexts = data[0]['contexts'] + #print("no of contexts within the og doc: ", len(contexts)) + + if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): + #print("inside chunk index") + # pad contexts by chunk index + 3 and - 3 + target_chunk_index = doc.metadata['chunk_index'] + for context in contexts: + curr_chunk_index = context['chunk_index'] + if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) + + elif doc.metadata['pagenumber'] != '': + #print("inside page number") + # pad contexts belonging to same page number + pagenumber = doc.metadata['pagenumber'] + + for context in contexts: + # pad contexts belonging to same page number + if int(context['pagenumber']) == pagenumber: + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_docs.append(context) + + else: + #print("inside else") + # refactor as a Supabase object and append + context_dict = { + 'text': doc.page_content, + 'embedding': '', + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'], + 'base_url': doc.metadata['base_url'] + } + if 'url' in doc.metadata.keys(): + context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + + result_docs.append(context_dict) \ No newline at end of file From 92ad1e282868d459fbf2d3e355f2e32f9df6ae8d Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 30 Apr 2024 15:30:25 -0500 Subject: [PATCH 09/12] modified context padding logic --- ai_ta_backend/database/sql.py | 5 + ai_ta_backend/service/retrieval_service.py | 191 +++++---------------- 2 files changed, 51 insertions(+), 145 deletions(-) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index 2f8c6a38..f5739fef 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -110,3 +110,8 @@ def updateProjects(self, course_name: str, data: dict): def getConversation(self, course_name: str, key: str, value: str): return self.supabase_client.table("llm-convo-monitor").select("*").eq(key, value).eq("course_name", course_name).execute() + def getDocsByURLs(self, course_name: str, urls: list): + return self.supabase_client.table("documents").select("*").eq("course_name", course_name).in_("url", urls).execute() + + def getDocsByS3Paths(self, course_name: str, s3_paths: list): + return self.supabase_client.table("documents").select("*").eq("course_name", course_name).in_("s3_path", s3_paths).execute() \ No newline at end of file diff --git a/ai_ta_backend/service/retrieval_service.py 
b/ai_ta_backend/service/retrieval_service.py index e33b18f7..95bbd810 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -21,7 +21,6 @@ from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost -#from ai_ta_backend.utils.context_parent_doc_padding import context_parent_doc_padding from ai_ta_backend.executors.process_pool_executor import ProcessPoolExecutorAdapter from functools import partial from multiprocessing import Manager @@ -59,7 +58,7 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, pos openai_api_key=os.environ["AZURE_OPENAI_KEY"], openai_api_version=os.environ["OPENAI_API_VERSION"], openai_api_type=os.environ['OPENAI_API_TYPE'], - ) + ) # type: ignore def getTopContexts(self, search_query: str, @@ -130,7 +129,7 @@ def getTopContexts(self, }, ) - return self.format_for_json_mqr(valid_docs) + return valid_docs except Exception as e: # return full traceback to front end # err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore @@ -486,130 +485,41 @@ def context_parent_doc_padding(self, found_docs, course_name): """ print("inside main context padding") start_time = time.monotonic() - + + # form a list of urls and s3_paths + urls = [] + s3_paths = [] + for doc in found_docs[:5]: + if 'url' in doc.metadata.keys(): + urls.append(doc.metadata['url']) + elif 's3_path' in doc.metadata.keys(): + s3_paths.append(doc.metadata['s3_path']) + + # query Supabase + supabase_url_content = self.sqlDb.getDocsByURLs(course_name, urls) + supabase_s3_content = self.sqlDb.getDocsByS3Paths(course_name, s3_paths) + supabase_data = supabase_url_content.data + supabase_s3_content.data + with Manager() as manager: qdrant_contexts = manager.list() supabase_contexts = manager.list() partial_func1 = partial(qdrant_context_processing, course_name=course_name, result_contexts=qdrant_contexts) - partial_func2 = partial(supabase_context_padding, course_name=course_name, result_docs=supabase_contexts) + partial_func2 = partial(supabase_context_padding, course_name=course_name, sql_data=supabase_data, result_docs=supabase_contexts) with self.executor as executor: executor.map(partial_func1, found_docs[5:]) executor.map(partial_func2, found_docs[:5]) - - # with self.executor as executor: - # executor.map(lambda doc: self.qdrant_context_processing(doc, course_name=course_name, result_contexts=[]), found_docs[5:]) - # executor.map(lambda doc: self.supabase_context_padding(doc, course_name=course_name, result_docs=[]), found_docs[:5]) - - + supabase_contexts_no_duplicates = [] for context in supabase_contexts: if context not in supabase_contexts_no_duplicates: supabase_contexts_no_duplicates.append(context) result_contexts = supabase_contexts_no_duplicates + list(qdrant_contexts) - #print("len of supabase contexts: ", len(supabase_contexts_no_duplicates)) - + print("len of supabase contexts: ", len(supabase_contexts_no_duplicates)) print(f"⏰ Context padding runtime: {(time.monotonic() - start_time):.2f} seconds") - return result_contexts - - # def qdrant_context_processing(self, doc, course_name, result_contexts): - # """ - # Re-factor QDRANT objects into Supabase objects and append to result_docs - # """ - # print("inside qdrant 
context processing") - # context_dict = { - # 'text': doc.page_content, - # 'embedding': '', - # 'pagenumber': doc.metadata['pagenumber'], - # 'readable_filename': doc.metadata['readable_filename'], - # 'course_name': course_name, - # 's3_path': doc.metadata['s3_path'] - # } - - # if 'url' in doc.metadata.keys(): - # context_dict['url'] = doc.metadata['url'] - # else: - # context_dict['url'] = '' - - # if 'base_url' in doc.metadata.keys(): - # context_dict['base_url'] = doc.metadata['url'] - # else: - # context_dict['base_url'] = '' - - # result_contexts.append(context_dict) - - # def supabase_context_padding(self, doc, course_name, result_docs): - # """ - # Does context padding for given doc. - # """ - # print("inside supabase context padding") - # SQL_DB = SQLDatabase() - - # # query by url or s3_path - # if 'url' in doc.metadata.keys() and doc.metadata['url']: - # parent_doc_id = doc.metadata['url'] - # response = SQL_DB.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) - # else: - # parent_doc_id = doc.metadata['s3_path'] - # response = SQL_DB.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) - - # data = response.data - - # if len(data) > 0: - # # do the padding - # filename = data[0]['readable_filename'] - # contexts = data[0]['contexts'] - # #print("no of contexts within the og doc: ", len(contexts)) - - # if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): - # #print("inside chunk index") - # # pad contexts by chunk index + 3 and - 3 - # target_chunk_index = doc.metadata['chunk_index'] - # for context in contexts: - # curr_chunk_index = context['chunk_index'] - # if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): - # context['readable_filename'] = filename - # context['course_name'] = course_name - # context['s3_path'] = data[0]['s3_path'] - # context['url'] = data[0]['url'] - # context['base_url'] = data[0]['base_url'] - # result_docs.append(context) - - # elif doc.metadata['pagenumber'] != '': - # #print("inside page number") - # # pad contexts belonging to same page number - # pagenumber = doc.metadata['pagenumber'] - - # for context in contexts: - # # pad contexts belonging to same page number - # if int(context['pagenumber']) == pagenumber: - # context['readable_filename'] = filename - # context['course_name'] = course_name - # context['s3_path'] = data[0]['s3_path'] - # context['url'] = data[0]['url'] - # context['base_url'] = data[0]['base_url'] - # result_docs.append(context) - - # else: - # #print("inside else") - # # refactor as a Supabase object and append - # context_dict = { - # 'text': doc.page_content, - # 'embedding': '', - # 'pagenumber': doc.metadata['pagenumber'], - # 'readable_filename': doc.metadata['readable_filename'], - # 'course_name': course_name, - # 's3_path': doc.metadata['s3_path'], - # 'base_url': doc.metadata['base_url'] - # } - # if 'url' in doc.metadata.keys(): - # context_dict['url'] = doc.metadata['url'] - # else: - # context_dict['url'] = '' - - # result_docs.append(context_dict) + def qdrant_context_processing(doc, course_name, result_contexts): """ @@ -637,31 +547,24 @@ def qdrant_context_processing(doc, course_name, result_contexts): result_contexts.append(context_dict) -def supabase_context_padding(doc, course_name, result_docs): +def supabase_context_padding(doc, course_name, sql_data, result_docs): """ Does context padding for given doc. 
- """ - #print("inside supabase context padding") - SQL_DB = SQLDatabase() - - # query by url or s3_path - if 'url' in doc.metadata.keys() and doc.metadata['url']: - parent_doc_id = doc.metadata['url'] - response = SQL_DB.getMaterialsForCourseAndKeyAndValue(course_name=course_name, key='url', value=parent_doc_id) + """ + # search the document in sql_data + if 'url' in doc.metadata.keys(): + supabase_doc = next((item for item in sql_data if item['url'] == doc.metadata['url']), None) + elif 's3_path' in doc.metadata.keys(): + supabase_doc = next((item for item in sql_data if item['s3_path'] == doc.metadata['s3_path']), None) else: - parent_doc_id = doc.metadata['s3_path'] - response = SQL_DB.getMaterialsForCourseAndS3Path(course_name=course_name, s3_path=parent_doc_id) - - data = response.data - - if len(data) > 0: - # do the padding - filename = data[0]['readable_filename'] - contexts = data[0]['contexts'] - #print("no of contexts within the og doc: ", len(contexts)) - + supabase_doc = None + + # create a dictionary + if supabase_doc: + contexts = supabase_doc['contexts'] + filename = supabase_doc['readable_filename'] + if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys(): - #print("inside chunk index") # pad contexts by chunk index + 3 and - 3 target_chunk_index = doc.metadata['chunk_index'] for context in contexts: @@ -669,13 +572,13 @@ def supabase_context_padding(doc, course_name, result_docs): if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3): context['readable_filename'] = filename context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] + context['s3_path'] = supabase_doc['s3_path'] + context['url'] = supabase_doc['url'] + context['base_url'] = supabase_doc['base_url'] + context.pop('embedding', None) result_docs.append(context) - + elif doc.metadata['pagenumber'] != '': - #print("inside page number") # pad contexts belonging to same page number pagenumber = doc.metadata['pagenumber'] @@ -684,17 +587,16 @@ def supabase_context_padding(doc, course_name, result_docs): if int(context['pagenumber']) == pagenumber: context['readable_filename'] = filename context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] + context['s3_path'] = supabase_doc['s3_path'] + context['url'] = supabase_doc['url'] + context['base_url'] = supabase_doc['base_url'] + context.pop('embedding', None) result_docs.append(context) - + else: - #print("inside else") # refactor as a Supabase object and append context_dict = { 'text': doc.page_content, - 'embedding': '', 'pagenumber': doc.metadata['pagenumber'], 'readable_filename': doc.metadata['readable_filename'], 'course_name': course_name, @@ -707,5 +609,4 @@ def supabase_context_padding(doc, course_name, result_docs): context_dict['url'] = '' result_docs.append(context_dict) - - \ No newline at end of file + From 56a6e2af25c61b4406e94ce083af5284eb4165b2 Mon Sep 17 00:00:00 2001 From: star-nox Date: Tue, 30 Apr 2024 17:43:57 -0500 Subject: [PATCH 10/12] updated deduplication logic in context padding --- ai_ta_backend/service/retrieval_service.py | 64 +++++++--------------- ai_ta_backend/service/sentry_service.py | 2 + 2 files changed, 23 insertions(+), 43 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 95bbd810..03b17114 100644 --- 
a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -491,14 +491,14 @@ def context_parent_doc_padding(self, found_docs, course_name): s3_paths = [] for doc in found_docs[:5]: if 'url' in doc.metadata.keys(): - urls.append(doc.metadata['url']) + urls.append(doc.metadata.get('url')) elif 's3_path' in doc.metadata.keys(): - s3_paths.append(doc.metadata['s3_path']) + s3_paths.append(doc.metadata.get('s3_path')) # query Supabase - supabase_url_content = self.sqlDb.getDocsByURLs(course_name, urls) - supabase_s3_content = self.sqlDb.getDocsByS3Paths(course_name, s3_paths) - supabase_data = supabase_url_content.data + supabase_s3_content.data + supabase_url_content = self.sqlDb.getDocsByURLs(course_name, urls).data if urls else [] + supabase_s3_content = self.sqlDb.getDocsByS3Paths(course_name, s3_paths).data if s3_paths else [] + supabase_data = supabase_url_content + supabase_s3_content with Manager() as manager: qdrant_contexts = manager.list() @@ -510,15 +510,10 @@ def context_parent_doc_padding(self, found_docs, course_name): executor.map(partial_func1, found_docs[5:]) executor.map(partial_func2, found_docs[:5]) - supabase_contexts_no_duplicates = [] - for context in supabase_contexts: - if context not in supabase_contexts_no_duplicates: - supabase_contexts_no_duplicates.append(context) - - result_contexts = supabase_contexts_no_duplicates + list(qdrant_contexts) - print("len of supabase contexts: ", len(supabase_contexts_no_duplicates)) + unique_contexts = list(set(tuple(item.items()) for item in list(supabase_contexts) + list(qdrant_contexts))) + unique_contexts = [dict(item) for item in unique_contexts] print(f"⏰ Context padding runtime: {(time.monotonic() - start_time):.2f} seconds") - return result_contexts + return unique_contexts def qdrant_context_processing(doc, course_name, result_contexts): @@ -528,23 +523,13 @@ def qdrant_context_processing(doc, course_name, result_contexts): #print("inside qdrant context processing") context_dict = { 'text': doc.page_content, - 'embedding': '', - 'pagenumber': doc.metadata['pagenumber'], - 'readable_filename': doc.metadata['readable_filename'], + 'pagenumber': doc.metadata.get('pagenumber', ''), + 'readable_filename': doc.metadata.get('readable_filename', ''), 'course_name': course_name, - 's3_path': doc.metadata['s3_path'] + 's3_path': doc.metadata.get('s3_path', ''), + 'url': doc.metadata.get('url', ''), + 'base_url': doc.metadata.get('base_url', '') } - - if 'url' in doc.metadata.keys(): - context_dict['url'] = doc.metadata['url'] - else: - context_dict['url'] = '' - - if 'base_url' in doc.metadata.keys(): - context_dict['base_url'] = doc.metadata['url'] - else: - context_dict['base_url'] = '' - result_contexts.append(context_dict) def supabase_context_padding(doc, course_name, sql_data, result_docs): @@ -552,12 +537,8 @@ def supabase_context_padding(doc, course_name, sql_data, result_docs): Does context padding for given doc. 
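Matching against sql_data is done with dict.get() on both sides, url first and then s3_path, so missing metadata keys simply fail to match instead of needing explicit key checks.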
""" # search the document in sql_data - if 'url' in doc.metadata.keys(): - supabase_doc = next((item for item in sql_data if item['url'] == doc.metadata['url']), None) - elif 's3_path' in doc.metadata.keys(): - supabase_doc = next((item for item in sql_data if item['s3_path'] == doc.metadata['s3_path']), None) - else: - supabase_doc = None + url_match = next((item for item in sql_data if item.get('url') == doc.metadata.get('url')), None) + supabase_doc = url_match or next((item for item in sql_data if item.get('s3_path') == doc.metadata.get('s3_path')), None) # create a dictionary if supabase_doc: @@ -576,6 +557,7 @@ def supabase_context_padding(doc, course_name, sql_data, result_docs): context['url'] = supabase_doc['url'] context['base_url'] = supabase_doc['base_url'] context.pop('embedding', None) + result_docs.append(context) elif doc.metadata['pagenumber'] != '': @@ -597,16 +579,12 @@ def supabase_context_padding(doc, course_name, sql_data, result_docs): # refactor as a Supabase object and append context_dict = { 'text': doc.page_content, - 'pagenumber': doc.metadata['pagenumber'], - 'readable_filename': doc.metadata['readable_filename'], + 'pagenumber': doc.metadata.get('pagenumber', ''), + 'readable_filename': doc.metadata.get('readable_filename', ''), 'course_name': course_name, - 's3_path': doc.metadata['s3_path'], - 'base_url': doc.metadata['base_url'] + 's3_path': doc.metadata.get('s3_path', ''), + 'base_url': doc.metadata.get('base_url', ''), + 'url': doc.metadata.get('url', '') } - if 'url' in doc.metadata.keys(): - context_dict['url'] = doc.metadata['url'] - else: - context_dict['url'] = '' - result_docs.append(context_dict) diff --git a/ai_ta_backend/service/sentry_service.py b/ai_ta_backend/service/sentry_service.py index 53b780b0..03c25a4c 100644 --- a/ai_ta_backend/service/sentry_service.py +++ b/ai_ta_backend/service/sentry_service.py @@ -16,7 +16,9 @@ def __init__(self, dsn: str): # Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions. # We recommend adjusting this value in production. profiles_sample_rate=1.0, + environment="development", # 'production', 'staging', 'development', 'testing enable_tracing=True) def capture_exception(self, exception: Exception): sentry_sdk.capture_exception(exception) + From 962bf2c96e6f79daf7c1a31e35451220284149cc Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 1 May 2024 11:38:58 -0500 Subject: [PATCH 11/12] removed elif in url and s3_path list creation --- ai_ta_backend/service/retrieval_service.py | 52 +++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 03b17114..15e58139 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -310,28 +310,28 @@ def getTopContextsWithMQR(self, # sentry_sdk.capture_exception(e) # return err - def format_for_json_mqr(self, found_docs) -> List[Dict]: - """ - Same as format_for_json, but for the new MQR pipeline. - """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.keys(): - print("found no pagenumber") - found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] - - contexts = [ - { - 'text': doc['text'], - 'readable_filename': doc['readable_filename'], - 'course_name ': doc['course_name'], - 's3_path': doc['s3_path'], - 'pagenumber': doc['pagenumber'], - 'url': doc['url'], # wouldn't this error out? 
From 962bf2c96e6f79daf7c1a31e35451220284149cc Mon Sep 17 00:00:00 2001
From: star-nox
Date: Wed, 1 May 2024 11:38:58 -0500
Subject: [PATCH 11/12] removed elif in url and s3_path list creation

---
 ai_ta_backend/service/retrieval_service.py | 52 +++++++++++-----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py
index 03b17114..15e58139 100644
--- a/ai_ta_backend/service/retrieval_service.py
+++ b/ai_ta_backend/service/retrieval_service.py
@@ -310,28 +310,28 @@ def getTopContextsWithMQR(self,
     #   sentry_sdk.capture_exception(e)
     #   return err

-  def format_for_json_mqr(self, found_docs) -> List[Dict]:
-    """
-    Same as format_for_json, but for the new MQR pipeline.
-    """
-    for found_doc in found_docs:
-      if "pagenumber" not in found_doc.keys():
-        print("found no pagenumber")
-        found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp']
-
-    contexts = [
-      {
-        'text': doc['text'],
-        'readable_filename': doc['readable_filename'],
-        'course_name ': doc['course_name'],
-        's3_path': doc['s3_path'],
-        'pagenumber': doc['pagenumber'],
-        'url': doc['url'],  # wouldn't this error out?
-        'base_url': doc['base_url'],
-      } for doc in found_docs
-    ]
-
-    return contexts
+  # def format_for_json_mqr(self, found_docs) -> List[Dict]:
+  #   """
+  #   Same as format_for_json, but for the new MQR pipeline.
+  #   """
+  #   for found_doc in found_docs:
+  #     if "pagenumber" not in found_doc.keys():
+  #       print("found no pagenumber")
+  #       found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp']
+
+  #   contexts = [
+  #     {
+  #       'text': doc['text'],
+  #       'readable_filename': doc['readable_filename'],
+  #       'course_name ': doc['course_name'],
+  #       's3_path': doc['s3_path'],
+  #       'pagenumber': doc['pagenumber'],
+  #       'url': doc['url'],  # wouldn't this error out?
+  #       'base_url': doc['base_url'],
+  #     } for doc in found_docs
+  #   ]
+
+  #   return contexts

   def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, identifier_value: str):
     try:
@@ -483,7 +483,7 @@ def context_parent_doc_padding(self, found_docs, course_name):
     """
     Takes top N contexts acquired from Qdrant similarity search and pads them
     """
-    print("inside main context padding")
+    print("inside context_parent_doc_padding()")
     start_time = time.monotonic()

     # form a list of urls and s3_paths
@@ -491,9 +491,9 @@ def context_parent_doc_padding(self, found_docs, course_name):
     urls = []
     s3_paths = []
     for doc in found_docs[:5]:
       if 'url' in doc.metadata.keys():
-        urls.append(doc.metadata.get('url'))
-      elif 's3_path' in doc.metadata.keys():
-        s3_paths.append(doc.metadata.get('s3_path'))
+        urls.append(doc.metadata['url'])
+      else:
+        s3_paths.append(doc.metadata['s3_path'])

     # query Supabase
     supabase_url_content = self.sqlDb.getDocsByURLs(course_name, urls).data if urls else []
     supabase_s3_content = self.sqlDb.getDocsByS3Paths(course_name, s3_paths).data if s3_paths else []
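Worth flagging about the elif-to-else change in this patch: any retrieved document that has neither a 'url' nor an 's3_path' key in its metadata will now raise a KeyError when s3_paths is built. A defensive variant of the same partition (illustrative only, not what the patch itself does) could look like:

    # Sketch: partition the top-5 docs by available metadata, skipping docs with neither key.
    for doc in found_docs[:5]:
        url = doc.metadata.get('url')
        s3_path = doc.metadata.get('s3_path')
        if url:
            urls.append(url)
        elif s3_path:
            s3_paths.append(s3_path)
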
From a2b70280d2658a88f1a977008729880c57667031 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 6 May 2024 16:40:59 -0500
Subject: [PATCH 12/12] added old getTopContexts back

---
 ai_ta_backend/main.py                      | 58 ++++++++++++++++
 ai_ta_backend/service/retrieval_service.py | 77 +++++++++++++++++++++-
 2 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index 5b52e7d3..612d4d85 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -125,6 +125,64 @@ def getTopContexts(service: RetrievalService) -> Response:
   return response

+
+@app.route('/getTopContextsv2', methods=['POST'])
+def getTopContextsv2(service: RetrievalService) -> Response:
+  """Get most relevant contexts for a given search query.
+
+  Return value
+
+  ## POST body
+  course name (optional) str
+    A json response with TBD fields.
+  search_query
+  token_limit
+  doc_groups
+
+  Returns
+  -------
+  JSON
+    A json response with TBD fields.
+    Metadata fields
+    * pagenumber_or_timestamp
+    * readable_filename
+    * s3_pdf_path
+
+    Example:
+    [
+      {
+        'readable_filename': 'Lumetta_notes',
+        'pagenumber_or_timestamp': 'pg. 19',
+        's3_pdf_path': '/courses//Lumetta_notes.pdf',
+        'text': 'In FSM, we do this...'
+      },
+    ]
+
+  Raises
+  ------
+  Exception
+    Testing how exceptions are handled.
+  """
+  data = request.get_json()
+  search_query: str = data.get('search_query', '')
+  course_name: str = data.get('course_name', '')
+  token_limit: int = data.get('token_limit', 3000)
+  doc_groups: List[str] = data.get('doc_groups', [])
+
+  if search_query == '' or course_name == '':
+    # proper web error "400 Bad request"
+    abort(
+        400,
+        description=
+        f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`"
+    )
+
+  found_documents = service.getTopContextsv2(search_query, course_name, token_limit, doc_groups)
+
+  response = jsonify(found_documents)
+  response.headers.add('Access-Control-Allow-Origin', '*')
+  return response
+
+
 @app.route('/getAll', methods=['GET'])
 def getAll(service: RetrievalService) -> Response:
   """Get all course materials based on the course_name
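For reviewers trying the new route, a request could look like the sketch below. The host, port, and course name are placeholders; the field names mirror the handler above, and token_limit / doc_groups fall back to 3000 and [] when omitted:

    # Sketch: exercising the new POST /getTopContextsv2 endpoint (placeholder host and course).
    import requests

    payload = {
        'search_query': 'What is a finite state machine?',
        'course_name': 'ECE120',
        'token_limit': 3000,
        'doc_groups': [],
    }
    resp = requests.post('http://localhost:8000/getTopContextsv2', json=payload)
    print(resp.json())  # list of context dicts (readable_filename, text, pagenumber, ...)
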
diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py
index 15e58139..4c3d1549 100644
--- a/ai_ta_backend/service/retrieval_service.py
+++ b/ai_ta_backend/service/retrieval_service.py
@@ -59,7 +59,7 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, pos
         openai_api_version=os.environ["OPENAI_API_VERSION"],
         openai_api_type=os.environ['OPENAI_API_TYPE'],
     )  # type: ignore
-
+
   def getTopContexts(self,
                      search_query: str,
                      course_name: str,
@@ -80,6 +80,81 @@ def getTopContexts(self,
     try:
       start_time_overall = time.monotonic()

+      found_docs: list[Document] = self.vector_search(search_query=search_query,
+                                                      course_name=course_name,
+                                                      doc_groups=doc_groups)
+
+      pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
+      # count tokens at start and end, then also count each context.
+      token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " +  # type: ignore
+                                               search_query)
+
+      valid_docs = []
+      num_tokens = 0
+      for doc in found_docs:
+        doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n"
+        num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore
+
+        print(
+            f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {doc.metadata['readable_filename']}"
+        )
+        if token_counter + num_tokens <= token_limit:
+          token_counter += num_tokens
+          valid_docs.append(doc)
+        else:
+          # filled our token size, time to return
+          break
+
+      print(f"Total tokens used: {token_counter}. Docs used: {len(valid_docs)} of {len(found_docs)} docs retrieved")
+      print(f"Course: {course_name} ||| search_query: {search_query}")
+      print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds")
+      if len(valid_docs) == 0:
+        return []
+
+      self.posthog.capture(
+          event_name="getTopContexts_success_DI",
+          properties={
+              "user_query": search_query,
+              "course_name": course_name,
+              "token_limit": token_limit,
+              "total_tokens_used": token_counter,
+              "total_contexts_used": len(valid_docs),
+              "total_unique_docs_retrieved": len(found_docs),
+              "getTopContext_total_latency_sec": time.monotonic() - start_time_overall,
+          },
+      )
+
+      return self.format_for_json(valid_docs)
+    except Exception as e:
+      # return full traceback to front end
+      # err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}"  # type: ignore
+      err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.print_exc} \n{e}"  # type: ignore
+      traceback.print_exc()
+      print(err)
+      self.sentry.capture_exception(e)
+      return err
+
+  def getTopContextsv2(self,
+                       search_query: str,
+                       course_name: str,
+                       token_limit: int = 4_000,
+                       doc_groups: List[str] | None = None) -> Union[List[Dict], str]:
+    """Here's a summary of the work.
+
+    /GET arguments
+      course name (optional) str: A json response with TBD fields.
+
+    Returns
+      JSON: A json response with TBD fields. See main.py:getTopContexts docs.
+      or
+      String: An error message with traceback.
+    """
+    if doc_groups is None:
+      doc_groups = []
+    try:
+      start_time_overall = time.monotonic()
+
       found_docs: list[Document] = self.vector_search(search_query=search_query,
                                                       course_name=course_name,
                                                       doc_groups=doc_groups)