From fdf2910f10f571660c23076c07b9e402c6f55ae7 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 2 Nov 2023 18:40:59 -0500 Subject: [PATCH 1/3] metadata refactor WIP --- ai_ta_backend/vector_database.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 35bee539..2b86c738 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -933,8 +933,7 @@ def getAll( return distinct_dicts def vector_search(self, search_query, course_name): - #top_n = 80 - top_n = 5 + top_n = 80 o = OpenAIEmbeddings() # type: ignore user_query_embedding = o.embed_query(search_query) myfilter = models.Filter( @@ -1058,7 +1057,7 @@ def context_padding(self, found_docs, search_query, course_name): 'url': doc.metadata['url'], 'base_url':doc.metadata['base_url'] } - + print("context_dict: ", context_dict) result_contexts.append(context_dict) print("length of final contexts: ", len(result_contexts)) @@ -1242,9 +1241,9 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) # type: ignore valid_docs = [] for d in found_docs: - if "pagenumber" not in d.payload["metadata"].keys(): # type: ignore - d.payload["metadata"]["pagenumber"] = d.payload["metadata"]["pagenumber_or_timestamp"] # type: ignore - doc_string = f"---\nDocument: {d.payload['metadata']['readable_filename']}{', page: ' + str(d.payload['metadata']['pagenumber']) if d.payload['metadata']['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" # type: ignore + if "pagenumber" not in d.payload.keys(): # type: ignore + d.payload["pagenumber"] = d.payload["pagenumber_or_timestamp"] # type: ignore + doc_string = f"---\nDocument: {d.payload['readable_filename']}{', page: ' + str(d.payload['pagenumber']) if d.payload['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" # type: ignore num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore print(f"Page: {d.payload.get('page_content')[:100]}...") # type: ignore From 6f1128af07708a92595d3ff3d4977c7e4979894e Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 6 Nov 2023 12:23:02 -0600 Subject: [PATCH 2/3] testing context retrieval flow --- ai_ta_backend/vector_database.py | 116 ++++++++++++++++--------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 2b86c738..3dba9f69 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -990,7 +990,7 @@ def context_padding(self, found_docs, search_query, course_name): retrieved_contexts_identifiers = {} result_contexts = [] for doc in found_docs: # top N from QDRANT - + # if url present, query through that if doc.metadata['url']: parent_doc_id = doc.metadata['url'] @@ -1005,61 +1005,67 @@ def context_padding(self, found_docs, search_query, course_name): retrieved_contexts_identifiers[parent_doc_id] = [] data = response.data # at this point, we have the origin parent document from Supabase - filename = data[0]['readable_filename'] - contexts = data[0]['contexts'] - print("no of contexts within the og doc: ", len(contexts)) - - if 'chunk_index' in doc.metadata: - # retrieve by chunk index --> pad contexts - target_chunk_index = doc.metadata['chunk_index'] - print("target chunk_index: ", target_chunk_index) - - for context in contexts: - curr_chunk_index = 
context['chunk_index'] - # collect between range of target index - 3 and target index + 3 - if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3) and curr_chunk_index not in retrieved_contexts_identifiers[parent_doc_id]: - context['readable_filename'] = filename - context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] - - result_contexts.append(context) - # add current index to retrieved_contexts_identifiers after each context is retrieved to avoid duplicates - retrieved_contexts_identifiers[parent_doc_id].append(curr_chunk_index) - - elif doc.metadata['pagenumber'] != '': - # retrieve by page number --> retrieve the single whole page? - pagenumber = doc.metadata['pagenumber'] - print("target pagenumber: ", pagenumber) - - for context in contexts: - if context['pagenumber'] == pagenumber: - context['readable_filename'] = filename - context['course_name'] = course_name - context['s3_path'] = data[0]['s3_path'] - context['url'] = data[0]['url'] - context['base_url'] = data[0]['base_url'] - result_contexts.append(context) + if len(data) > 0: + print("-------------------") + print("len of data: ", len(data)) + if len(data) == 0: + print("data: ", data) + print("-------------------") + filename = data[0]['readable_filename'] + contexts = data[0]['contexts'] + print("no of contexts within the og doc: ", len(contexts)) + + if 'chunk_index' in doc.metadata: + # retrieve by chunk index --> pad contexts + target_chunk_index = doc.metadata['chunk_index'] + print("target chunk_index: ", target_chunk_index) + + for context in contexts: + curr_chunk_index = context['chunk_index'] + # collect between range of target index - 3 and target index + 3 + if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3) and curr_chunk_index not in retrieved_contexts_identifiers[parent_doc_id]: + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + + result_contexts.append(context) + # add current index to retrieved_contexts_identifiers after each context is retrieved to avoid duplicates + retrieved_contexts_identifiers[parent_doc_id].append(curr_chunk_index) + + elif doc.metadata['pagenumber'] != '': + # retrieve by page number --> retrieve the single whole page? 
+ pagenumber = doc.metadata['pagenumber'] + print("target pagenumber: ", pagenumber) + + for context in contexts: + if context['pagenumber'] == pagenumber: + context['readable_filename'] = filename + context['course_name'] = course_name + context['s3_path'] = data[0]['s3_path'] + context['url'] = data[0]['url'] + context['base_url'] = data[0]['base_url'] + result_contexts.append(context) + + # add page number to retrieved_contexts_identifiers after all contexts belonging to that page number have been retrieved + retrieved_contexts_identifiers[parent_doc_id].append(pagenumber) + else: + # dont pad, re-factor it to be like Supabase object + print("no chunk index or page number, just appending the QDRANT context") + context_dict = {'text': doc.page_content, + 'embedding': '', + 'timestamp': doc.metadata['timestamp'], + 'pagenumber': doc.metadata['pagenumber'], + 'readable_filename': doc.metadata['readable_filename'], + 'course_name': course_name, + 's3_path': doc.metadata['s3_path'], + 'url': doc.metadata['url'], + 'base_url':doc.metadata['base_url'] + } + print("context_dict: ", context_dict) + result_contexts.append(context_dict) - # add page number to retrieved_contexts_identifiers after all contexts belonging to that page number have been retrieved - retrieved_contexts_identifiers[parent_doc_id].append(pagenumber) - else: - # dont pad, re-factor it to be like Supabase object - print("no chunk index or page number, just appending the QDRANT context") - context_dict = {'text': doc.page_content, - 'embedding': '', - 'timestamp': doc.metadata['timestamp'], - 'pagenumber': doc.metadata['pagenumber'], - 'readable_filename': doc.metadata['readable_filename'], - 'course_name': course_name, - 's3_path': doc.metadata['s3_path'], - 'url': doc.metadata['url'], - 'base_url':doc.metadata['base_url'] - } - print("context_dict: ", context_dict) - result_contexts.append(context_dict) - print("length of final contexts: ", len(result_contexts)) return result_contexts From 678197756d7b5043cbfc975a29a5119dcc5e7688 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 6 Nov 2023 12:24:02 -0600 Subject: [PATCH 3/3] removed print statements --- ai_ta_backend/vector_database.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 3dba9f69..6b581b13 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -1006,11 +1006,6 @@ def context_padding(self, found_docs, search_query, course_name): data = response.data # at this point, we have the origin parent document from Supabase if len(data) > 0: - print("-------------------") - print("len of data: ", len(data)) - if len(data) == 0: - print("data: ", data) - print("-------------------") filename = data[0]['readable_filename'] contexts = data[0]['contexts'] print("no of contexts within the og doc: ", len(contexts))
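
Note on the retrieval flow these patches build up: in context_padding() (PATCH 2/3), each top-N Qdrant hit is widened to its neighbouring chunks (chunk_index - 3 through chunk_index + 3) taken from the parent document row fetched from Supabase, and already-returned chunk indices are tracked per parent document so overlapping hits do not duplicate contexts. Below is a minimal, self-contained Python sketch of that windowing logic. It is illustrative only: the helper name pad_by_chunk_window, the use of a set instead of the per-document list kept in retrieved_contexts_identifiers, and the toy in-memory parent document are assumptions, not code from ai_ta_backend/vector_database.py.

from typing import Any, Dict, List

def pad_by_chunk_window(parent_contexts: List[Dict[str, Any]],
                        target_chunk_index: int,
                        seen_chunk_indexes: set,
                        window: int = 3) -> List[Dict[str, Any]]:
    """Collect the chunks within +/- `window` of a hit, skipping duplicates."""
    padded = []
    for context in parent_contexts:
        idx = context["chunk_index"]
        in_window = target_chunk_index - window <= idx <= target_chunk_index + window
        if in_window and idx not in seen_chunk_indexes:
            padded.append(context)
            seen_chunk_indexes.add(idx)  # remember it so a later hit on a nearby chunk does not re-add it
    return padded

# A parent document with five chunks, and Qdrant hits on chunks 2 and 4.
parent = [{"chunk_index": i, "text": f"chunk {i}"} for i in range(5)]
seen: set = set()
print([c["chunk_index"] for c in pad_by_chunk_window(parent, 2, seen)])  # [0, 1, 2, 3, 4]
print([c["chunk_index"] for c in pad_by_chunk_window(parent, 4, seen)])  # [] -- all neighbours already returned

In the patch itself, the same loop also copies readable_filename, course_name, s3_path, url, and base_url from the Supabase row onto every padded context, so downstream consumers see the same dict shape whether a context came from padding or straight from Qdrant.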
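
A similar note on PATCH 1/3: get_stuffed_prompt() now reads metadata keys directly off the flat Qdrant payload rather than a nested payload["metadata"] dict, back-filling pagenumber from the legacy pagenumber_or_timestamp key when it is missing. The sketch below shows that fallback and the doc-string assembly on a toy payload; make_doc_string is a hypothetical helper, not a function in the repository.

from typing import Any, Dict

def make_doc_string(payload: Dict[str, Any]) -> str:
    # Back-fill the new key from the legacy one so both payload shapes work.
    if "pagenumber" not in payload:
        payload["pagenumber"] = payload.get("pagenumber_or_timestamp", "")
    page_part = f", page: {payload['pagenumber']}" if payload["pagenumber"] else ""
    return f"---\nDocument: {payload['readable_filename']}{page_part}\n{payload.get('page_content', '')}\n"

legacy_payload = {"readable_filename": "lecture01.pdf",
                  "pagenumber_or_timestamp": 4,
                  "page_content": "Intro to vector databases..."}
print(make_doc_string(legacy_payload))
# ---
# Document: lecture01.pdf, page: 4
# Intro to vector databases...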