From 00e2f9d6c35fd655fbaca1480f3350de6e474666 Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 29 Nov 2023 12:48:14 -0600 Subject: [PATCH] added similar format_for_json for MQR --- ai_ta_backend/parallel_context_processing.py | 8 +++- ai_ta_backend/vector_database.py | 44 +++++++++++++++----- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/ai_ta_backend/parallel_context_processing.py b/ai_ta_backend/parallel_context_processing.py index d0297933..61793eb7 100644 --- a/ai_ta_backend/parallel_context_processing.py +++ b/ai_ta_backend/parallel_context_processing.py @@ -54,13 +54,15 @@ def qdrant_context_processing(doc, course_name, result_contexts): } if 'url' in doc.metadata.keys(): context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' result_contexts.append(context_dict) return result_contexts def supabase_context_padding(doc, course_name, result_docs): """ - Does context padding for given doc. Used with context_padding() + Does context padding for given doc.
""" # query by url or s3_path @@ -98,6 +100,7 @@ def supabase_context_padding(doc, course_name, result_docs): pagenumber = doc.metadata['pagenumber'] for context in contexts: + # pad contexts belonging to same page number if int(context['pagenumber']) == pagenumber: context['readable_filename'] = filename context['course_name'] = course_name @@ -119,6 +122,9 @@ def supabase_context_padding(doc, course_name, result_docs): } if 'url' in doc.metadata.keys(): context_dict['url'] = doc.metadata['url'] + else: + context_dict['url'] = '' + result_docs.append(context_dict) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 282b0979..78eee5f9 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -1409,7 +1409,7 @@ def getTopContextsWithMQR(self, search_query: str, course_name: str, token_limit print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") if len(valid_docs) == 0: return [] - return self.format_for_json(valid_docs) + return self.format_for_json_mqr(valid_docs) except Exception as e: # return full traceback to front end err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore @@ -1577,6 +1577,29 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i print(err) return err + + def format_for_json_mqr(self, found_docs) -> List[Dict]: + """ + Same as format_for_json, but for the new MQR pipeline. 
+ """ + for found_doc in found_docs: + if "pagenumber" not in found_doc.keys(): + print("found no pagenumber") + found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] + + contexts = [{ + 'text': doc['text'], + 'readable_filename': doc['readable_filename'], + 'course_name ': doc['course_name'], + 's3_path': doc['s3_path'], + 'pagenumber': doc['pagenumber'], + 'url': doc['url'], # wouldn't this error out? + 'base_url': doc['base_url'], + } for doc in found_docs] + + return contexts + + def format_for_json(self, found_docs: List[Document]) -> List[Dict]: """Formatting only. {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 20', 'text': 'In Assembly language, the code does that...'},]} @@ -1590,21 +1613,20 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: Returns: List[Dict]: _description_ """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.keys(): + if "pagenumber" not in found_doc.metadata.keys(): print("found no pagenumber") - found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] + found_doc.metadata['pagenumber'] = found_doc.metadata['pagenumber_or_timestamp'] contexts = [{ - 'text': doc['text'], - 'readable_filename': doc['readable_filename'], - 'course_name ': doc['course_name'], - 's3_path': doc['s3_path'], - 'pagenumber': doc['pagenumber'], # this because vector db schema is older... + 'text': doc.page_content, + 'readable_filename': doc.metadata['readable_filename'], + 'course_name ': doc.metadata['course_name'], + 's3_path': doc.metadata['s3_path'], + 'pagenumber': doc.metadata['pagenumber'], # this because vector db schema is older... # OPTIONAL PARAMS... - 'url': doc['url'], # wouldn't this error out? - 'base_url': doc['base_url'], + 'url': doc.metadata.get('url'), # wouldn't this error out? 
+ 'base_url': doc.metadata.get('base_url'), } for doc in found_docs] return contexts