Skip to content

Commit

Permalink
added similar format_for_json for MQR
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Nov 29, 2023
1 parent 9aa030c commit 00e2f9d
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 12 deletions.
8 changes: 7 additions & 1 deletion ai_ta_backend/parallel_context_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,15 @@ def qdrant_context_processing(doc, course_name, result_contexts):
}
if 'url' in doc.metadata.keys():
context_dict['url'] = doc.metadata['url']
else:
context_dict['url'] = ''

result_contexts.append(context_dict)
return result_contexts

def supabase_context_padding(doc, course_name, result_docs):
"""
Does context padding for given doc. Used with context_padding()
Does context padding for given doc.
"""

# query by url or s3_path
Expand Down Expand Up @@ -98,6 +100,7 @@ def supabase_context_padding(doc, course_name, result_docs):
pagenumber = doc.metadata['pagenumber']

for context in contexts:
# pad contexts belonging to same page number
if int(context['pagenumber']) == pagenumber:
context['readable_filename'] = filename
context['course_name'] = course_name
Expand All @@ -119,6 +122,9 @@ def supabase_context_padding(doc, course_name, result_docs):
}
if 'url' in doc.metadata.keys():
context_dict['url'] = doc.metadata['url']
else:
context_dict['url'] = ''

result_docs.append(context_dict)


Expand Down
44 changes: 33 additions & 11 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -1409,7 +1409,7 @@ def getTopContextsWithMQR(self, search_query: str, course_name: str, token_limit
print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")
if len(valid_docs) == 0:
return []
return self.format_for_json(valid_docs)
return self.format_for_json_mqr(valid_docs)
except Exception as e:
# return full traceback to front end
err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore
Expand Down Expand Up @@ -1577,6 +1577,29 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i
print(err)
return err


def format_for_json_mqr(self, found_docs) -> List[Dict]:
    """Format MQR-pipeline contexts into JSON-serializable dicts.

    Counterpart of ``format_for_json`` for the MQR pipeline: here each
    element of ``found_docs`` is indexed like a plain dict
    (``doc['text']``) rather than through ``.page_content`` / ``.metadata``.

    Args:
        found_docs: Iterable of dict-like contexts. Each must provide
            ``text``, ``readable_filename``, ``course_name``, ``s3_path``
            and either ``pagenumber`` or ``pagenumber_or_timestamp``.
            ``url`` and ``base_url`` are optional.

    Returns:
        One dict per input context, in input order. ``url`` and
        ``base_url`` are ``None`` when the context does not carry them,
        mirroring the ``.get(...)`` lookups in ``format_for_json``.

    Raises:
        KeyError: If a required field is missing, including the case where
            neither ``pagenumber`` nor ``pagenumber_or_timestamp`` exists.

    Note:
        A context lacking ``pagenumber`` is updated in place with the value
        of ``pagenumber_or_timestamp`` (same side effect as before).
    """
    contexts = []
    for doc in found_docs:
        if "pagenumber" not in doc:
            # Fall back to the alternate field name; presumably records
            # written under an older vector-db schema -- TODO confirm.
            print("found no pagenumber")
            doc['pagenumber'] = doc['pagenumber_or_timestamp']

        contexts.append({
            'text': doc['text'],
            'readable_filename': doc['readable_filename'],
            # NOTE(review): this key has a trailing space. It is kept
            # byte-for-byte because consumers may already depend on it;
            # confirm with the frontend before renaming.
            'course_name ': doc['course_name'],
            's3_path': doc['s3_path'],
            'pagenumber': doc['pagenumber'],
            # Optional fields: .get() avoids a KeyError when they are absent.
            'url': doc.get('url'),
            'base_url': doc.get('base_url'),
        })

    return contexts


def format_for_json(self, found_docs: List[Document]) -> List[Dict]:
"""Formatting only.
{'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 20', 'text': 'In Assembly language, the code does that...'},]}
Expand All @@ -1590,21 +1613,20 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]:
Returns:
List[Dict]: _description_
"""

for found_doc in found_docs:
if "pagenumber" not in found_doc.keys():
if "pagenumber" not in found_doc.metadata.keys():
print("found no pagenumber")
found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp']
found_doc.metadata['pagenumber'] = found_doc.metadata['pagenumber_or_timestamp']

contexts = [{
'text': doc['text'],
'readable_filename': doc['readable_filename'],
'course_name ': doc['course_name'],
's3_path': doc['s3_path'],
'pagenumber': doc['pagenumber'], # this because vector db schema is older...
'text': doc.page_content,
'readable_filename': doc.metadata['readable_filename'],
'course_name ': doc.metadata['course_name'],
's3_path': doc.metadata['s3_path'],
'pagenumber': doc.metadata['pagenumber'], # this because vector db schema is older...
# OPTIONAL PARAMS...
'url': doc['url'], # wouldn't this error out?
'base_url': doc['base_url'],
'url': doc.metadata.get('url'), # wouldn't this error out?
'base_url': doc.metadata.get('base_url'),
} for doc in found_docs]

return contexts
Expand Down

0 comments on commit 00e2f9d

Please sign in to comment.