modified function for removing duplicates in padded contexts
star-nox committed Nov 9, 2023
1 parent 73823e7 commit 647964f
Showing 2 changed files with 16 additions and 12 deletions.
ai_ta_backend/parallel_context_processing.py (6 additions, 4 deletions)

@@ -27,11 +27,13 @@ def context_processing(found_docs, search_query, course_name):
     executor.map(partial_func1, found_docs[5:])
     executor.map(partial_func2, found_docs[:5])
 
-  print("qdrant contexts: ", len(qdrant_contexts))
-  print("supabase contexts: ", len(supabase_contexts))
-
-  result_contexts = list(supabase_contexts) + list(qdrant_contexts)
+  supabase_contexts_no_duplicates = []
+  for context in supabase_contexts:
+    if context not in supabase_contexts_no_duplicates:
+      supabase_contexts_no_duplicates.append(context)
+
+  result_contexts = supabase_contexts_no_duplicates + list(qdrant_contexts)
 
   print(f"⏰ Context processing runtime: {(time.monotonic() - start_time):.2f} seconds")
 
   return result_contexts
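The new dedup loop tests list membership, which is O(n²) in the number of contexts but works even when each context is an unhashable dict. A minimal order-preserving alternative, sketched under the assumption that each context carries some hashable identifier (the "id" key below is hypothetical, not a field this repo is known to use):

def dedupe_contexts(contexts):
  # Order-preserving dedup via a set of already-seen keys: O(n)
  # instead of the O(n^2) list-membership test in the commit.
  # Assumes each context is a dict with a hashable "id" field
  # (hypothetical); the commit's version compares whole contexts
  # and so needs no such key.
  seen = set()
  unique = []
  for context in contexts:
    key = context.get("id")
    if key not in seen:
      seen.add(key)
      unique.append(context)
  return unique

Whole-dict comparison, as the commit does it, is the safer choice when no stable key exists; the set version only pays off once result lists grow large.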
ai_ta_backend/vector_database.py (10 additions, 8 deletions)

@@ -1301,7 +1301,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =
 
     # Vector search with ONLY original query
     # found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)
-
+    mq_start_time = time.monotonic()
     # Multi query retriever
     generate_queries = (
         MULTI_QUERY_PROMPT
@@ -1322,6 +1322,8 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =
     if len(found_docs) == 0:
       return []
 
+    print(f"⏰ Multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")
+
     # 'context padding' // 'parent document retriever'
     # TODO maybe only do context padding for top 5 docs? Otherwise it's wasteful imo.
     final_docs = context_processing(found_docs, search_query, course_name)
@@ -1347,13 +1349,13 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =
         # filled our token size, time to return
         break
 
-    for v in valid_docs:
-      print("FINAL VALID DOCS:")
-      #print("valid doc text: ", v['text'])
-      print("s3_path: ", v['s3_path'])
-      print("url: ", v['url'])
-      print("readable_filename: ", v['readable_filename'])
-      print("\n")
+    # for v in valid_docs:
+    #   print("FINAL VALID DOCS:")
+    #   #print("valid doc text: ", v['text'])
+    #   print("s3_path: ", v['s3_path'])
+    #   print("url: ", v['url'])
+    #   print("readable_filename: ", v['readable_filename'])
+    #   print("\n")
 
     print(f"Total tokens used: {token_counter} total docs: {len(found_docs)} num docs used: {len(valid_docs)}")
    print(f"Course: {course_name} ||| search_query: {search_query}")
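For orientation, this last hunk sits inside a loop that fills valid_docs up to a token budget (hence the "filled our token size, time to return" break). A rough sketch of that accumulation pattern, where count_tokens and the doc['text'] field are assumptions rather than the repo's actual helpers:

def select_docs_within_limit(docs, token_limit, count_tokens):
  # Take docs in rank order until the next one would overflow
  # the token budget. count_tokens is a hypothetical stand-in
  # for whatever tokenizer helper getTopContexts really uses.
  valid_docs = []
  token_counter = 0
  for doc in docs:
    doc_tokens = count_tokens(doc['text'])  # 'text' key assumed
    if token_counter + doc_tokens > token_limit:
      break  # filled our token size, time to return
    token_counter += doc_tokens
    valid_docs.append(doc)
  return valid_docs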
