modified context padding for only first 5 docs
star-nox committed Nov 9, 2023
1 parent 0a17cd1 commit 73823e7
Showing 2 changed files with 94 additions and 14 deletions.
104 changes: 92 additions & 12 deletions ai_ta_backend/parallel_context_processing.py
@@ -6,6 +6,37 @@
from functools import partial
from multiprocessing import Manager

DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])


def context_processing(found_docs, search_query, course_name):
"""
Takes the top N contexts acquired from Qdrant similarity search and pads them
"""
print("inside main context padding")
start_time = time.monotonic()

with Manager() as manager:
qdrant_contexts = manager.list()
supabase_contexts = manager.list()
partial_func1 = partial(qdrant_context_processing, course_name=course_name, result_contexts=qdrant_contexts)
partial_func2 = partial(supabase_context_padding, course_name=course_name, result_docs=supabase_contexts)
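# Only the first 5 docs get the expensive Supabase parent-document padding;
# the remaining docs are simply reshaped from their Qdrant payloads.
# The Manager lists are process-safe proxies, so worker processes can append to them.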

with ProcessPoolExecutor() as executor:
executor.map(partial_func1, found_docs[5:])
executor.map(partial_func2, found_docs[:5])
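# Note: executor.map yields results lazily, but leaving the "with" block calls
# shutdown(wait=True), so both batches finish before the counts below are printed.
# Any exception raised inside a worker is dropped silently, because the map
# result iterators are never consumed.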

print("qdrant contexts: ", len(qdrant_contexts))
print("supabase contexts: ", len(supabase_contexts))

result_contexts = list(supabase_contexts) + list(qdrant_contexts)

print(f"⏰ Context processing runtime: {(time.monotonic() - start_time):.2f} seconds")

return result_contexts


def qdrant_context_processing(doc, course_name, result_contexts):
"""
Refactor Qdrant objects into Supabase objects and append to result_docs
@@ -25,21 +56,70 @@ def qdrant_context_processing(doc, course_name, result_contexts):
result_contexts.append(context_dict)
return result_contexts


def context_padding(found_docs, search_query, course_name):
def supabase_context_padding(doc, course_name, result_docs):
"""
Takes the top N contexts acquired from Qdrant similarity search and pads them
Does context padding for a given doc. Called from context_processing()
"""
print("inside main context padding")
start_time = time.monotonic()
print("in supabase context processing")

# query by url or s3_path
if 'url' in doc.metadata.keys() and doc.metadata['url']:
parent_doc_id = doc.metadata['url']
response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', course_name).eq('url', parent_doc_id).execute()

else:
parent_doc_id = doc.metadata['s3_path']
response = SUPABASE_CLIENT.table(DOCUMENTS_TABLE).select('*').eq('course_name', course_name).eq('s3_path', parent_doc_id).execute()
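# Either query fetches the parent document row; its 'contexts' column holds the
# chunks of the original file, which are the source of the padding below.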

with Manager() as manager:
result_contexts = manager.list()
partial_func = partial(qdrant_context_processing, course_name=course_name, result_contexts=result_contexts)
data = response.data

with ProcessPoolExecutor() as executor:
executor.map(partial_func, found_docs[5:])
if len(data) > 0:
# do the padding
filename = data[0]['readable_filename']
contexts = data[0]['contexts']
print("no of contexts within the og doc: ", len(contexts))

if 'chunk_index' in doc.metadata and 'chunk_index' in contexts[0].keys():
# pad contexts by chunk index + 3 and - 3
target_chunk_index = doc.metadata['chunk_index']
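# i.e. keep a window of up to 7 consecutive chunks: the matched chunk plus 3 on either side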
for context in contexts:
curr_chunk_index = context['chunk_index']
if (target_chunk_index - 3 <= curr_chunk_index <= target_chunk_index + 3):
context['readable_filename'] = filename
context['course_name'] = course_name
context['s3_path'] = data[0]['s3_path']
context['url'] = data[0]['url']
context['base_url'] = data[0]['base_url']
result_docs.append(context)

elif doc.metadata['pagenumber'] != '':
# pad contexts belonging to same page number
pagenumber = doc.metadata['pagenumber']

for context in contexts:
if int(context['pagenumber']) == pagenumber:
context['readable_filename'] = filename
context['course_name'] = course_name
context['s3_path'] = data[0]['s3_path']
context['url'] = data[0]['url']
context['base_url'] = data[0]['base_url']
result_docs.append(context)

else:
# refactor as a Supabase object and append
context_dict = {
'text': doc.page_content,
'embedding': '',
'pagenumber': doc.metadata['pagenumber'],
'readable_filename': doc.metadata['readable_filename'],
'course_name': course_name,
's3_path': doc.metadata['s3_path'],
'base_url': doc.metadata['base_url']
}
if 'url' in doc.metadata.keys():
context_dict['url'] = doc.metadata['url']
result_docs.append(context_dict)
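# If the Supabase lookup returns no rows (len(data) == 0), nothing is appended,
# so a doc without a matching parent row is silently dropped from the results.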



print(f"⏰ QDRANT processing runtime: {(time.monotonic() - start_time):.2f} seconds")
return list(result_contexts)
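For orientation, a minimal call-site sketch follows (hypothetical values; it assumes found_docs is the list of Qdrant search results, with .page_content and .metadata, that the functions above operate on):

from ai_ta_backend.parallel_context_processing import context_processing

# found_docs would come from the Qdrant similarity search in getTopContexts() below
final_docs = context_processing(found_docs, search_query="example query", course_name="example-course")
# final_docs is a flat list of dicts with keys like 'text', 'readable_filename',
# 'course_name', 's3_path', 'url', 'base_url' and 'pagenumber' (mirroring the
# fallback dict above); the padded Supabase contexts come first, then the reshaped Qdrant hits.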

4 changes: 2 additions & 2 deletions ai_ta_backend/vector_database.py
@@ -42,7 +42,7 @@
from ai_ta_backend.aws import upload_data_files_to_s3
from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor
from ai_ta_backend.utils_tokenization import count_tokens_and_cost
from ai_ta_backend.parallel_context_processing import context_padding
from ai_ta_backend.parallel_context_processing import context_processing


MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation")
@@ -1324,7 +1324,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int =

# 'context padding' // 'parent document retriever'
# TODO maybe only do context padding for top 5 docs? Otherwise it's wasteful imo.
final_docs = context_padding(found_docs, search_query, course_name)
final_docs = context_processing(found_docs, search_query, course_name)
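# context_processing now pads only the first 5 docs via Supabase (per the TODO above);
# the rest are reshaped Qdrant hits (see parallel_context_processing.py).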
print(f"Number of final docs after context padding: {len(final_docs)}")

pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
