Clean up requirements.txt; remove everything related to ingest
KastanDay committed Mar 5, 2024
1 parent bb6c41a commit 80a1119
Showing 2 changed files with 128 additions and 117 deletions.
161 changes: 82 additions & 79 deletions ai_ta_backend/vector_database.py
@@ -21,7 +21,8 @@

from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding
from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor
from ai_ta_backend.filtering_contexts import filter_top_contexts

# from ai_ta_backend.filtering_contexts import filter_top_contexts
from ai_ta_backend.nomic_logging import delete_from_document_map
from ai_ta_backend.utils_tokenization import count_tokens_and_cost

@@ -496,85 +497,87 @@ def getTopContextsWithMQR(self,
4. [CANCELED BECAUSE POINTLESS] Rank the docs based on the relevance score.
5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document.
"""
try:
top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
start_time_overall = time.monotonic()
mq_start_time = time.monotonic()

# 1. GENERATE MULTIPLE QUERIES
generate_queries = (
MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
(lambda x: list(filter(None, x)))  # keep only non-empty strings
)

generated_queries = generate_queries.invoke({"original_query": search_query})
print("generated_queries", generated_queries)

# 2. VECTOR SEARCH FOR EACH QUERY
batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries,
course_name=course_name,
top_n=top_n_per_query)

# 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
found_docs = [doc for doc, score in found_docs]
print(f"Num docs after re-ranking: {len(found_docs)}")
if len(found_docs) == 0:
return []
print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")

# 4. FILTER DOCS
filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
if len(filtered_docs) == 0:
return []

# 5. TOP DOC CONTEXT PADDING // parent document retriever
final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
print(f"Number of final docs after context padding: {len(final_docs)}")

pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
search_query) # type: ignore

valid_docs = []
num_tokens = 0
for doc in final_docs:
doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore

print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}")
if token_counter + num_tokens <= token_limit:
token_counter += num_tokens
valid_docs.append(doc)
else:
# filled our token size, time to return
break
return 'fail'

print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.")
print(f"Course: {course_name} ||| search_query: {search_query}")
print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")

if len(valid_docs) == 0:
return []

self.posthog.capture('distinct_id_of_the_user',
event='filter_top_contexts_succeeded',
properties={
'user_query': search_query,
'course_name': course_name,
'token_limit': token_limit,
'total_tokens_used': token_counter,
'total_contexts_used': len(valid_docs),
'total_unique_docs_retrieved': len(found_docs),
})

return self.format_for_json_mqr(valid_docs)
except Exception as e:
# return full traceback to front end
err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore
print(err)
sentry_sdk.capture_exception(e)
return err
# try:
# top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
# start_time_overall = time.monotonic()
# mq_start_time = time.monotonic()

# # 1. GENERATE MULTIPLE QUERIES
# generate_queries = (
# MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
#         (lambda x: list(filter(None, x)))  # keep only non-empty strings
# )

# generated_queries = generate_queries.invoke({"original_query": search_query})
# print("generated_queries", generated_queries)

# # 2. VECTOR SEARCH FOR EACH QUERY
# batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries,
# course_name=course_name,
# top_n=top_n_per_query)

# # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
# found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
# found_docs = [doc for doc, score in found_docs]
# print(f"Num docs after re-ranking: {len(found_docs)}")
# if len(found_docs) == 0:
# return []
# print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")

# # 4. FILTER DOCS
# filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
# if len(filtered_docs) == 0:
# return []

# # 5. TOP DOC CONTEXT PADDING // parent document retriever
# final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
# print(f"Number of final docs after context padding: {len(final_docs)}")

# pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
# token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
# search_query) # type: ignore

# valid_docs = []
# num_tokens = 0
# for doc in final_docs:
# doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
# num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore

# print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}")
# if token_counter + num_tokens <= token_limit:
# token_counter += num_tokens
# valid_docs.append(doc)
# else:
# # filled our token size, time to return
# break

# print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.")
# print(f"Course: {course_name} ||| search_query: {search_query}")
# print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")

# if len(valid_docs) == 0:
# return []

# self.posthog.capture('distinct_id_of_the_user',
# event='filter_top_contexts_succeeded',
# properties={
# 'user_query': search_query,
# 'course_name': course_name,
# 'token_limit': token_limit,
# 'total_tokens_used': token_counter,
# 'total_contexts_used': len(valid_docs),
# 'total_unique_docs_retrieved': len(found_docs),
# })

# return self.format_for_json_mqr(valid_docs)
# except Exception as e:
# # return full traceback to front end
# err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore
# print(err)
# sentry_sdk.capture_exception(e)
# return err

def format_for_json_mqr(self, found_docs) -> List[Dict]:
"""
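
For reference, step 3 above relies on self.reciprocal_rank_fusion, whose body is outside this diff. Below is a minimal sketch of standard reciprocal rank fusion (each document accumulates 1 / (k + rank) across the ranked lists it appears in), assuming LangChain Document objects keyed by their page_content; the repository's actual method may differ:

from collections import defaultdict

from langchain.schema import Document


def reciprocal_rank_fusion(ranked_lists: list[list[Document]], k: int = 60) -> list[tuple[Document, float]]:
    # Fuse several ranked lists: each doc scores the sum of 1 / (k + rank) over the lists it appears in.
    fused_scores: defaultdict = defaultdict(float)
    docs_by_key: dict = {}
    for docs in ranked_lists:
        for rank, doc in enumerate(docs):
            key = doc.page_content  # assumption: page_content uniquely identifies a chunk
            docs_by_key[key] = doc
            fused_scores[key] += 1.0 / (k + rank)
    # Highest fused score first, matching the `[doc for doc, score in found_docs]` unpacking above.
    return sorted(((docs_by_key[key], score) for key, score in fused_scores.items()),
                  key=lambda pair: pair[1], reverse=True)

This matches how the result is consumed above: a list of (doc, score) pairs ordered best-first.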
84 changes: 46 additions & 38 deletions requirements.txt
@@ -1,59 +1,67 @@
# On Apple Silicon: pip uninstall grpcio -y; conda install grpcio -y
nomic==2.0.14
Flask==3.0.0
flask-cors==4.0.0
gunicorn==21.2.0
protobuf==4.25.0
langchain==0.0.331
langchainhub==0.1.14
click==8.1.7
aiohttp==3.8.6
wheel==0.41.3
click==8.1.7
MarkupSafe==2.1.3
Werkzeug==3.0.1
mkdocstrings[python]==0.23.0
mkdocs-material==9.4.7
itsdangerous==2.1.2
wheel==0.41.3
Flask==3.0.0
gunicorn==21.2.0
tiktoken==0.5.1
Jinja2==3.1.2
python-dotenv==1.0.0
flask-cors==4.0.0
qdrant-client==1.7.3
mkdocs==1.5.3
openai==0.28.1
supabase==2.0.2
SQLAlchemy==2.0.22
boto3==1.28.79
PyMuPDF==1.23.6
tabulate==0.9.0
typing-inspect==0.9.0
typing_extensions==4.8.0
pysrt==1.1.2
docx2txt==0.8
pydub==0.25.1
ffmpeg-python==0.2.0
ffprobe==0.5
ffmpeg==1.4
beautifulsoup4==4.12.2
canvasapi==3.2.0
GitPython==3.1.40

# Utils
tiktoken==0.5.1
python-dotenv==1.0.0
pydantic==1.10.13 # pydantic v1 works better for ray
flask-executor==1.0.0

# AI & core services
nomic==2.0.14
openai==0.28.1
langchain==0.0.331
langchainhub==0.1.14

# Data
boto3==1.28.79
qdrant-client==1.7.3
supabase==2.0.2

# Logging
posthog==3.1.0
sentry-sdk==1.39.1

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl

# removed due to /ingest in Beam
# canvasapi==3.2.0
# GitPython==3.1.40
# pysrt==1.1.2
# docx2txt==0.8
# pydub==0.25.1
# ffmpeg-python==0.2.0
# ffprobe==0.5
# ffmpeg==1.4
# beautifulsoup4==4.12.2
# PyMuPDF==1.23.6
# pytesseract==0.3.10 # image OCR
# openpyxl==3.1.2 # excel
# networkx==3.2.1 # unused part of excel partitioning :(
# python-pptx==0.6.23
# unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4

# pdf packages for unstructured
# pdf2image==1.16.3
# pdfminer.six==20221105
# opencv-python-headless==4.8.1.78
# unstructured.pytesseract==0.3.12
# unstructured-inference==0.7.11 # this is the real large one :(
pytesseract==0.3.10 # image OCR
openpyxl==3.1.2 # excel
networkx==3.2.1 # unused part of excel partitioning :(
python-pptx==0.6.23
unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4
# unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl
pydantic==1.10.13 # pydantic v1 works better for ray
posthog==3.1.0
sentry-sdk==1.39.1
# ray==2.8.1
# newrelic==9.3.0
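
As a quick sanity check after slimming the file down, one can compare a few installed versions against the pins. A minimal sketch using only the standard library; the pins dict below copies a handful of entries from requirements.txt above and is not exhaustive:

from importlib.metadata import PackageNotFoundError, version

# A few pins copied from requirements.txt above; extend with the rest as needed.
pins = {"Flask": "3.0.0", "langchain": "0.0.331", "qdrant-client": "1.7.3", "openai": "0.28.1"}
for name, expected in pins.items():
    try:
        installed = version(name)
        status = "OK" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{name}=={expected}: {status}")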
