Clean up requirements.txt; remove everything related to ingest
KastanDay committed Mar 5, 2024
1 parent bb6c41a commit 80a1119
Showing 2 changed files with 128 additions and 117 deletions.
161 changes: 82 additions & 79 deletions ai_ta_backend/vector_database.py
@@ -21,7 +21,8 @@

from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding
from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor
from ai_ta_backend.filtering_contexts import filter_top_contexts

# from ai_ta_backend.filtering_contexts import filter_top_contexts
from ai_ta_backend.nomic_logging import delete_from_document_map
from ai_ta_backend.utils_tokenization import count_tokens_and_cost

@@ -496,85 +497,87 @@ def getTopContextsWithMQR(self,
4. [CANCELED BECAUSE POINTLESS] Rank the docs based on the relevance score.
5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document.
"""
try:
top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
start_time_overall = time.monotonic()
mq_start_time = time.monotonic()

# 1. GENERATE MULTIPLE QUERIES
generate_queries = (
MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
(lambda x: list(filter(None, x)))  # keep only non-empty strings
)

generated_queries = generate_queries.invoke({"original_query": search_query})
print("generated_queries", generated_queries)

# 2. VECTOR SEARCH FOR EACH QUERY
batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries,
course_name=course_name,
top_n=top_n_per_query)

# 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
found_docs = [doc for doc, score in found_docs]
print(f"Num docs after re-ranking: {len(found_docs)}")
if len(found_docs) == 0:
return []
print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")

# 4. FILTER DOCS
filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
if len(filtered_docs) == 0:
return []

# 5. TOP DOC CONTEXT PADDING // parent document retriever
final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
print(f"Number of final docs after context padding: {len(final_docs)}")

pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
search_query) # type: ignore

valid_docs = []
num_tokens = 0
for doc in final_docs:
doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore

print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}")
if token_counter + num_tokens <= token_limit:
token_counter += num_tokens
valid_docs.append(doc)
else:
# filled our token size, time to return
break
return 'fail'

print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.")
print(f"Course: {course_name} ||| search_query: {search_query}")
print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")

if len(valid_docs) == 0:
return []

self.posthog.capture('distinct_id_of_the_user',
event='filter_top_contexts_succeeded',
properties={
'user_query': search_query,
'course_name': course_name,
'token_limit': token_limit,
'total_tokens_used': token_counter,
'total_contexts_used': len(valid_docs),
'total_unique_docs_retrieved': len(found_docs),
})

return self.format_for_json_mqr(valid_docs)
except Exception as e:
# return full traceback to front end
err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore
print(err)
sentry_sdk.capture_exception(e)
return err
# try:
# top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
# start_time_overall = time.monotonic()
# mq_start_time = time.monotonic()

# # 1. GENERATE MULTIPLE QUERIES
# generate_queries = (
# MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
#         (lambda x: list(filter(None, x)))  # keep only non-empty strings
# )

# generated_queries = generate_queries.invoke({"original_query": search_query})
# print("generated_queries", generated_queries)

# # 2. VECTOR SEARCH FOR EACH QUERY
# batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries,
# course_name=course_name,
# top_n=top_n_per_query)

# # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
# found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
# found_docs = [doc for doc, score in found_docs]
# print(f"Num docs after re-ranking: {len(found_docs)}")
# if len(found_docs) == 0:
# return []
# print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")

# # 4. FILTER DOCS
# filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
# if len(filtered_docs) == 0:
# return []

# # 5. TOP DOC CONTEXT PADDING // parent document retriever
# final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
# print(f"Number of final docs after context padding: {len(final_docs)}")

# pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
# token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' +
# search_query) # type: ignore

# valid_docs = []
# num_tokens = 0
# for doc in final_docs:
# doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
# num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore

# print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}")
# if token_counter + num_tokens <= token_limit:
# token_counter += num_tokens
# valid_docs.append(doc)
# else:
# # filled our token size, time to return
# break

# print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.")
# print(f"Course: {course_name} ||| search_query: {search_query}")
# print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")

# if len(valid_docs) == 0:
# return []

# self.posthog.capture('distinct_id_of_the_user',
# event='filter_top_contexts_succeeded',
# properties={
# 'user_query': search_query,
# 'course_name': course_name,
# 'token_limit': token_limit,
# 'total_tokens_used': token_counter,
# 'total_contexts_used': len(valid_docs),
# 'total_unique_docs_retrieved': len(found_docs),
# })

# return self.format_for_json_mqr(valid_docs)
# except Exception as e:
# # return full traceback to front end
# err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore
# print(err)
# sentry_sdk.capture_exception(e)
# return err

def format_for_json_mqr(self, found_docs) -> List[Dict]:
"""
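
For reference, step 3 above relies on self.reciprocal_rank_fusion, whose body is outside this diff. Below is a minimal sketch of standard reciprocal rank fusion (each document accumulates 1 / (k + rank) across the ranked lists it appears in), assuming LangChain Document objects keyed by their page_content; the repository's actual method may differ:

from collections import defaultdict

from langchain.schema import Document


def reciprocal_rank_fusion(ranked_lists: list[list[Document]], k: int = 60) -> list[tuple[Document, float]]:
    # Fuse several ranked lists: each doc scores the sum of 1 / (k + rank) over the lists it appears in.
    fused_scores: defaultdict = defaultdict(float)
    docs_by_key: dict = {}
    for docs in ranked_lists:
        for rank, doc in enumerate(docs):
            key = doc.page_content  # assumption: page_content uniquely identifies a chunk
            docs_by_key[key] = doc
            fused_scores[key] += 1.0 / (k + rank)
    # Highest fused score first, matching the `[doc for doc, score in found_docs]` unpacking above.
    return sorted(((docs_by_key[key], score) for key, score in fused_scores.items()),
                  key=lambda pair: pair[1], reverse=True)

This matches how the result is consumed above: a list of (doc, score) pairs ordered best-first.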
84 changes: 46 additions & 38 deletions requirements.txt
@@ -1,59 +1,67 @@
# On Apple Silicon: pip uninstall grpcio -y; conda install grpcio -y
nomic==2.0.14
Flask==3.0.0
flask-cors==4.0.0
gunicorn==21.2.0
protobuf==4.25.0
langchain==0.0.331
langchainhub==0.1.14
click==8.1.7
aiohttp==3.8.6
wheel==0.41.3
click==8.1.7
MarkupSafe==2.1.3
Werkzeug==3.0.1
mkdocstrings[python]==0.23.0
mkdocs-material==9.4.7
itsdangerous==2.1.2
wheel==0.41.3
Flask==3.0.0
gunicorn==21.2.0
tiktoken==0.5.1
Jinja2==3.1.2
python-dotenv==1.0.0
flask-cors==4.0.0
qdrant-client==1.7.3
mkdocs==1.5.3
openai==0.28.1
supabase==2.0.2
SQLAlchemy==2.0.22
boto3==1.28.79
PyMuPDF==1.23.6
tabulate==0.9.0
typing-inspect==0.9.0
typing_extensions==4.8.0
pysrt==1.1.2
docx2txt==0.8
pydub==0.25.1
ffmpeg-python==0.2.0
ffprobe==0.5
ffmpeg==1.4
beautifulsoup4==4.12.2
canvasapi==3.2.0
GitPython==3.1.40

# Utils
tiktoken==0.5.1
python-dotenv==1.0.0
pydantic==1.10.13 # pydantic v1 works better for ray
flask-executor==1.0.0

# AI & core services
nomic==2.0.14
openai==0.28.1
langchain==0.0.331
langchainhub==0.1.14

# Data
boto3==1.28.79
qdrant-client==1.7.3
supabase==2.0.2

# Logging
posthog==3.1.0
sentry-sdk==1.39.1

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl

# removed due to /ingest in Beam
# canvasapi==3.2.0
# GitPython==3.1.40
# pysrt==1.1.2
# docx2txt==0.8
# pydub==0.25.1
# ffmpeg-python==0.2.0
# ffprobe==0.5
# ffmpeg==1.4
# beautifulsoup4==4.12.2
# PyMuPDF==1.23.6
# pytesseract==0.3.10 # image OCR
# openpyxl==3.1.2 # excel
# networkx==3.2.1 # unused part of excel partitioning :(
# python-pptx==0.6.23
# unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4

# pdf packages for unstructured
# pdf2image==1.16.3
# pdfminer.six==20221105
# opencv-python-headless==4.8.1.78
# unstructured.pytesseract==0.3.12
# unstructured-inference==0.7.11 # this is the real large one :(
pytesseract==0.3.10 # image OCR
openpyxl==3.1.2 # excel
networkx==3.2.1 # unused part of excel partitioning :(
python-pptx==0.6.23
unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4
# unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4

# Not currently supporting coursera ingest
# cs-dlp @ git+https://github.com/raffaem/[email protected] # previously called coursera-dl
pydantic==1.10.13 # pydantic v1 works better for ray
posthog==3.1.0
sentry-sdk==1.39.1
# ray==2.8.1
# newrelic==9.3.0
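
As a quick sanity check after slimming the file down, one can compare a few installed versions against the pins. A minimal sketch using only the standard library; the pins dict below copies a handful of entries from requirements.txt above and is not exhaustive:

from importlib.metadata import PackageNotFoundError, version

# A few pins copied from requirements.txt above; extend with the rest as needed.
pins = {"Flask": "3.0.0", "langchain": "0.0.331", "qdrant-client": "1.7.3", "openai": "0.28.1"}
for name, expected in pins.items():
    try:
        installed = version(name)
        status = "OK" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{name}=={expected}: {status}")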
