diff --git a/ai_ta_backend/filtering_contexts.py b/ai_ta_backend/filtering_contexts.py index 8d9d7131..476df3d0 100644 --- a/ai_ta_backend/filtering_contexts.py +++ b/ai_ta_backend/filtering_contexts.py @@ -1,213 +1,196 @@ -import json -import os -import threading -import time -from typing import Optional - -import openai -import ray -import requests -# from langchain import hub -# import replicate -from posthog import Posthog -import sentry_sdk - -# from dotenv import load_dotenv -# load_dotenv(override=True) -# from transformers import AutoTokenizer -# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") - -filter_unrelated_contexts_zephyr = """<|system|> -You are an expert at determining if a passage is relevant and helpful for answering a question. -To be valuable, a passage must have at least some amount of useful and meaningful information with more than a passing mention of the topic. -As part of your thinking process, you first write a few sentences evaluating the utility of the passage, given the question we're trying to answer. Limit yourself to writing only a sentence or two, no more. -Finally, you must submit your final answer by adding two newline characters then "Yes." or "No." or "I don't know.". Provide a single answer only. Providing multiple final results will disqualify you. -Here's a template code snippet of how it should work (with placeholder variables): -``` -Passage: -Question: -Your evaluation of the utility of the passage: - - -Final answer: -<|user|> -Passage: {context} -Question: {user_query} -Your evaluation of the utility of the passage: -<|assistant|>""" - - -@ray.remote -class AsyncActor: - - def filter_context(self, context, user_query, langsmith_prompt_obj): - final_prompt = str(langsmith_prompt_obj.format(context=context, user_query=user_query)) - # print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^") - try: - # completion = run_caii_hosted_llm(final_prompt) - # completion = run_replicate(final_prompt) - completion = run_anyscale(final_prompt) - return {"completion": completion, "context": context} - except Exception as e: - sentry_sdk.capture_exception(e) - print(f"Error: {e}") - - -def run_caii_hosted_llm(prompt, max_tokens=300, temp=0.3, **kwargs): - """ - Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ## - """ - - url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha" - headers = {'Content-Type': 'application/json'} - data = {"prompt": prompt, "max_tokens": max_tokens, "temperature": temp, **kwargs} - - response = None - try: - response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) - return response.json()['choices'][0]['text'] - except Exception as e: - sentry_sdk.capture_exception(e) - # Probably cuda OOM error. - response_content = response.json() if response else "No response" - raise ValueError( - f"🚫🚫🚫 Failed inference attempt. 
Response: {response_content}\nError: {e}\nPromt that caused error: {prompt}" - ) from e - - -def run_replicate(prompt): - output = None - # output = replicate.run("tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526", - # input={ - # "top_k": 50, - # "top_p": 0.95, - # "prompt": prompt, - # "temperature": 0.3, - # "max_new_tokens": 250, - # "presence_penalty": 1 - # }) - print(output) - return output - - -def run_anyscale(prompt, model_name="HuggingFaceH4/zephyr-7b-beta"): - start_time = time.monotonic() - ret = openai.ChatCompletion.create( - api_base="https://api.endpoints.anyscale.com/v1", - api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"], - api_type="openai", - # model="mistralai/Mistral-7B-Instruct-v0.1", - model="HuggingFaceH4/zephyr-7b-beta", - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": prompt - }], - temperature=0.3, - max_tokens=250, - ) - - output = ret["choices"][0]["message"]["content"] # type: ignore - print("Response from Anyscale:", output[:150]) - - # input_length = len(tokenizer.encode(prompt)) - # output_length = len(tokenizer.encode(output)) - # Input tokens {input_length}, output tokens: {output_length}" - print(f"^^^^ one anyscale call Runtime: {(time.monotonic() - start_time):.2f} seconds.") - return output - - -def parse_result(result: str): - lines = result.split('\n') - for line in lines: - if 'Final answer' in line: - return 'yes' in line.lower() - return False - - -def filter_top_contexts(contexts, - user_query: str, - timeout: Optional[float] = None, - max_concurrency: Optional[int] = 180): - - print("⏰⏰⏰ Starting filter_top_contexts() ⏰⏰⏰") - - timeout = timeout or float(os.environ["FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS"]) - # langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr") # TOO UNSTABLE, service offline - langsmith_prompt_obj = filter_unrelated_contexts_zephyr - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - - print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count()) - - max_concurrency = min(100, len(contexts)) - print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency) - print("Num contexts to filter:", len(contexts)) - - # START TASKS - actor = AsyncActor.options(max_concurrency=max_concurrency, num_cpus=0.001).remote() # type: ignore - result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts] - - start_time = time.monotonic() - done_tasks, in_progress = ray.wait(result_futures, - num_returns=len(result_futures), - timeout=timeout, - fetch_local=False) - - print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count()) - # Cleanup - for task in in_progress: - ray.cancel(task) - results = ray.get(done_tasks) - print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count()) - ray.kill(actor) - print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count()) - - best_contexts_to_keep = [ - r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion']) - ] - - print("🧠🧠 TOTAL DOCS PROCESSED BY ANYSCALE FILTERING:", len(results)) - print("🧠🧠 TOTAL DOCS KEPT, AFTER FILTERING:", len(best_contexts_to_keep)) - mqr_runtime = round(time.monotonic() - start_time, 2) - print(f"⏰ Total elapsed time: {mqr_runtime} seconds") - - 
posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts', - properties={ - 'user_query': user_query, - 'course_name': contexts[0].metadata.get('course_name', None), - 'percent_kept': len(best_contexts_to_keep) / max(1, len(results)), - 'total_docs_processed': len(results), - 'total_docs_kept': len(best_contexts_to_keep), - 'MQR_total_runtime_sec': mqr_runtime, - }) - posthog.shutdown() - return best_contexts_to_keep - - -def run_main(): - start_time = time.monotonic() - # final_passage_list = filter_top_contexts(contexts=CONTEXTS * 2, user_query=USER_QUERY) - # print("✅✅✅ TOTAL included in results: ", len(final_passage_list)) - print(f"⏰⏰⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") - # print("Total contexts:", len(CONTEXTS) * 2) - - -# ! CONDA ENV: llm-serving -if __name__ == "__main__": - run_main() +# import json +# import os +# import threading +# import time +# from typing import Optional + +# import openai +# import ray +# import requests +# from posthog import Posthog +# import sentry_sdk + +# filter_unrelated_contexts_zephyr = """<|system|> +# You are an expert at determining if a passage is relevant and helpful for answering a question. +# To be valuable, a passage must have at least some amount of useful and meaningful information with more than a passing mention of the topic. +# As part of your thinking process, you first write a few sentences evaluating the utility of the passage, given the question we're trying to answer. Limit yourself to writing only a sentence or two, no more. +# Finally, you must submit your final answer by adding two newline characters then "Yes." or "No." or "I don't know.". Provide a single answer only. Providing multiple final results will disqualify you. +# Here's a template code snippet of how it should work (with placeholder variables): +# ``` +# Passage: +# Question: +# Your evaluation of the utility of the passage: + +# Final answer: +# <|user|> +# Passage: {context} +# Question: {user_query} +# Your evaluation of the utility of the passage: +# <|assistant|>""" + +# @ray.remote +# class AsyncActor: + +# def filter_context(self, context, user_query, langsmith_prompt_obj): +# final_prompt = str(langsmith_prompt_obj.format(context=context, user_query=user_query)) +# # print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^") +# try: +# # completion = run_caii_hosted_llm(final_prompt) +# # completion = run_replicate(final_prompt) +# completion = run_anyscale(final_prompt) +# return {"completion": completion, "context": context} +# except Exception as e: +# sentry_sdk.capture_exception(e) +# print(f"Error: {e}") + +# def run_caii_hosted_llm(prompt, max_tokens=300, temp=0.3, **kwargs): +# """ +# Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ## +# """ + +# url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha" +# headers = {'Content-Type': 'application/json'} +# data = {"prompt": prompt, "max_tokens": max_tokens, "temperature": temp, **kwargs} + +# response = None +# try: +# response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) +# return response.json()['choices'][0]['text'] +# except Exception as e: +# sentry_sdk.capture_exception(e) +# # Probably cuda OOM error. +# response_content = response.json() if response else "No response" +# raise ValueError( +# f"🚫🚫🚫 Failed inference attempt. 
Response: {response_content}\nError: {e}\nPromt that caused error: {prompt}" +# ) from e + +# def run_replicate(prompt): +# output = None +# # output = replicate.run("tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526", +# # input={ +# # "top_k": 50, +# # "top_p": 0.95, +# # "prompt": prompt, +# # "temperature": 0.3, +# # "max_new_tokens": 250, +# # "presence_penalty": 1 +# # }) +# print(output) +# return output + +# def run_anyscale(prompt, model_name="HuggingFaceH4/zephyr-7b-beta"): +# start_time = time.monotonic() +# ret = openai.ChatCompletion.create( +# api_base="https://api.endpoints.anyscale.com/v1", +# api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"], +# api_type="openai", +# # model="mistralai/Mistral-7B-Instruct-v0.1", +# model="HuggingFaceH4/zephyr-7b-beta", +# messages=[{ +# "role": "system", +# "content": "You are a helpful assistant." +# }, { +# "role": "user", +# "content": prompt +# }], +# temperature=0.3, +# max_tokens=250, +# ) + +# output = ret["choices"][0]["message"]["content"] # type: ignore +# print("Response from Anyscale:", output[:150]) + +# # input_length = len(tokenizer.encode(prompt)) +# # output_length = len(tokenizer.encode(output)) +# # Input tokens {input_length}, output tokens: {output_length}" +# print(f"^^^^ one anyscale call Runtime: {(time.monotonic() - start_time):.2f} seconds.") +# return output + +# def parse_result(result: str): +# lines = result.split('\n') +# for line in lines: +# if 'Final answer' in line: +# return 'yes' in line.lower() +# return False + +# def filter_top_contexts(contexts, +# user_query: str, +# timeout: Optional[float] = None, +# max_concurrency: Optional[int] = 180): + +# print("⏰⏰⏰ Starting filter_top_contexts() ⏰⏰⏰") + +# timeout = timeout or float(os.environ["FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS"]) +# # langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr") # TOO UNSTABLE, service offline +# langsmith_prompt_obj = filter_unrelated_contexts_zephyr +# posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') + +# print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count()) + +# max_concurrency = min(100, len(contexts)) +# print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency) +# print("Num contexts to filter:", len(contexts)) + +# # START TASKS +# actor = AsyncActor.options(max_concurrency=max_concurrency, num_cpus=0.001).remote() # type: ignore +# result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts] + +# start_time = time.monotonic() +# done_tasks, in_progress = ray.wait(result_futures, +# num_returns=len(result_futures), +# timeout=timeout, +# fetch_local=False) + +# print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count()) +# # Cleanup +# for task in in_progress: +# ray.cancel(task) +# results = ray.get(done_tasks) +# print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count()) +# ray.kill(actor) +# print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count()) + +# best_contexts_to_keep = [ +# r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion']) +# ] + +# print("🧠🧠 TOTAL DOCS PROCESSED BY ANYSCALE FILTERING:", len(results)) +# print("🧠🧠 TOTAL DOCS KEPT, AFTER FILTERING:", len(best_contexts_to_keep)) +# mqr_runtime = round(time.monotonic() - 
start_time, 2) +# print(f"⏰ Total elapsed time: {mqr_runtime} seconds") + +# posthog.capture('distinct_id_of_the_user', +# event='filter_top_contexts', +# properties={ +# 'user_query': user_query, +# 'course_name': contexts[0].metadata.get('course_name', None), +# 'percent_kept': len(best_contexts_to_keep) / max(1, len(results)), +# 'total_docs_processed': len(results), +# 'total_docs_kept': len(best_contexts_to_keep), +# 'MQR_total_runtime_sec': mqr_runtime, +# }) +# posthog.shutdown() +# return best_contexts_to_keep + +# def run_main(): +# start_time = time.monotonic() +# # final_passage_list = filter_top_contexts(contexts=CONTEXTS * 2, user_query=USER_QUERY) +# # print("✅✅✅ TOTAL included in results: ", len(final_passage_list)) +# print(f"⏰⏰⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") +# # print("Total contexts:", len(CONTEXTS) * 2) + +# # ! CONDA ENV: llm-serving +# if __name__ == "__main__": +# run_main() diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index dae7ef0e..3c87f04b 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -6,7 +6,6 @@ import requests from threading import Thread - from dotenv import load_dotenv from flask import ( Flask, @@ -21,7 +20,7 @@ from flask_cors import CORS from flask_executor import Executor from posthog import Posthog -import ray +# import ray import sentry_sdk from ai_ta_backend.canvas import CanvasAPI @@ -50,7 +49,7 @@ # load API keys from globally-availabe .env file load_dotenv() -ray.init() +# ray.init() print("NUM ACTIVE THREADS (top of main):", threading.active_count()) @@ -561,6 +560,7 @@ def nomic_map(): response.headers.add('Access-Control-Allow-Origin', '*') return response + @app.route('/createDocumentMap', methods=['GET']) def createDocumentMap(): course_name: str = request.args.get('course_name', default='', type=str) @@ -620,13 +620,15 @@ def export_convo_history(): response.headers.add('Access-Control-Allow-Origin', '*') else: - response = make_response(send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) + response = make_response( + send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) response.headers.add('Access-Control-Allow-Origin', '*') response.headers["Content-Disposition"] = f"attachment; filename={export_status['response'][1]}" os.remove(export_status['response'][0]) - + return response + @app.route('/exportDocuments', methods=['GET']) def exportDocuments(): course_name: str = request.args.get('course_name', default='', type=str) @@ -649,11 +651,12 @@ def exportDocuments(): response.headers.add('Access-Control-Allow-Origin', '*') else: - response = make_response(send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) + response = make_response( + send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) response.headers.add('Access-Control-Allow-Origin', '*') response.headers["Content-Disposition"] = f"attachment; filename={export_status['response'][1]}" os.remove(export_status['response'][0]) - + return response diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index daab6c6e..739d5b16 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -42,13 +42,12 @@ from pydub import AudioSegment from qdrant_client import QdrantClient, models from qdrant_client.models import PointStruct -from langchain.schema.output_parser import StrOutputParser from 
ai_ta_backend.aws import upload_data_files_to_s3 from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor from ai_ta_backend.utils_tokenization import count_tokens_and_cost -from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding -from ai_ta_backend.filtering_contexts import filter_top_contexts +# from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding +# from ai_ta_backend.filtering_contexts import filter_top_contexts from ai_ta_backend.nomic_logging import log_to_document_map, delete_from_document_map MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation") @@ -1356,85 +1355,87 @@ def getTopContextsWithMQR(self, 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. """ - try: - top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS - start_time_overall = time.monotonic() - mq_start_time = time.monotonic() - - # 1. GENERATE MULTIPLE QUERIES - generate_queries = ( - MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | - (lambda x: list(filter(None, x))) # filter out non-empty strings - ) - - generated_queries = generate_queries.invoke({"original_query": search_query}) - print("generated_queries", generated_queries) - - # 2. VECTOR SEARCH FOR EACH QUERY - batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, - course_name=course_name, - top_n=top_n_per_query) - - # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. - found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) - found_docs = [doc for doc, score in found_docs] - print(f"Num docs after re-ranking: {len(found_docs)}") - if len(found_docs) == 0: - return [] - print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") - - # 4. FILTER DOCS - filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) - if len(filtered_docs) == 0: - return [] - - # 5. TOP DOC CONTEXT PADDING // parent document retriever - final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) - print(f"Number of final docs after context padding: {len(final_docs)}") - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - - valid_docs = [] - num_tokens = 0 - for doc in final_docs: - doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(doc) - else: - # filled our token size, time to return - break - - print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") - print(f"Course: {course_name} ||| search_query: {search_query}") - print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") - - if len(valid_docs) == 0: - return [] - - self.posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts_succeeded', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - 'total_tokens_used': token_counter, - 'total_contexts_used': len(valid_docs), - 'total_unique_docs_retrieved': len(found_docs), - }) - - return self.format_for_json_mqr(valid_docs) - except Exception as e: - # return full traceback to front end - err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err + return 'fail' + + # try: + # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS + # start_time_overall = time.monotonic() + # mq_start_time = time.monotonic() + + # # 1. GENERATE MULTIPLE QUERIES + # generate_queries = ( + # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | + # (lambda x: list(filter(None, x))) # filter out non-empty strings + # ) + + # generated_queries = generate_queries.invoke({"original_query": search_query}) + # print("generated_queries", generated_queries) + + # # 2. VECTOR SEARCH FOR EACH QUERY + # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, + # course_name=course_name, + # top_n=top_n_per_query) + + # # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. + # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) + # found_docs = [doc for doc, score in found_docs] + # print(f"Num docs after re-ranking: {len(found_docs)}") + # if len(found_docs) == 0: + # return [] + # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") + + # # 4. FILTER DOCS + # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) + # if len(filtered_docs) == 0: + # return [] + + # # 5. TOP DOC CONTEXT PADDING // parent document retriever + # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) + # print(f"Number of final docs after context padding: {len(final_docs)}") + + # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. 
It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + + # search_query) # type: ignore + + # valid_docs = [] + # num_tokens = 0 + # for doc in final_docs: + # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" + # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") + # if token_counter + num_tokens <= token_limit: + # token_counter += num_tokens + # valid_docs.append(doc) + # else: + # # filled our token size, time to return + # break + + # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") + # print(f"Course: {course_name} ||| search_query: {search_query}") + # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") + + # if len(valid_docs) == 0: + # return [] + + # self.posthog.capture('distinct_id_of_the_user', + # event='filter_top_contexts_succeeded', + # properties={ + # 'user_query': search_query, + # 'course_name': course_name, + # 'token_limit': token_limit, + # 'total_tokens_used': token_counter, + # 'total_contexts_used': len(valid_docs), + # 'total_unique_docs_retrieved': len(found_docs), + # }) + + # return self.format_for_json_mqr(valid_docs) + # except Exception as e: + # # return full traceback to front end + # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + # print(err) + # sentry_sdk.capture_exception(e) + # return err def format_for_json_mqr(self, found_docs) -> List[Dict]: """ diff --git a/requirements.txt b/requirements.txt index acc2eed2..f4503824 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: ht # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl pydantic==1.10.13 # pydantic v1 works better for ray -ray==2.8.1 posthog==3.1.0 sentry-sdk==1.39.1 +# ray==2.8.1 # newrelic==9.3.0 \ No newline at end of file diff --git a/run.sh b/run.sh index 20a23b02..0d77691a 100755 --- a/run.sh +++ b/run.sh @@ -3,6 +3,6 @@ # Docs https://docs.gunicorn.org/en/stable/settings.html#workers # 200 MB object store memory.. necessary to statically allocate or will crash in Railway env restrictions. -ray start --head --num-cpus 6 --object-store-memory 300000000 +# ray start --head --num-cpus 6 --object-store-memory 300000000 export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend exec gunicorn --workers=6 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800
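Note on the change above: the patch disables the Ray-based context filtering rather than replacing it (`ray.init()`, `ray start`, and `ray==2.8.1` are removed, `filter_top_contexts` is commented out, and `getTopContextsWithMQR` now short-circuits with `'fail'`). For orientation only, here is a minimal, hypothetical sketch of how the same fan-out could be done with just the standard library; `run_llm` stands in for something like the commented-out `run_anyscale()`, `prompt_template` is the zephyr filter prompt with `{context}` / `{user_query}` placeholders, and none of this code is part of the PR.
```
# Hypothetical, stdlib-only stand-in for the Ray actor fan-out (not in this PR).
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import TimeoutError as FuturesTimeout
from typing import Callable, List, Optional


def parse_result(result: str) -> bool:
  # Same keep/drop rule as the commented-out helper: only the "Final answer"
  # line matters, and only an explicit "yes" keeps the passage.
  for line in result.split('\n'):
    if 'Final answer' in line:
      return 'yes' in line.lower()
  return False


def filter_contexts_without_ray(contexts: List[str],
                                user_query: str,
                                run_llm: Callable[[str], str],
                                prompt_template: str,
                                timeout: Optional[float] = 30.0,
                                max_workers: int = 100) -> List[str]:
  kept: List[str] = []
  if not contexts:
    return kept
  # Mirror the original concurrency cap: min(100, len(contexts)) in-flight calls.
  pool = ThreadPoolExecutor(max_workers=min(max_workers, len(contexts)))
  future_to_context = {
      pool.submit(run_llm, prompt_template.format(context=c, user_query=user_query)): c
      for c in contexts
  }
  try:
    for future in as_completed(future_to_context, timeout=timeout):
      try:
        completion = future.result()
      except Exception:
        # The Ray actor swallowed per-call errors; do the same here.
        continue
      if completion and parse_result(completion):
        kept.append(future_to_context[future])
  except FuturesTimeout:
    # Deadline hit: keep whatever finished, drop the rest (akin to ray.cancel()).
    pass
  finally:
    # Python 3.9+: don't wait for, or start, any still-pending calls.
    pool.shutdown(wait=False, cancel_futures=True)
  return kept
```
With the original numbers (timeout from `FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS`, concurrency capped at `min(100, len(contexts))`), something like this could slot in where `filter_top_contexts()` is now stubbed out, without the `ray.init()` / `ray start` overhead the rest of the diff removes.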