From 00834df39a2a3a5ba5038213ae9ff8e216124401 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Thu, 14 Dec 2023 14:51:33 -0800
Subject: [PATCH] Working implementation. Using ray, tested end to end locally

---
 ...ssing.py => context_parent_doc_padding.py} |   8 +-
 ai_ta_backend/filtering_contexts.py           | 347 ++++++------------
 ai_ta_backend/vector_database.py              |  81 ++--
 run.sh                                        |   2 +
 4 files changed, 148 insertions(+), 290 deletions(-)
 rename ai_ta_backend/{parallel_context_processing.py => context_parent_doc_padding.py} (97%)

diff --git a/ai_ta_backend/parallel_context_processing.py b/ai_ta_backend/context_parent_doc_padding.py
similarity index 97%
rename from ai_ta_backend/parallel_context_processing.py
rename to ai_ta_backend/context_parent_doc_padding.py
index e1367da1..e4047a8e 100644
--- a/ai_ta_backend/parallel_context_processing.py
+++ b/ai_ta_backend/context_parent_doc_padding.py
@@ -1,16 +1,16 @@
 import os
-import supabase
-import pandas as pd
 import time
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from multiprocessing import Manager
 
+import supabase
+
 DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']
-SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])
+SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])  # type: ignore
 
 
-def context_processing(found_docs, search_query, course_name):
+def context_parent_doc_padding(found_docs, search_query, course_name):
   """
   Takes top N contexts acquired from Qdrant similarity search and pads them
   """
diff --git a/ai_ta_backend/filtering_contexts.py b/ai_ta_backend/filtering_contexts.py
index 9d53e3db..9566fb60 100644
--- a/ai_ta_backend/filtering_contexts.py
+++ b/ai_ta_backend/filtering_contexts.py
@@ -1,275 +1,144 @@
-# Env for kastan:
-
-import inspect
 import json
 import os
 import time
-import traceback
 
+# from dotenv import load_dotenv
 import openai
 import ray
 import replicate
 import requests
-from dotenv import load_dotenv
 from langchain import hub
-from langchain.prompts import PromptTemplate
-#from openai import OpenAI
-
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
-from functools import partial
-from multiprocessing import Pool, Manager
+from transformers import AutoTokenizer
 
-from ai_ta_backend.utils_tokenization import count_tokens_and_cost
+# load_dotenv(override=True)
+# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
 
-load_dotenv(override=True)
-LANGSMITH_PROMPT_OBJ = hub.pull("kastanday/filter-unrelated-contexts-zephyr")
 
+@ray.remote
+class AsyncActor:
 
-## Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ##
 
+  def filter_context(self, context, user_query, langsmith_prompt_obj):
+    final_prompt = str(langsmith_prompt_obj.format(context=context, user_query=user_query))
+    # print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^")
+    try:
+      # completion = run_caii_hosted_llm(final_prompt)
+      # completion = run_replicate(final_prompt)
+      completion = run_anyscale(final_prompt)
+      return {"completion": completion, "context": context}
+    except Exception as e:
+      print(f"Error: {e}")
 
-def list_context_filtering(contexts, user_query, max_time_before_return=45, max_concurrency=100):
-  """
-  Main function for filtering contexts. Use this when dealing with a List[Dicts].
-  To be called after context_padding - in getTopContextsWithMQR(). It is also used with batch_context_filtering.
-  This function multi-processes a list of contexts.
-  Args: contexts (list of dicts), user_query (str), max_time_before_return (int), max_concurrency (int)
-  Returns: filtered_contexts (list of dicts)
+def run_caii_hosted_llm(prompt, max_tokens=300, temp=0.3, **kwargs):
   """
-
-  start_time = time.monotonic()
-
-  # call filter contexts function
-  with Manager() as manager:
-    filtered_contexts = manager.list()
-    partial_func1 = partial(anyscale_completion, user_query=user_query, langsmith_prompt_obj=LANGSMITH_PROMPT_OBJ)
-    partial_func2 = partial(select_context, result=filtered_contexts)
-
-    with ProcessPoolExecutor(max_workers=30) as executor:
-      anyscale_responses = list(executor.map(partial_func1, contexts))
-      if len(anyscale_responses) > 0:
-        executor.map(partial_func2, anyscale_responses)
-      else:
-        print("LLM responses are empty.")
-      executor.shutdown()
-
-    filtered_contexts = list(filtered_contexts)
-  print(f"ā° Context filtering runtime: {(time.monotonic() - start_time):.2f} seconds")
-
-  print("len of filtered contexts: ", len(filtered_contexts))
-  return filtered_contexts
-
-def batch_context_filtering(batch_docs, user_query, max_time_before_return=45, max_concurrency=100):
+  Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ##
   """
-  Main function for filtering contexts. Use this when dealing with List[List[Docs]]. To be called between
-  batch_vector_search() and reciprocal_ranking().
-  This function multi-processes a list of list of contexts.
-  Args: batch_docs (list of list of docs), user_query (str), max_time_before_return (int), max_concurrency (int)
-  Returns: filtered_contexts (list of list of docs)
-  """
-
+  url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha"
+  headers = {'Content-Type': 'application/json'}
+  data = {"prompt": prompt, "max_tokens": max_tokens, "temperature": temp, **kwargs}
+
+  try:
+    response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180)
+    return response.json()['choices'][0]['text']
+  except Exception as e:
+    # Probably cuda OOM error.
+    raise ValueError(
+        f"šŸš«šŸš«šŸš« Failed inference attempt. Response: {response.json()}\nError: {e}\nPrompt that caused error: {prompt}")
+
+
+def run_replicate(prompt):
+  output = replicate.run("tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526",
+                         input={
+                             "top_k": 50,
+                             "top_p": 0.95,
+                             "prompt": prompt,
+                             "temperature": 0.3,
+                             "max_new_tokens": 250,
+                             "presence_penalty": 1
+                         })
+  print(output)
+  return output
+
+
+def run_anyscale(prompt, model_name="HuggingFaceH4/zephyr-7b-beta"):
   start_time = time.monotonic()
-
-  partial_func = partial(list_context_filtering, user_query=user_query, max_time_before_return=max_time_before_return, max_concurrency=max_concurrency)
-  with ProcessPoolExecutor(max_workers=5) as executor:
-    processed_docs = list(executor.map(partial_func, batch_docs))
-
-  processed_docs = list(processed_docs)
-  print(f"ā° Batch context filtering runtime: {(time.monotonic() - start_time):.2f} seconds")
-
-  return processed_docs
-
-
-def anyscale_completion(context, user_query, langsmith_prompt_obj):
-  """
-  Runs the Anyscale completion API call.
-  Args: context (dict), user_query (str), langsmith_prompt_obj (PromptTemplate)
-  Returns: completion_object (dict)
-  """
-  api_start_time = time.monotonic()
-  # use first final_prompt when using batch_context_filtering as start point and second when using list_context_filtering as start point
-  final_prompt = str(langsmith_prompt_obj.format(context=context.page_content, user_query=user_query))
-  #final_prompt = str(langsmith_prompt_obj.format(context=context['text'], user_query=user_query))
-  try:
-    ret = openai.ChatCompletion.create(
-        api_base = "https://api.endpoints.anyscale.com/v1",
-        api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"],
-        model = "HuggingFaceH4/zephyr-7b-beta",
-        messages=[{"role": "system", "content": "You are a helpful assistant."},
-                  {"role": "user", "content": final_prompt}],
-        temperature=0.3,
-        max_tokens=250,
-    )
-    completion = ret["choices"][0]["message"]["content"]
-
-    print(f"ā° Anyscale runtime: {(time.monotonic() - api_start_time):.2f} seconds")
-    return {"completion": completion, "context": context}
-  except Exception as e:
-    print(f"Error: {e}")
-
-def select_context(completion_object, result):
-  """
-  Uses parse_result() to determine if the context should be passed to the frontend.
-  Args: completion_object (dict), result (list of dicts)
-  Returns: None
-  """
-  if parse_result(completion_object['completion']):
-    result.append(completion_object['context'])
-
-def parse_result(result):
-  """
-  Parses the result of the LLM completion API call.
-  Args: result (str) -- the completion part of Anyscale response
-  """
+  ret = openai.ChatCompletion.create(
+      api_base="https://api.endpoints.anyscale.com/v1",
+      api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"],
+      api_type="openai",
+      # model="mistralai/Mistral-7B-Instruct-v0.1",
+      model="HuggingFaceH4/zephyr-7b-beta",
+      messages=[{
+          "role": "system",
+          "content": "You are a helpful assistant."
+      }, {
+          "role": "user",
+          "content": prompt
+      }],
+      temperature=0.3,
+      max_tokens=250,
+  )
+
+  output = ret["choices"][0]["message"]["content"]
+  print("Response from Anyscale:", output[:150])
+
+  # input_length = len(tokenizer.encode(prompt))
+  # output_length = len(tokenizer.encode(output))
+  # Input tokens {input_length}, output tokens: {output_length}"
+  print(f"^^^^ one anyscale call Runtime: {(time.monotonic() - start_time):.2f} seconds.")
+  return output
+
+
+def parse_result(result: str):
   lines = result.split('\n')
   for line in lines:
     if 'Final answer' in line:
       return 'yes' in line.lower()
-  return False
-
-
-#----------------------- RAY CODE BELOW ----------------------------------------------------------------------------#
+  return False
 
-# @ray.remote
-# class AsyncActor:
-#   def __init__(self):
-#     pass
 
-#   def filter_context(self, context, user_query, langsmith_prompt_obj):
-#     final_prompt = str(langsmith_prompt_obj.format(context=context['text'], user_query=user_query))
-#     #print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^")
-#     try:
-#       # completion = run_model(final_prompt)
-#       #completion = run_replicate(final_prompt)
-#       completion = run_anyscale(final_prompt)
-
-#       return {"completion": completion, "context": context}
-#     except Exception as e:
-#       print(f"Error: {e}")
 
+def filter_top_contexts(contexts, user_query: str, timeout: float = None, max_concurrency: int = 180):
+
+  print("ā°ā°ā° Starting filter_top_contexts() ā°ā°ā°")
+  # print(len(contexts))
+  # print(contexts)
+  # raise ValueError("STOPPING HERE")
+
+  timeout = timeout or float(os.environ["FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS"])
+  langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr")
+
+  print("Max concurrency:", max_concurrency)
+  print("Num contexts to filter:", len(contexts))
+
+  # START TASKS
+  actor = AsyncActor.options(max_concurrency=max_concurrency, num_cpus=0.001).remote()
+  result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts]
+
+  start_time = time.time()
+  done_tasks, in_progress = ray.wait(result_futures,
+                                     num_returns=len(result_futures),
+                                     timeout=timeout,
+                                     fetch_local=False)
+  for task in in_progress:
+    ray.cancel(task)
+  results = ray.get(done_tasks)
+  print("šŸ§ šŸ§  TOTAL RETURNS FROM ANYSCALE:", len(results))
+  print(f"ā° Total elapsed time: {(time.time() - start_time):.2f} seconds")
+
+  best_contexts_to_keep = [r['context'] for r in results if parse_result(r['completion'])]
+  return best_contexts_to_keep
 
-# def run_model(prompt, max_tokens=300, temp=0.3, **kwargs):
-#   '''
-#   Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ##
-#   '''
 
-#   url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha"
-#   headers = {
-#       'Content-Type': 'application/json'
-#   }
-#   data = {
-#       "prompt": prompt,
-#       "max_tokens": max_tokens,
-#       "temperature": temp,
-#       **kwargs
-#   }
 
-#   try:
-#     response = requests.post(url, headers=headers, data=json.dumps(data))
-#     return response.json()['choices'][0]['text']
-#   except Exception as e:
-#     # Probably cuda OOM error.
-#     raise ValueError(f"šŸš«šŸš«šŸš« Failed inference attempt. Response: {response.json()}\nError: {e}\nPromt that caused error: {prompt}")
 
-# def run_replicate(prompt):
-#   output = replicate.run(
-#       "tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526",
-#       input={
-#           "top_k": 50,
-#           "top_p": 0.95,
-#           "prompt": prompt,
-#           "temperature": 0.3,
-#           "max_new_tokens": 250,
-#           "presence_penalty": 1
-#       }
-#   )
-#   print(output)
-#   return output
 
-# def run_anyscale(prompt):
-#   api_start_time = time.monotonic()
-#   ret = openai.ChatCompletion.create(
-#       api_base = "https://api.endpoints.anyscale.com/v1",
-#       api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"],
-#       # model="meta-llama/Llama-2-70b-chat-hf",
-#       #model="mistralai/Mistral-7B-Instruct-v0.1",
-#       model = "HuggingFaceH4/zephyr-7b-beta",
-#       messages=[{"role": "system", "content": "You are a helpful assistant."},
-#                 {"role": "user", "content": prompt}],
-#       temperature=0.3,
-#       max_tokens=250,
-#   )
-#   print(f"ā° Anyscale runtime: {(time.monotonic() - api_start_time):.2f} seconds")
-#   return ret["choices"][0]["message"]["content"]
 
-# def parse_result(result):
-#   lines = result.split('\n')
-#   for line in lines:
-#     if 'Final answer' in line:
-#       return 'yes' in line.lower()
-#   return False
 
-# def ray_context_filtering(contexts, user_query, max_tokens_to_return=3000, max_time_before_return=None, max_concurrency=100):
-#
-#   # Main function for filtering contexts using RAY. Use this when dealing with a list of contexts.
-#
-#
-#   langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr")
 
-#   print("Num jobs to run:", len(contexts))
-#   actor = AsyncActor.options(max_concurrency=max_concurrency).remote()
-#   result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts]
-#   print("Num futures:", len(result_futures))
-#   #print("Result futures:", result_futures)
 
-#   start_time = time.time()
-#   for i in range(0, len(result_futures)):
-#     try:
-#       ready, not_ready = ray.wait(result_futures)
-#       result = ray.get(ready[0])
 
-#       if result is None:
-#         print("RESULT WAS NONE, llm inference probably failed")
-#         continue
 
-#       if parse_result(result['completion']):
-#         yield result['context']
 
-#       elapsed_time = (time.time() - start_time)
-#       avg_task_time = elapsed_time / (i+1)
-#       estimated_total_runtime = avg_task_time * len(contexts)
 
-#       print(f"šŸ“Œ Completed {i+1} of {len(contexts)}")
-#       print(f"ā° Running total of elapsed time: {elapsed_time:.2f} seconds\nšŸ”® Estimated total runtime: {estimated_total_runtime:.2f} seconds.\n")
-#       print(f"ā°šŸ‘» avg_task_time (s): {avg_task_time:.2f}")
-#       # print(f"šŸ“œ Passage: {result['context']['text']}")
-#       # print(f"āœ… Result: {result['completion']}")
 
-#       if max_time_before_return is not None and elapsed_time >= max_time_before_return:
-#         break
 
-#     except Exception as e:
-#       print("-----------āŒāŒāŒāŒ------------START OF ERROR-----------āŒāŒāŒāŒ------------")
-#       print(f"Error in {inspect.currentframe().f_code.co_name}: {e}") # print function name in error.
-#       print(f"Traceback:")
-#       print(traceback.print_exc())
-#     finally:
-#       result_futures = not_ready
-#       if not result_futures:
-#         break
 
+def run_main():
+  start_time = time.monotonic()
+  final_passage_list = filter_top_contexts(contexts=CONTEXTS * 2, user_query=USER_QUERY)
+
+  print("āœ…āœ…āœ… TOTAL included in results: ", len(final_passage_list))
+  print(f"ā°ā°ā° Runtime: {(time.monotonic() - start_time):.2f} seconds")
+  # print("Total contexts:", len(CONTEXTS) * 2)
 
-# # ! CONDA ENV: llm-serving
-# if __name__ == "__main__":
-#   #ray.init()
-#   start_time = time.monotonic()
-#   # print(len(CONTEXTS))
 
-#   final_passage_list = list(ray_context_filtering(contexts=CONTEXTS*2, user_query=USER_QUERY, max_time_before_return=45, max_concurrency=20))
-#   print("āœ…āœ…āœ… FINAL RESULTS: \n" + '\n'.join(json.dumps(r, indent=2) for r in final_passage_list))
-#   print("āœ…āœ…āœ… TOTAL RETURNED: ", len(final_passage_list))
-#   print(f"ā°ā°ā° Runtime: {(time.monotonic() - start_time):.2f} seconds")
\ No newline at end of file
+# ! CONDA ENV: llm-serving
+if __name__ == "__main__":
+  run_main()
diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py
index a26fad8e..354ff602 100644
--- a/ai_ta_backend/vector_database.py
+++ b/ai_ta_backend/vector_database.py
@@ -45,10 +45,10 @@
 from ai_ta_backend.aws import upload_data_files_to_s3
 from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor
 from ai_ta_backend.utils_tokenization import count_tokens_and_cost
-from ai_ta_backend.parallel_context_processing import context_processing
+from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding
 #from ai_ta_backend.filtering_contexts import ray_context_filtering
 #from ai_ta_backend.filtering_contexts import run_context_filtering
-from ai_ta_backend.filtering_contexts import batch_context_filtering
+from ai_ta_backend.filtering_contexts import filter_top_contexts
 
 MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation")
 OPENAI_API_TYPE = "azure"  # "openai" or "azure"
@@ -1005,6 +1005,8 @@ def batch_vector_search(self, search_queries: List[str], course_name: str, top_n
     """
     Perform a similarity search for all the generated queries at once.
     """
+    start_time = time.monotonic()
+
     from qdrant_client.http import models as rest
     o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)
     # Prepare the filter for the course name
@@ -1021,13 +1023,17 @@
     for query in search_queries:
       user_query_embedding = o.embed_query(query)
       search_requests.append(
-          rest.SearchRequest(vector=user_query_embedding, filter=myfilter, limit=top_n, with_payload=True)
+          rest.SearchRequest(vector=user_query_embedding, filter=myfilter, limit=top_n, with_payload=True, params=models.SearchParams(
+              quantization=models.QuantizationSearchParams(
+                  rescore=False
+              )
+          ))
       )
 
     # Perform the batch search
     search_results = self.qdrant_client.search_batch(
         collection_name=os.environ['QDRANT_COLLECTION_NAME'],
-        requests=search_requests
+        requests=search_requests,
     )
     # process search results
     found_docs: list[list[Document]] = []
@@ -1046,6 +1052,8 @@
       except Exception as e:
         print(traceback.print_exc())
       found_docs.append(docs)
+
+    print(f"ā° Qdrant Batch Search runtime: {(time.monotonic() - start_time):.2f} seconds")
     return found_docs
@@ -1152,17 +1160,15 @@
     1. Generate multiple queries based on the input search query.
     2. Retrieve relevant docs for each query.
     3. Filter the relevant docs based on the user query and pass them to the rank fusion step.
-    4. Rank the docs based on the relevance score.
-    5. Pad the top 5 docs with context from the original document.
+    4. [CANCELED BECAUSE POINTLESS] Rank the docs based on the relevance score.
+    5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document.
     """
     try:
-      top_n = 80 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
+      top_n_per_query = 40  # HARD CODE TO ENSURE WE HIT THE MAX TOKENS
       start_time_overall = time.monotonic()
-
-      # Vector search with ONLY original query
-      # found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name)
       mq_start_time = time.monotonic()
-      # Multi query retriever
+
+      # 1. GENERATE MULTIPLE QUERIES
       generate_queries = (
           MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) |
          (lambda x: list(filter(None, x)))  # filter out non-empty strings
       )
 
       generated_queries = generate_queries.invoke({"original_query": search_query})
       print("generated_queries", generated_queries)
-      batch_found_docs: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, course_name=course_name)
 
+      # 2. VECTOR SEARCH FOR EACH QUERY
+      batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, course_name=course_name, top_n=top_n_per_query)
+      # batch_found_docs: list[Document] = [doc for sublist in batch_found_docs_nested for doc in sublist]
+      total_docs_retrieved = sum([len(docs) for docs in batch_found_docs_nested])
 
-      # use the below filtering code for batch context filtering - List[List[Document]] (only use between batch search and rank fusion)
-      filtered_docs = batch_context_filtering(batch_docs=batch_found_docs, user_query=search_query, max_time_before_return=45, max_concurrency=100)
-
-      filtered_count = 0
-      for docs in filtered_docs:
-        filtered_count += len(docs)
-      print(f"Number of individual docs after context filtering: {filtered_count}")
-      # if filtered docs are between 0 to 5 (very less), use the pre-filter batch_found_docs
-      if 0 <= filtered_count <= 5:
-        found_docs = self.reciprocal_rank_fusion(batch_found_docs)
-      else:
-        found_docs = self.reciprocal_rank_fusion(filtered_docs)
-
+      # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end.
+      found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested)
       found_docs = [doc for doc, score in found_docs]
-      print(f"Number of docs found with MQR after rank fusion: {len(found_docs)}")
+      print(f"Num docs after re-ranking: {len(found_docs)}")
       if len(found_docs) == 0:
-        return []
-
+        return []
       print(f"ā° Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds")
 
-      # 'context padding' // 'parent document retriever'
-      final_docs = context_processing(found_docs, search_query, course_name)
+      # 4. FILTER DOCS
+      filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180)
+      print("Num docs after filtering: ", len(filtered_docs))
+      if len(filtered_docs) == 0:
+        return []
+
+      # 5. TOP DOC CONTEXT PADDING // parent document retriever
+      final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name)
       print(f"Number of final docs after context padding: {len(final_docs)}")
 
       pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
-
      # count tokens at start and end, then also count each context.
      token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query)  # type: ignore
 
-      # use the below commented code for ray-based context filtering or List[dict] filtering
-
-      #filtered_docs = list_context_filtering(contexts=final_docs, user_query=search_query, max_time_before_return=45, max_concurrency=100)
-      #filtered_docs = list(run(contexts=final_docs, user_query=search_query, max_time_before_return=45, max_concurrency=100))
-      # print(f"Number of docs after context filtering: {len(filtered_docs)}")
-      # if 0 <= len(filtered_docs) <= 5:
-      #   final_docs_used = final_docs
-      #   print("No docs passed context filtering, using all docs retrieved.")
-      # else:
-      #   final_docs_used = filtered_docs
-
      valid_docs = []
      num_tokens = 0
-
      for doc in final_docs:
-
        doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n"
        num_tokens, prompt_cost = count_tokens_and_cost(doc_string)  # type: ignore
 
@@ -1232,9 +1221,7 @@ def getTopContextsWithMQR(self, search_query: str, course_name: str, token_limit
          # filled our token size, time to return
          break
 
-      print("Length of valid docs: ", len(valid_docs))
-
-      print(f"Total tokens used: {token_counter} total docs: {len(found_docs)} num docs used: {len(valid_docs)}")
+      print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total docs {total_docs_retrieved}.")
      print(f"Course: {course_name} ||| search_query: {search_query}")
      print(f"ā° ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds")
 
diff --git a/run.sh b/run.sh
index b925376d..df0148ec 100755
--- a/run.sh
+++ b/run.sh
@@ -2,5 +2,7 @@
 
 # Docs https://docs.gunicorn.org/en/stable/settings.html#workers
 
+# 200 MB object store memory. Necessary to statically allocate it, or Ray will crash under Railway's memory restrictions.
+ray start --head --num_cpus 8 --object-store-memory 200000000
 export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend
 exec gunicorn --workers=3 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800
\ No newline at end of file
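A minimal usage sketch of how the pieces introduced by this patch compose, assuming Ray is already running (as started in run.sh) and that FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS plus the Supabase and Anyscale credentials are exported; the example documents, query, and course name below are hypothetical:

# Hypothetical example, not part of the patch.
from langchain.schema import Document

from ai_ta_backend.filtering_contexts import filter_top_contexts
from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding

# Contexts would normally come from batch_vector_search() + reciprocal_rank_fusion().
contexts = [Document(page_content="Newton's second law states F = ma.", metadata={"course_name": "PHYS-101"})]
query = "What is Newton's second law?"

# Keep only contexts the LLM judges relevant, then pad the survivors with parent-document context.
kept = filter_top_contexts(contexts=contexts, user_query=query, timeout=30, max_concurrency=180)
padded = context_parent_doc_padding(kept, query, "PHYS-101")
print(len(padded))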