diff --git a/ai_ta_backend/filtering_contexts.py b/ai_ta_backend/filtering_contexts.py index 8d9d7131..476df3d0 100644 --- a/ai_ta_backend/filtering_contexts.py +++ b/ai_ta_backend/filtering_contexts.py @@ -1,213 +1,196 @@ -import json -import os -import threading -import time -from typing import Optional - -import openai -import ray -import requests -# from langchain import hub -# import replicate -from posthog import Posthog -import sentry_sdk - -# from dotenv import load_dotenv -# load_dotenv(override=True) -# from transformers import AutoTokenizer -# tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") - -filter_unrelated_contexts_zephyr = """<|system|> -You are an expert at determining if a passage is relevant and helpful for answering a question. -To be valuable, a passage must have at least some amount of useful and meaningful information with more than a passing mention of the topic. -As part of your thinking process, you first write a few sentences evaluating the utility of the passage, given the question we're trying to answer. Limit yourself to writing only a sentence or two, no more. -Finally, you must submit your final answer by adding two newline characters then "Yes." or "No." or "I don't know.". Provide a single answer only. Providing multiple final results will disqualify you. -Here's a template code snippet of how it should work (with placeholder variables): -``` -Passage: -Question: -Your evaluation of the utility of the passage: - - -Final answer: -<|user|> -Passage: {context} -Question: {user_query} -Your evaluation of the utility of the passage: -<|assistant|>""" - - -@ray.remote -class AsyncActor: - - def filter_context(self, context, user_query, langsmith_prompt_obj): - final_prompt = str(langsmith_prompt_obj.format(context=context, user_query=user_query)) - # print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^") - try: - # completion = run_caii_hosted_llm(final_prompt) - # completion = run_replicate(final_prompt) - completion = run_anyscale(final_prompt) - return {"completion": completion, "context": context} - except Exception as e: - sentry_sdk.capture_exception(e) - print(f"Error: {e}") - - -def run_caii_hosted_llm(prompt, max_tokens=300, temp=0.3, **kwargs): - """ - Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ## - """ - - url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha" - headers = {'Content-Type': 'application/json'} - data = {"prompt": prompt, "max_tokens": max_tokens, "temperature": temp, **kwargs} - - response = None - try: - response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) - return response.json()['choices'][0]['text'] - except Exception as e: - sentry_sdk.capture_exception(e) - # Probably cuda OOM error. - response_content = response.json() if response else "No response" - raise ValueError( - f"🚫🚫🚫 Failed inference attempt. 
Response: {response_content}\nError: {e}\nPromt that caused error: {prompt}" - ) from e - - -def run_replicate(prompt): - output = None - # output = replicate.run("tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526", - # input={ - # "top_k": 50, - # "top_p": 0.95, - # "prompt": prompt, - # "temperature": 0.3, - # "max_new_tokens": 250, - # "presence_penalty": 1 - # }) - print(output) - return output - - -def run_anyscale(prompt, model_name="HuggingFaceH4/zephyr-7b-beta"): - start_time = time.monotonic() - ret = openai.ChatCompletion.create( - api_base="https://api.endpoints.anyscale.com/v1", - api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"], - api_type="openai", - # model="mistralai/Mistral-7B-Instruct-v0.1", - model="HuggingFaceH4/zephyr-7b-beta", - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": prompt - }], - temperature=0.3, - max_tokens=250, - ) - - output = ret["choices"][0]["message"]["content"] # type: ignore - print("Response from Anyscale:", output[:150]) - - # input_length = len(tokenizer.encode(prompt)) - # output_length = len(tokenizer.encode(output)) - # Input tokens {input_length}, output tokens: {output_length}" - print(f"^^^^ one anyscale call Runtime: {(time.monotonic() - start_time):.2f} seconds.") - return output - - -def parse_result(result: str): - lines = result.split('\n') - for line in lines: - if 'Final answer' in line: - return 'yes' in line.lower() - return False - - -def filter_top_contexts(contexts, - user_query: str, - timeout: Optional[float] = None, - max_concurrency: Optional[int] = 180): - - print("⏰⏰⏰ Starting filter_top_contexts() ⏰⏰⏰") - - timeout = timeout or float(os.environ["FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS"]) - # langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr") # TOO UNSTABLE, service offline - langsmith_prompt_obj = filter_unrelated_contexts_zephyr - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - - print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count()) - - max_concurrency = min(100, len(contexts)) - print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency) - print("Num contexts to filter:", len(contexts)) - - # START TASKS - actor = AsyncActor.options(max_concurrency=max_concurrency, num_cpus=0.001).remote() # type: ignore - result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts] - - start_time = time.monotonic() - done_tasks, in_progress = ray.wait(result_futures, - num_returns=len(result_futures), - timeout=timeout, - fetch_local=False) - - print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count()) - # Cleanup - for task in in_progress: - ray.cancel(task) - results = ray.get(done_tasks) - print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count()) - ray.kill(actor) - print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count()) - - best_contexts_to_keep = [ - r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion']) - ] - - print("🧠🧠 TOTAL DOCS PROCESSED BY ANYSCALE FILTERING:", len(results)) - print("🧠🧠 TOTAL DOCS KEPT, AFTER FILTERING:", len(best_contexts_to_keep)) - mqr_runtime = round(time.monotonic() - start_time, 2) - print(f"⏰ Total elapsed time: {mqr_runtime} seconds") - - 
posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts', - properties={ - 'user_query': user_query, - 'course_name': contexts[0].metadata.get('course_name', None), - 'percent_kept': len(best_contexts_to_keep) / max(1, len(results)), - 'total_docs_processed': len(results), - 'total_docs_kept': len(best_contexts_to_keep), - 'MQR_total_runtime_sec': mqr_runtime, - }) - posthog.shutdown() - return best_contexts_to_keep - - -def run_main(): - start_time = time.monotonic() - # final_passage_list = filter_top_contexts(contexts=CONTEXTS * 2, user_query=USER_QUERY) - # print("✅✅✅ TOTAL included in results: ", len(final_passage_list)) - print(f"⏰⏰⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") - # print("Total contexts:", len(CONTEXTS) * 2) - - -# ! CONDA ENV: llm-serving -if __name__ == "__main__": - run_main() +# import json +# import os +# import threading +# import time +# from typing import Optional + +# import openai +# import ray +# import requests +# from posthog import Posthog +# import sentry_sdk + +# filter_unrelated_contexts_zephyr = """<|system|> +# You are an expert at determining if a passage is relevant and helpful for answering a question. +# To be valuable, a passage must have at least some amount of useful and meaningful information with more than a passing mention of the topic. +# As part of your thinking process, you first write a few sentences evaluating the utility of the passage, given the question we're trying to answer. Limit yourself to writing only a sentence or two, no more. +# Finally, you must submit your final answer by adding two newline characters then "Yes." or "No." or "I don't know.". Provide a single answer only. Providing multiple final results will disqualify you. +# Here's a template code snippet of how it should work (with placeholder variables): +# ``` +# Passage: +# Question: +# Your evaluation of the utility of the passage: + +# Final answer: +# <|user|> +# Passage: {context} +# Question: {user_query} +# Your evaluation of the utility of the passage: +# <|assistant|>""" + +# @ray.remote +# class AsyncActor: + +# def filter_context(self, context, user_query, langsmith_prompt_obj): +# final_prompt = str(langsmith_prompt_obj.format(context=context, user_query=user_query)) +# # print(f"-------\nfinal_prompt:\n{final_prompt}\n^^^^^^^^^^^^^") +# try: +# # completion = run_caii_hosted_llm(final_prompt) +# # completion = run_replicate(final_prompt) +# completion = run_anyscale(final_prompt) +# return {"completion": completion, "context": context} +# except Exception as e: +# sentry_sdk.capture_exception(e) +# print(f"Error: {e}") + +# def run_caii_hosted_llm(prompt, max_tokens=300, temp=0.3, **kwargs): +# """ +# Local LLMs USAGE DOCS: https://kastanday.notion.site/LLM-Serving-on-prem-OpenAI-Clone-bb06028266d842b0872465f552684177 ## +# """ + +# url = "http://api.kastan.ai/v1/completions?model=HuggingFaceH4/zephyr-7b-alpha" +# headers = {'Content-Type': 'application/json'} +# data = {"prompt": prompt, "max_tokens": max_tokens, "temperature": temp, **kwargs} + +# response = None +# try: +# response = requests.post(url, headers=headers, data=json.dumps(data), timeout=180) +# return response.json()['choices'][0]['text'] +# except Exception as e: +# sentry_sdk.capture_exception(e) +# # Probably cuda OOM error. +# response_content = response.json() if response else "No response" +# raise ValueError( +# f"🚫🚫🚫 Failed inference attempt. 
Response: {response_content}\nError: {e}\nPromt that caused error: {prompt}" +# ) from e + +# def run_replicate(prompt): +# output = None +# # output = replicate.run("tomasmcm/zephyr-7b-beta:961cd6665b811d0c43c0b9488b6dfa85ff5c7bfb875e93b4533e4c7f96c7c526", +# # input={ +# # "top_k": 50, +# # "top_p": 0.95, +# # "prompt": prompt, +# # "temperature": 0.3, +# # "max_new_tokens": 250, +# # "presence_penalty": 1 +# # }) +# print(output) +# return output + +# def run_anyscale(prompt, model_name="HuggingFaceH4/zephyr-7b-beta"): +# start_time = time.monotonic() +# ret = openai.ChatCompletion.create( +# api_base="https://api.endpoints.anyscale.com/v1", +# api_key=os.environ["ANYSCALE_ENDPOINT_TOKEN"], +# api_type="openai", +# # model="mistralai/Mistral-7B-Instruct-v0.1", +# model="HuggingFaceH4/zephyr-7b-beta", +# messages=[{ +# "role": "system", +# "content": "You are a helpful assistant." +# }, { +# "role": "user", +# "content": prompt +# }], +# temperature=0.3, +# max_tokens=250, +# ) + +# output = ret["choices"][0]["message"]["content"] # type: ignore +# print("Response from Anyscale:", output[:150]) + +# # input_length = len(tokenizer.encode(prompt)) +# # output_length = len(tokenizer.encode(output)) +# # Input tokens {input_length}, output tokens: {output_length}" +# print(f"^^^^ one anyscale call Runtime: {(time.monotonic() - start_time):.2f} seconds.") +# return output + +# def parse_result(result: str): +# lines = result.split('\n') +# for line in lines: +# if 'Final answer' in line: +# return 'yes' in line.lower() +# return False + +# def filter_top_contexts(contexts, +# user_query: str, +# timeout: Optional[float] = None, +# max_concurrency: Optional[int] = 180): + +# print("⏰⏰⏰ Starting filter_top_contexts() ⏰⏰⏰") + +# timeout = timeout or float(os.environ["FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS"]) +# # langsmith_prompt_obj = hub.pull("kastanday/filter-unrelated-contexts-zephyr") # TOO UNSTABLE, service offline +# langsmith_prompt_obj = filter_unrelated_contexts_zephyr +# posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') + +# print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count()) + +# max_concurrency = min(100, len(contexts)) +# print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency) +# print("Num contexts to filter:", len(contexts)) + +# # START TASKS +# actor = AsyncActor.options(max_concurrency=max_concurrency, num_cpus=0.001).remote() # type: ignore +# result_futures = [actor.filter_context.remote(c, user_query, langsmith_prompt_obj) for c in contexts] + +# start_time = time.monotonic() +# done_tasks, in_progress = ray.wait(result_futures, +# num_returns=len(result_futures), +# timeout=timeout, +# fetch_local=False) + +# print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count()) +# # Cleanup +# for task in in_progress: +# ray.cancel(task) +# results = ray.get(done_tasks) +# print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count()) +# ray.kill(actor) +# print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count()) + +# best_contexts_to_keep = [ +# r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion']) +# ] + +# print("🧠🧠 TOTAL DOCS PROCESSED BY ANYSCALE FILTERING:", len(results)) +# print("🧠🧠 TOTAL DOCS KEPT, AFTER FILTERING:", len(best_contexts_to_keep)) +# mqr_runtime = round(time.monotonic() - 
start_time, 2) +# print(f"⏰ Total elapsed time: {mqr_runtime} seconds") + +# posthog.capture('distinct_id_of_the_user', +# event='filter_top_contexts', +# properties={ +# 'user_query': user_query, +# 'course_name': contexts[0].metadata.get('course_name', None), +# 'percent_kept': len(best_contexts_to_keep) / max(1, len(results)), +# 'total_docs_processed': len(results), +# 'total_docs_kept': len(best_contexts_to_keep), +# 'MQR_total_runtime_sec': mqr_runtime, +# }) +# posthog.shutdown() +# return best_contexts_to_keep + +# def run_main(): +# start_time = time.monotonic() +# # final_passage_list = filter_top_contexts(contexts=CONTEXTS * 2, user_query=USER_QUERY) +# # print("✅✅✅ TOTAL included in results: ", len(final_passage_list)) +# print(f"⏰⏰⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") +# # print("Total contexts:", len(CONTEXTS) * 2) + +# # ! CONDA ENV: llm-serving +# if __name__ == "__main__": +# run_main() diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index dae7ef0e..3c87f04b 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -6,7 +6,6 @@ import requests from threading import Thread - from dotenv import load_dotenv from flask import ( Flask, @@ -21,7 +20,7 @@ from flask_cors import CORS from flask_executor import Executor from posthog import Posthog -import ray +# import ray import sentry_sdk from ai_ta_backend.canvas import CanvasAPI @@ -50,7 +49,7 @@ # load API keys from globally-availabe .env file load_dotenv() -ray.init() +# ray.init() print("NUM ACTIVE THREADS (top of main):", threading.active_count()) @@ -561,6 +560,7 @@ def nomic_map(): response.headers.add('Access-Control-Allow-Origin', '*') return response + @app.route('/createDocumentMap', methods=['GET']) def createDocumentMap(): course_name: str = request.args.get('course_name', default='', type=str) @@ -620,13 +620,15 @@ def export_convo_history(): response.headers.add('Access-Control-Allow-Origin', '*') else: - response = make_response(send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) + response = make_response( + send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) response.headers.add('Access-Control-Allow-Origin', '*') response.headers["Content-Disposition"] = f"attachment; filename={export_status['response'][1]}" os.remove(export_status['response'][0]) - + return response + @app.route('/exportDocuments', methods=['GET']) def exportDocuments(): course_name: str = request.args.get('course_name', default='', type=str) @@ -649,11 +651,12 @@ def exportDocuments(): response.headers.add('Access-Control-Allow-Origin', '*') else: - response = make_response(send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) + response = make_response( + send_from_directory(export_status['response'][2], export_status['response'][1], as_attachment=True)) response.headers.add('Access-Control-Allow-Origin', '*') response.headers["Content-Disposition"] = f"attachment; filename={export_status['response'][1]}" os.remove(export_status['response'][0]) - + return response diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index daab6c6e..739d5b16 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -42,13 +42,12 @@ from pydub import AudioSegment from qdrant_client import QdrantClient, models from qdrant_client.models import PointStruct -from langchain.schema.output_parser import StrOutputParser from 
ai_ta_backend.aws import upload_data_files_to_s3 from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor from ai_ta_backend.utils_tokenization import count_tokens_and_cost -from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding -from ai_ta_backend.filtering_contexts import filter_top_contexts +# from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding +# from ai_ta_backend.filtering_contexts import filter_top_contexts from ai_ta_backend.nomic_logging import log_to_document_map, delete_from_document_map MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation") @@ -1356,85 +1355,87 @@ def getTopContextsWithMQR(self, 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. """ - try: - top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS - start_time_overall = time.monotonic() - mq_start_time = time.monotonic() - - # 1. GENERATE MULTIPLE QUERIES - generate_queries = ( - MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | - (lambda x: list(filter(None, x))) # filter out non-empty strings - ) - - generated_queries = generate_queries.invoke({"original_query": search_query}) - print("generated_queries", generated_queries) - - # 2. VECTOR SEARCH FOR EACH QUERY - batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, - course_name=course_name, - top_n=top_n_per_query) - - # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. - found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) - found_docs = [doc for doc, score in found_docs] - print(f"Num docs after re-ranking: {len(found_docs)}") - if len(found_docs) == 0: - return [] - print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") - - # 4. FILTER DOCS - filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) - if len(filtered_docs) == 0: - return [] - - # 5. TOP DOC CONTEXT PADDING // parent document retriever - final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) - print(f"Number of final docs after context padding: {len(final_docs)}") - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - - valid_docs = [] - num_tokens = 0 - for doc in final_docs: - doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(doc) - else: - # filled our token size, time to return - break - - print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") - print(f"Course: {course_name} ||| search_query: {search_query}") - print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") - - if len(valid_docs) == 0: - return [] - - self.posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts_succeeded', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - 'total_tokens_used': token_counter, - 'total_contexts_used': len(valid_docs), - 'total_unique_docs_retrieved': len(found_docs), - }) - - return self.format_for_json_mqr(valid_docs) - except Exception as e: - # return full traceback to front end - err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err + return 'fail' + + # try: + # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS + # start_time_overall = time.monotonic() + # mq_start_time = time.monotonic() + + # # 1. GENERATE MULTIPLE QUERIES + # generate_queries = ( + # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | + # (lambda x: list(filter(None, x))) # filter out non-empty strings + # ) + + # generated_queries = generate_queries.invoke({"original_query": search_query}) + # print("generated_queries", generated_queries) + + # # 2. VECTOR SEARCH FOR EACH QUERY + # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, + # course_name=course_name, + # top_n=top_n_per_query) + + # # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. + # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) + # found_docs = [doc for doc, score in found_docs] + # print(f"Num docs after re-ranking: {len(found_docs)}") + # if len(found_docs) == 0: + # return [] + # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") + + # # 4. FILTER DOCS + # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) + # if len(filtered_docs) == 0: + # return [] + + # # 5. TOP DOC CONTEXT PADDING // parent document retriever + # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) + # print(f"Number of final docs after context padding: {len(final_docs)}") + + # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. 
It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + + # search_query) # type: ignore + + # valid_docs = [] + # num_tokens = 0 + # for doc in final_docs: + # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" + # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") + # if token_counter + num_tokens <= token_limit: + # token_counter += num_tokens + # valid_docs.append(doc) + # else: + # # filled our token size, time to return + # break + + # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") + # print(f"Course: {course_name} ||| search_query: {search_query}") + # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") + + # if len(valid_docs) == 0: + # return [] + + # self.posthog.capture('distinct_id_of_the_user', + # event='filter_top_contexts_succeeded', + # properties={ + # 'user_query': search_query, + # 'course_name': course_name, + # 'token_limit': token_limit, + # 'total_tokens_used': token_counter, + # 'total_contexts_used': len(valid_docs), + # 'total_unique_docs_retrieved': len(found_docs), + # }) + + # return self.format_for_json_mqr(valid_docs) + # except Exception as e: + # # return full traceback to front end + # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + # print(err) + # sentry_sdk.capture_exception(e) + # return err def format_for_json_mqr(self, found_docs) -> List[Dict]: """ diff --git a/requirements.txt b/requirements.txt index acc2eed2..f4503824 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: ht # Not currently supporting coursera ingest # cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl pydantic==1.10.13 # pydantic v1 works better for ray -ray==2.8.1 posthog==3.1.0 sentry-sdk==1.39.1 +# ray==2.8.1 # newrelic==9.3.0 \ No newline at end of file diff --git a/run.sh b/run.sh index 20a23b02..0d77691a 100755 --- a/run.sh +++ b/run.sh @@ -3,6 +3,6 @@ # Docs https://docs.gunicorn.org/en/stable/settings.html#workers # 200 MB object store memory.. necessary to statically allocate or will crash in Railway env restrictions. -ray start --head --num-cpus 6 --object-store-memory 300000000 +# ray start --head --num-cpus 6 --object-store-memory 300000000 export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend exec gunicorn --workers=6 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800
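Note on the change above: the patch disables the Ray-based context filtering rather than replacing it (`ray.init()`, `ray start`, and `ray==2.8.1` are removed, `filter_top_contexts` is commented out, and `getTopContextsWithMQR` now short-circuits with `'fail'`). For orientation only, here is a minimal, hypothetical sketch of how the same fan-out could be done with just the standard library; `run_llm` stands in for something like the commented-out `run_anyscale()`, `prompt_template` is the zephyr filter prompt with `{context}` / `{user_query}` placeholders, and none of this code is part of the PR.
```
# Hypothetical, stdlib-only stand-in for the Ray actor fan-out (not in this PR).
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import TimeoutError as FuturesTimeout
from typing import Callable, List, Optional


def parse_result(result: str) -> bool:
  # Same keep/drop rule as the commented-out helper: only the "Final answer"
  # line matters, and only an explicit "yes" keeps the passage.
  for line in result.split('\n'):
    if 'Final answer' in line:
      return 'yes' in line.lower()
  return False


def filter_contexts_without_ray(contexts: List[str],
                                user_query: str,
                                run_llm: Callable[[str], str],
                                prompt_template: str,
                                timeout: Optional[float] = 30.0,
                                max_workers: int = 100) -> List[str]:
  kept: List[str] = []
  if not contexts:
    return kept
  # Mirror the original concurrency cap: min(100, len(contexts)) in-flight calls.
  pool = ThreadPoolExecutor(max_workers=min(max_workers, len(contexts)))
  future_to_context = {
      pool.submit(run_llm, prompt_template.format(context=c, user_query=user_query)): c
      for c in contexts
  }
  try:
    for future in as_completed(future_to_context, timeout=timeout):
      try:
        completion = future.result()
      except Exception:
        # The Ray actor swallowed per-call errors; do the same here.
        continue
      if completion and parse_result(completion):
        kept.append(future_to_context[future])
  except FuturesTimeout:
    # Deadline hit: keep whatever finished, drop the rest (akin to ray.cancel()).
    pass
  finally:
    # Python 3.9+: don't wait for, or start, any still-pending calls.
    pool.shutdown(wait=False, cancel_futures=True)
  return kept
```
With the original numbers (timeout from `FILTER_TOP_CONTEXTS_TIMEOUT_SECONDS`, concurrency capped at `min(100, len(contexts))`), something like this could slot in where `filter_top_contexts()` is now stubbed out, without the `ray.init()` / `ray start` overhead the rest of the diff removes.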