From ea9c432db5a29f9a9f7575132f5cf3c668975c0f Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 12:32:22 -0600 Subject: [PATCH 01/15] Major Refactor introducing Dependency Injection --- ai_ta_backend/aws.py | 64 -- ai_ta_backend/canvas.py | 264 ------ ai_ta_backend/database/__init__.py | 0 ai_ta_backend/database/aws.py | 34 + ai_ta_backend/database/sql.py | 82 ++ ai_ta_backend/database/vector.py | 61 ++ ai_ta_backend/emails.py | 38 - ai_ta_backend/export_data.py | 350 -------- ai_ta_backend/extreme_context_stuffing.py | 550 ------------ ai_ta_backend/main.py | 548 +----------- ai_ta_backend/nomic_logging.py | 738 ---------------- ai_ta_backend/service/__init__.py | 0 ai_ta_backend/service/export_service.py | 263 ++++++ ai_ta_backend/service/nomic_service.py | 727 ++++++++++++++++ ai_ta_backend/service/posthog_service.py | 18 + ai_ta_backend/service/retrieval_service.py | 414 +++++++++ ai_ta_backend/service/sentry_service.py | 22 + ai_ta_backend/utils/__init__.py | 0 .../{ => utils}/context_parent_doc_padding.py | 4 +- ai_ta_backend/utils/emails.py | 38 + .../{ => utils}/filtering_contexts.py | 0 ai_ta_backend/vector_database.py | 803 ------------------ ai_ta_backend/web_scrape.py | 664 --------------- 23 files changed, 1707 insertions(+), 3975 deletions(-) delete mode 100644 ai_ta_backend/aws.py delete mode 100644 ai_ta_backend/canvas.py create mode 100644 ai_ta_backend/database/__init__.py create mode 100644 ai_ta_backend/database/aws.py create mode 100644 ai_ta_backend/database/sql.py create mode 100644 ai_ta_backend/database/vector.py delete mode 100644 ai_ta_backend/emails.py delete mode 100644 ai_ta_backend/export_data.py delete mode 100644 ai_ta_backend/extreme_context_stuffing.py delete mode 100644 ai_ta_backend/nomic_logging.py create mode 100644 ai_ta_backend/service/__init__.py create mode 100644 ai_ta_backend/service/export_service.py create mode 100644 ai_ta_backend/service/nomic_service.py create mode 100644 ai_ta_backend/service/posthog_service.py create mode 100644 ai_ta_backend/service/retrieval_service.py create mode 100644 ai_ta_backend/service/sentry_service.py create mode 100644 ai_ta_backend/utils/__init__.py rename ai_ta_backend/{ => utils}/context_parent_doc_padding.py (96%) create mode 100644 ai_ta_backend/utils/emails.py rename ai_ta_backend/{ => utils}/filtering_contexts.py (100%) delete mode 100644 ai_ta_backend/vector_database.py delete mode 100644 ai_ta_backend/web_scrape.py diff --git a/ai_ta_backend/aws.py b/ai_ta_backend/aws.py deleted file mode 100644 index 0c0cfa33..00000000 --- a/ai_ta_backend/aws.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import uuid -from multiprocessing import Lock, cpu_count -from multiprocessing.pool import ThreadPool -from typing import List, Optional - -import boto3 - - -def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]: - """Uploads all files in localdir to S3 bucket. - - Args: - course_name (str): Official course name on our website. - localdir (str): Local directory to upload from, coursera-dl downloads to this directory. - - Returns: - Optional[List[str]]: A list of S3 paths, the final resting place of uploads, or None if no files were uploaded. 
- """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - filenames = [] - for root, _subdirs, files in os.walk(localdir): - for filename in files: - filenames.append(os.path.join(root, filename)) - - if not filenames: - print(f"No files to upload. Not found in: {localdir}") - return None - - print(f"Files to upload: {filenames}") - print("About to upload...") - - s3_paths = [] - s3_paths_lock = Lock() - - def upload(myfile): - # get the last part of the path and append unique ID before it - directory, old_filename = os.path.split(myfile) - new_filename = str(uuid.uuid4()) + '-' + old_filename - new_filepath = os.path.join(directory, new_filename) - - s3_file = f"courses/{course_name}/{os.path.basename(new_filepath)}" - s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file) - with s3_paths_lock: - s3_paths.append(s3_file) - - # only 2 parallel uploads because we're getting rate limited with min_p=6... 503 errors. - min_p = 2 - max_p = cpu_count() - num_procs = max(min(len(filenames), max_p), min_p) - pool = ThreadPool(processes=num_procs) - pool.map(upload, filenames) - - print("All data files uploaded to S3 successfully.") - return s3_paths - - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/ai_ta_backend/canvas.py b/ai_ta_backend/canvas.py deleted file mode 100644 index 324621b4..00000000 --- a/ai_ta_backend/canvas.py +++ /dev/null @@ -1,264 +0,0 @@ -import os -import shutil - -import requests -from canvasapi import Canvas -import sentry_sdk - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.vector_database import Ingest - - -class CanvasAPI(): - - def __init__(self): - self.canvas_client = Canvas("https://canvas.illinois.edu", os.getenv('CANVAS_ACCESS_TOKEN')) - self.headers = {"Authorization": "Bearer " + os.getenv('CANVAS_ACCESS_TOKEN')} - - def add_users(self, canvas_course_id: str, course_name: str): - """ - Get all users in a course by course ID and add them to uiuc.chat course - - Student profile does not have access to emails. - - Currently collecting all names in a list. - """ - course = self.canvas_client.get_course(canvas_course_id) - users = course.get_users() - - user_names = [] - for user in users: - user_names.append(user.name) - - print("Collected names: ", user_names) - - if len(user_names) > 0: - return "Success" - else: - return "Failed" - - def download_course_content(self, canvas_course_id: int, dest_folder: str, content_ingest_dict: dict) -> str: - """ - Downloads all Canvas course materials through the course ID and stores in local directory. - 1. Iterate through content_ingest_dict and download all. - 2. Maintain a list of URLs and convert HTML strings to proper format. 
- """ - print("In download_course_content") - - try: - api_path = "https://canvas.illinois.edu/api/v1/courses/" + str(canvas_course_id) - - # Iterate over the content_ingest_dict - for key, value in content_ingest_dict.items(): - if value is True: - if key == 'files': - self.download_files(dest_folder, api_path) - elif key == 'pages': - self.download_pages(dest_folder, api_path) - elif key == 'modules': - self.download_modules(dest_folder, api_path) - elif key == 'syllabus': - self.download_syllabus(dest_folder, api_path) - elif key == 'assignments': - self.download_assignments(dest_folder, api_path) - elif key == 'discussions': - self.download_discussions(dest_folder, api_path) - - # at this point, we have all extracted files in the dest_folder. - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def ingest_course_content(self, canvas_course_id: int, course_name: str, content_ingest_dict: dict = None) -> str: - """ - Ingests all Canvas course materials through the course ID. - 1. Download zip file from Canvas and store in local directory - 2. Upload all files to S3 - 3. Call bulk_ingest() to ingest all files into QDRANT - 4. Delete extracted files from local directory - """ - - print("In ingest_course_content") - try: - # a dictionary of all contents we want to ingest - files, pages, modules, syllabus, assignments, discussions. - if content_ingest_dict is None: - content_ingest_dict = { - 'files': True, - 'pages': True, - 'modules': True, - 'syllabus': True, - 'assignments': True, - 'discussions': True - } - - # Create a canvas directory with a course folder inside it. - canvas_dir = "canvas_materials" - folder_name = "canvas_course_" + str(canvas_course_id) + "_ingest" - folder_path = canvas_dir + "/" + folder_name - - if os.path.exists(canvas_dir): - print("Canvas directory already exists") - else: - os.mkdir(canvas_dir) - print("Canvas directory created") - - if os.path.exists(canvas_dir + "/" + folder_name): - print("Course folder already exists") - else: - os.mkdir(canvas_dir + "/" + folder_name) - print("Course folder created") - - # Download course content - self.download_course_content(canvas_course_id, folder_path, content_ingest_dict) - - # Upload files to S3 - s3_paths = upload_data_files_to_s3(course_name, folder_path) - - # Delete files from local directory - shutil.rmtree(folder_path) - - # Ingest files into QDRANT - ingest = Ingest() - canvas_ingest = ingest.bulk_ingest(s3_paths, course_name=course_name) - return canvas_ingest - - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Failed" - - def download_files(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all files in a Canvas course into given folder. - """ - try: - # files_request = requests.get(api_path + "/files", headers=self.headers) - # files = files_request.json() - - course = self.canvas_client.get_course(api_path.split('/')[-1]) - files = course.get_files() - - for file in files: - # file_name = file['filename'] - file_name = file.filename - print("Downloading file: ", file_name) - - # file_download = requests.get(file['url'], headers=self.headers) - file_download = requests.get(file.url, headers=self.headers) - with open(os.path.join(dest_folder, file_name), 'wb') as f: - f.write(file_download.content) - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! 
Error: " + str(e) - - def download_pages(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all pages as HTML and stores them in given folder. - """ - print("In download_pages") - try: - pages_request = requests.get(api_path + "/pages", headers=self.headers) - pages = pages_request.json() - - for page in pages: - if page['html_url'] != '': - page_name = page['url'] + ".html" - page_content_request = requests.get(api_path + "/pages/" + str(page['page_id']), headers=self.headers) - page_body = page_content_request.json()['body'] - - with open(dest_folder + "/" + page_name, 'w') as html_file: - html_file.write(page_body) - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_syllabus(self, dest_folder: str, api_path: str) -> str: - """ - Downloads syllabus as HTML and stores in given folder. - """ - print("In download_syllabus") - try: - course_settings_request = requests.get(api_path + "?include=syllabus_body", headers=self.headers) - syllabus_body = course_settings_request.json()['syllabus_body'] - syllabus_name = "syllabus.html" - - with open(dest_folder + "/" + syllabus_name, 'w') as html_file: - html_file.write(syllabus_body) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_modules(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all content uploaded in modules. - Modules may contain: assignments, quizzes, files, pages, discussions, external tools and external urls. - Rest of the things are covered in other functions. - """ - print("In download_modules") - try: - module_request = requests.get(api_path + "/modules?include=items", headers=self.headers) - modules = module_request.json() - - for module in modules: - module_items = module['items'] - for item in module_items: - if item['type'] == 'ExternalUrl': - external_url = item['external_url'] - url_title = item['title'] - - # Download external url as HTML - response = requests.get(external_url) - if response.status_code == 200: - html_file_name = url_title + ".html" - with open(dest_folder + "/" + html_file_name, 'w') as html_file: - html_file.write(response.text) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_assignments(self, dest_folder: str, api_path: str) -> str: - """ - The description attribute has the assignment content in HTML format. Access that and store it as an HTML file. - """ - print("In download_assignments") - try: - assignment_request = requests.get(api_path + "/assignments", headers=self.headers) - assignments = assignment_request.json() - - for assignment in assignments: - if assignment['description'] is not None and assignment['description'] != "": - assignment_name = "assignment_" + str(assignment['id']) + ".html" - assignment_description = assignment['description'] - - with open(dest_folder + "/" + assignment_name, 'w') as html_file: - html_file.write(assignment_description) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_discussions(self, dest_folder: str, api_path: str) -> str: - """ - Download course discussions as HTML and store in given folder. 
- """ - print("In download_discussions") - try: - discussion_request = requests.get(api_path + "/discussion_topics", headers=self.headers) - discussions = discussion_request.json() - - for discussion in discussions: - discussion_content = discussion['message'] - discussion_name = discussion['title'] + ".html" - - with open(dest_folder + "/" + discussion_name, 'w') as html_file: - html_file.write(discussion_content) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) diff --git a/ai_ta_backend/database/__init__.py b/ai_ta_backend/database/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/database/aws.py b/ai_ta_backend/database/aws.py new file mode 100644 index 00000000..1b2f63dc --- /dev/null +++ b/ai_ta_backend/database/aws.py @@ -0,0 +1,34 @@ +import os + +import boto3 +from injector import inject + + +class AWSStorage: + + @inject + def __init__(self): + # S3 + self.s3_client = boto3.client( + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + def upload_file(self, file_path: str, bucket_name: str, object_name: str): + self.s3_client.upload_file(file_path, bucket_name, object_name) + + def download_file(self, object_name: str, bucket_name: str, file_path: str): + self.s3_client.download_file(bucket_name, object_name, file_path) + + def delete_file(self, bucket_name: str, s3_path: str): + self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) + + def generatePresignedUrl(self, object: str, bucket_name: str, s3_path: str, expiration: int = 3600): + # generate presigned URL + return self.s3_client.generate_presigned_url('get_object', + Params={ + 'Bucket': bucket_name, + 'Key': s3_path + }, + ExpiresIn=expiration) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py new file mode 100644 index 00000000..fbd2035a --- /dev/null +++ b/ai_ta_backend/database/sql.py @@ -0,0 +1,82 @@ +import os + +from injector import inject + + +class SQLDatabase: + + @inject + def __init__(self, db_url: str): + # Create a Supabase client + self.supabase_client = supabase.create_client( # type: ignore + supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) + + def getAllMaterialsForCourse(self, course_name: str): + return self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select( + 'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute() + + def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str): + return self.supabase_client.from_( + os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq( + 's3_path', s3_path).eq('course_name', course_name).execute() + + def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): + return self.supabase_client.from_( + os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(key, value).eq( + 'course_name', course_name).execute() + + def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): + return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(key, value).eq( + 'course_name', course_name).execute() + + def deleteMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str): + return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( + 
's3_path', s3_path).eq('course_name', course_name).execute() + + def getProjectsMapForCourse(self, course_name: str): + return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + + def getDocumentsBetweenDates(self, course_name: str, from_date: str, to_date: str, table_name: str): + if from_date != '' and to_date != '': + # query between the dates + print("from_date and to_date") + + response = self.supabase_client.table(table_name).select("id", count='exact').eq("course_name", course_name).gte( + 'created_at', from_date).lte('created_at', to_date).order('id', desc=False).execute() + + elif from_date != '' and to_date == '': + # query from from_date to now + print("only from_date") + response = self.supabase_client.table(table_name).select("id", count='exact').eq("course_name", course_name).gte( + 'created_at', from_date).order('id', desc=False).execute() + + elif from_date == '' and to_date != '': + # query from beginning to to_date + print("only to_date") + response = self.supabase_client.table(table_name).select("id", count='exact').eq("course_name", course_name).lte( + 'created_at', to_date).order('id', desc=False).execute() + + else: + # query all data + print("No dates") + response = self.supabase_client.table(table_name).select("id", count='exact').eq( + "course_name", course_name).order('id', desc=False).execute() + return response + + def getAllFromTableForDownloadType(self, course_name: str, download_type: str, first_id: int): + if download_type == 'documents': + response = self.supabase_client.table("documents").select("*").eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(100).execute() + else: + response = self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(100).execute() + + return response + + def getAllConversationsBetweenIds(self, course_name: str, first_id: int, last_id: int): + return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte( + 'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute() + + def getDocsForIdsGte(self, course_name: str, first_id: int): + return self.supabase_client.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order( + 'id', desc=False).limit(100).execute() diff --git a/ai_ta_backend/database/vector.py b/ai_ta_backend/database/vector.py new file mode 100644 index 00000000..7ac67eda --- /dev/null +++ b/ai_ta_backend/database/vector.py @@ -0,0 +1,61 @@ +import os + +from injector import inject +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import Qdrant +from qdrant_client import QdrantClient, models + +OPENAI_API_TYPE = "azure" # "openai" or "azure" + + +class VectorDatabase(): + """ + Contains all methods for building and using vector databases. + """ + + @inject + def __init__(self): + """ + Initialize AWS S3, Qdrant, and Supabase. + """ + # vector DB + self.qdrant_client = QdrantClient( + url=os.getenv('QDRANT_URL'), + api_key=os.getenv('QDRANT_API_KEY'), + ) + + self.vectorstore = Qdrant(client=self.qdrant_client, + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + embeddings=OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)) + + def vector_search(self, search_query, course_name, user_query_embedding, top_n): + """ + Search the vector database for a given query. 
+ """ + myfilter = models.Filter(must=[ + models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), + ]) + search_results = self.qdrant_client.search( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + query_filter=myfilter, + with_vectors=False, + query_vector=user_query_embedding, + limit=top_n, # Return n closest points + + # In a system with high disk latency, the re-scoring step may become a bottleneck: https://qdrant.tech/documentation/guides/quantization/ + search_params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False))) + return search_results + + def delete_data(self, collection_name: str, key: str, value: str): + """ + Delete data from the vector database. + """ + self.qdrant_client.delete( + collection_name=collection_name, + points_selector=models.Filter(must=[ + models.FieldCondition( + key=key, + match=models.MatchValue(value=value), + ), + ]), + ) diff --git a/ai_ta_backend/emails.py b/ai_ta_backend/emails.py deleted file mode 100644 index 2f17dce0..00000000 --- a/ai_ta_backend/emails.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import smtplib -from email.mime.text import MIMEText -from email.mime.multipart import MIMEMultipart - - -def send_email(subject: str, body_text: str, sender: str, receipients: list, bcc_receipients: list): - """ - Send an email using the AWS SES service - :param subject: The subject of the email - :param body_text: The body of the email - :param sender: The email address of the sender - :param receipients: A list of email addresses to send the email to - :param bcc_receipients: A list of email addresses to send the email to as BCC - :return: A string indicating the result of the email send operation - - """ - # Create message content - message = MIMEMultipart("alternative") - message["Subject"] = subject - message["From"] = sender - message["To"] = ", ".join(receipients) - - if len(bcc_receipients) > 0: - message["Bcc"] = ", ".join(bcc_receipients) - - # Add plain text part - part1 = MIMEText(body_text, "plain") - message.attach(part1) - - # Add additional parts for HTML, attachments, etc. (optional) - - # Connect to SMTP server - with smtplib.SMTP_SSL(os.getenv('SES_HOST'), os.getenv('SES_PORT')) as server: # type: ignore - server.login(os.getenv('USERNAME_SMTP'), os.getenv('PASSWORD_SMTP')) # type: ignore - server.sendmail(sender, receipients + bcc_receipients, message.as_string()) - - return "Email sent successfully!" \ No newline at end of file diff --git a/ai_ta_backend/export_data.py b/ai_ta_backend/export_data.py deleted file mode 100644 index 299b3435..00000000 --- a/ai_ta_backend/export_data.py +++ /dev/null @@ -1,350 +0,0 @@ -import os -import uuid -import zipfile -import io -import pandas as pd -import supabase -import sentry_sdk -import boto3 -import botocore -from concurrent.futures import ProcessPoolExecutor -import requests -import json -from ai_ta_backend.emails import send_email - -# Initialize Supabase client -SUPABASE_CLIENT = supabase.create_client(supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - - -def export_documents_json(course_name: str, from_date='', to_date=''): - """ - This function exports the documents to a json file. - 1. If the number of documents is greater than 1000, it calls a background task to upload the documents to S3. - 2. If the number of documents is less than 1000, it fetches the documents and zips them. - Args: - course_name (str): The name of the course. 
- from_date (str, optional): The start date for the data export. Defaults to ''. - to_date (str, optional): The end date for the data export. Defaults to ''. - """ - - if from_date != '' and to_date != '': - # query between the dates - print("from_date and to_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).gte( - 'created_at', from_date).lte('created_at', to_date).order('id', desc=False).execute() - - elif from_date != '' and to_date == '': - # query from from_date to now - print("only from_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).gte( - 'created_at', from_date).order('id', desc=False).execute() - - elif from_date == '' and to_date != '': - # query from beginning to to_date - print("only to_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).lte( - 'created_at', to_date).order('id', desc=False).execute() - - else: - # query all data - print("No dates") - response = SUPABASE_CLIENT.table("documents").select("id", - count='exact').eq("course_name", - course_name).order('id', - desc=False).execute() - - # add a condition to route to direct download or s3 download - if response.count > 1000: - # call background task to upload to s3 - - filename = course_name + '_' + str(uuid.uuid4()) + '_documents.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" - # background task of downloading data - map it with above ID - executor = ProcessPoolExecutor() - executor.submit(export_data_in_bg, response, "documents", course_name, s3_filepath) - return {"response": 'Download from S3', "s3_path": s3_filepath} - - else: - # Fetch data - if response.count > 0: - # batch download - total_doc_count = response.count - first_id = response.data[0]['id'] - last_id = response.data[-1]['id'] - - print("total_doc_count: ", total_doc_count) - print("first_id: ", first_id) - print("last_id: ", last_id) - - curr_doc_count = 0 - filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json' - file_path = os.path.join(os.getcwd(), filename) - - while curr_doc_count < total_doc_count: - print("Fetching data from id: ", first_id) - response = SUPABASE_CLIENT.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - df = pd.DataFrame(response.data) - curr_doc_count += len(response.data) - - # writing to file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - - # Download file - try: - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - - os.remove(file_path) - return {"response": (zip_file_path, zip_filename, os.getcwd())} - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file."} - else: - return {"response": "No data found between the given dates."} - - -def export_data_in_bg(response, download_type, course_name, s3_path): - """ - This function is called in export_documents_csv() to upload the documents to S3. - 1. download the documents in batches of 100 and upload them to S3. - 2. 
generate a pre-signed URL for the S3 file. - 3. send an email to the course admins with the pre-signed URL. - - Args: - response (dict): The response from the Supabase query. - download_type (str): The type of download - 'documents' or 'conversations'. - course_name (str): The name of the course. - s3_path (str): The S3 path where the file will be uploaded. - """ - total_doc_count = response.count - first_id = response.data[0]['id'] - print("total_doc_count: ", total_doc_count) - print("pre-defined s3_path: ", s3_path) - - curr_doc_count = 0 - filename = s3_path.split('/')[-1].split('.')[0] + '.json' - file_path = os.path.join(os.getcwd(), filename) - - # download data in batches of 100 - while curr_doc_count < total_doc_count: - print("Fetching data from id: ", first_id) - if download_type == 'documents': - response = SUPABASE_CLIENT.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - else: - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - df = pd.DataFrame(response.data) - curr_doc_count += len(response.data) - - # writing to file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - - print("zip file created: ", zip_file_path) - - try: - # upload to S3 - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}" - s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}" - s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file) - - # remove local files - os.remove(file_path) - os.remove(zip_file_path) - - print("file uploaded to s3: ", s3_file) - - # pre-signed URL - s3_object = s3.head_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path) - - # generate presigned URL - s3_url = s3.generate_presigned_url('get_object', Params={'Bucket': os.getenv('S3_BUCKET_NAME'), 'Key': s3_path}, ExpiresIn=3600) - - # get admin email IDs - headers = { - "Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", - "Content-Type": "application/json" - } - - hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name - response = requests.get(hget_url, headers=headers) - course_metadata = response.json() - course_metadata = json.loads(course_metadata['result']) - admin_emails = course_metadata['course_admins'] - bcc_emails = [] - - # check for Kastan's email and move to bcc - if 'kvday2@illinois.edu' in admin_emails: - admin_emails.remove('kvday2@illinois.edu') - bcc_emails.append('kvday2@illinois.edu') - - # add course owner email to admin_emails - admin_emails.append(course_metadata['course_owner']) - admin_emails = list(set(admin_emails)) - print("admin_emails: ", admin_emails) - print("bcc_emails: ", bcc_emails) - - # add a check for emails, don't send email if no admin emails - if len(admin_emails) == 0: - return "No admin emails found. Email not sent." 
- - # send email to admins - subject = "UIUC.chat Data Export Complete for " + course_name - body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours." - email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails) - print("email_status: ", email_status) - - return "File uploaded to S3. Email sent to admins." - - except Exception as e: - print(e) - return "Error: " + str(e) - -def check_s3_path_and_download(s3_path): - """ - This function checks if the file exists in S3 and downloads it. - Args: - s3_path (str): The S3 path of the file. - """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - try: - print("Checking if file exists in S3...", s3_path) - s3_object = s3.head_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path) - - # generate presigned URL - s3_url = s3.generate_presigned_url('get_object', Params={'Bucket': os.getenv('S3_BUCKET_NAME'), 'Key': s3_path}, ExpiresIn=172800) - print("Presigned URL: ", s3_url) - return {"response": s3_url} - - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == "404": - # The object does not exist. - return {"response": "Export is not complete yet. Please try again later."} - else: - # Something else has gone wrong. - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file."} - - -def export_convo_history_json(course_name: str, from_date='', to_date=''): - """ - This function exports the conversation history to a csv file. - Args: - course_name (str): The name of the course. - from_date (str, optional): The start date for the data export. Defaults to ''. - to_date (str, optional): The end date for the data export. Defaults to ''. 
- """ - print("Exporting conversation history to csv file...") - - if from_date == '' and to_date == '': - # Get all data - print("No dates") - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).order('id', desc=False).execute() - elif from_date != '' and to_date == '': - print("only from_date") - # Get data from from_date to now - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).gte('created_at', from_date).order('id', desc=False).execute() - elif from_date == '' and to_date != '': - print("only to_date") - # Get data from beginning to to_date - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).lte('created_at', to_date).order('id', desc=False).execute() - else: - print("both from_date and to_date") - # Get data from from_date to to_date - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).gte('created_at', from_date).lte('created_at', to_date).order('id', - desc=False).execute() - - if response.count > 1000: - # call background task to upload to s3 - filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" - # background task of downloading data - map it with above ID - executor = ProcessPoolExecutor() - executor.submit(export_data_in_bg, response, "conversations", course_name, s3_filepath) - return {"response": 'Download from S3', "s3_path": s3_filepath} - - # Fetch data - if response.count > 0: - print("id count greater than zero") - first_id = response.data[0]['id'] - last_id = response.data[-1]['id'] - total_count = response.count - - filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.csv' - file_path = os.path.join(os.getcwd(), filename) - curr_count = 0 - # Fetch data in batches of 25 from first_id to last_id - while curr_count < total_count: - print("Fetching data from id: ", first_id) - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte( - 'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute() - # Convert to pandas dataframe - df = pd.DataFrame(response.data) - curr_count += len(response.data) - - # Append to csv file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records', lines=True) - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - # Update first_id - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - print("updated first_id: ", first_id) - - # Download file - try: - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - os.remove(file_path) - - return {"response": (zip_file_path, zip_filename, os.getcwd())} - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file!"} - else: - return {"response": "No data found between the given dates."} diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/extreme_context_stuffing.py deleted file mode 100644 index 2f0f64f7..00000000 --- a/ai_ta_backend/extreme_context_stuffing.py +++ /dev/null @@ -1,550 +0,0 @@ -""" -API REQUEST PARALLEL PROCESSOR - -Using the OpenAI API to process lots of text 
quickly takes some care. -If you trickle in a million API requests one by one, they'll take days to complete. -If you flood a million API requests in parallel, they'll exceed the rate limits and fail with errors. -To maximize throughput, parallel requests need to be throttled to stay under rate limits. - -This script parallelizes requests to the OpenAI API while throttling to stay under rate limits. - -Features: -- Streams requests from file, to avoid running out of memory for giant jobs -- Makes requests concurrently, to maximize throughput -- Throttles request and token usage, to stay under rate limits -- Retries failed requests up to {max_attempts} times, to avoid missing data -- Logs errors, to diagnose problems with requests - -Example command to call script: -``` -python examples/api_request_parallel_processor.py \ - --requests_filepath examples/data/example_requests_to_parallel_process.jsonl \ - --save_filepath examples/data/example_requests_to_parallel_process_results.jsonl \ - --request_url https://api.openai.com/v1/embeddings \ - --max_requests_per_minute 1500 \ - --max_tokens_per_minute 6250000 \ - --token_encoding_name cl100k_base \ - --max_attempts 5 \ - --logging_level 20 -``` - -Inputs: -- requests_filepath : str - - path to the file containing the requests to be processed - - file should be a jsonl file, where each line is a json object with API parameters and an optional metadata field - - e.g., {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}} - - as with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically) - - an example file is provided at examples/data/example_requests_to_parallel_process.jsonl - - the code to generate the example file is appended to the bottom of this script -- save_filepath : str, optional - - path to the file where the results will be saved - - file will be a jsonl file, where each line is an array with the original request plus the API response - - e.g., [{"model": "text-embedding-ada-002", "input": "embed me"}, {...}] - - if omitted, results will be saved to {requests_filename}_results.jsonl -- request_url : str, optional - - URL of the API endpoint to call - - if omitted, will default to "https://api.openai.com/v1/embeddings" -- api_key : str, optional - - API key to use - - if omitted, the script will attempt to read it from an environment variable {os.getenv("OPENAI_API_KEY")} -- max_requests_per_minute : float, optional - - target number of requests to make per minute (will make less if limited by tokens) - - leave headroom by setting this to 50% or 75% of your limit - - if requests are limiting you, try batching multiple embeddings or completions into one request - - if omitted, will default to 1,500 -- max_tokens_per_minute : float, optional - - target number of tokens to use per minute (will use less if limited by requests) - - leave headroom by setting this to 50% or 75% of your limit - - if omitted, will default to 125,000 -- token_encoding_name : str, optional - - name of the token encoding used, as defined in the `tiktoken` package - - if omitted, will default to "cl100k_base" (used by `text-embedding-ada-002`) -- max_attempts : int, optional - - number of times to retry a failed request before giving up - - if omitted, will default to 5 -- logging_level : int, optional - - level of logging to use; higher numbers will log fewer messages - - 40 = ERROR; will log only when requests fail after all retries - - 30 = WARNING; will log when requests 
his rate limits or other errors - - 20 = INFO; will log when requests start and the status at finish - - 10 = DEBUG; will log various things as the loop runs to see when they occur - - if omitted, will default to 20 (INFO). - -The script is structured as follows: - - Imports - - Define main() - - Initialize things - - In main loop: - - Get next request if one is not already waiting for capacity - - Update available token & request capacity - - If enough capacity available, call API - - The loop pauses if a rate limit error is hit - - The loop breaks when no tasks remain - - Define dataclasses - - StatusTracker (stores script metadata counters; only one instance is created) - - APIRequest (stores API inputs, outputs, metadata; one method to call API) - - Define functions - - api_endpoint_from_url (extracts API endpoint from request URL) - - append_to_jsonl (writes to results file) - - num_tokens_consumed_from_request (bigger function to infer token usage from request) - - task_id_generator_function (yields 1, 2, 3, ...) - - Run main() -""" - -# import argparse -# import subprocess -# import tempfile -# from langchain.llms import OpenAI -import asyncio -import json -import logging - -# import os -import re -import time - -# for storing API inputs, outputs, and metadata -from dataclasses import dataclass, field -from typing import Any, List - -import aiohttp # for making API calls concurrently -import tiktoken # for counting tokens - -# from langchain.embeddings.openai import OpenAIEmbeddings -# from langchain.vectorstores import Qdrant -# from qdrant_client import QdrantClient, models - - -class OpenAIAPIProcessor: - - def __init__(self, input_prompts_list, request_url, api_key, max_requests_per_minute, max_tokens_per_minute, - token_encoding_name, max_attempts, logging_level): - self.request_url = request_url - self.api_key = api_key - self.max_requests_per_minute = max_requests_per_minute - self.max_tokens_per_minute = max_tokens_per_minute - self.token_encoding_name = token_encoding_name - self.max_attempts = max_attempts - self.logging_level = logging_level - self.input_prompts_list: List[dict] = input_prompts_list - self.results = [] - self.cleaned_results: List[str] = [] - - async def process_api_requests_from_file(self): - """Processes API requests in parallel, throttling to stay under rate limits.""" - # constants - seconds_to_pause_after_rate_limit_error = 15 - seconds_to_sleep_each_loop = 0.001 # 1 ms limits max throughput to 1,000 requests per second - - # initialize logging - logging.basicConfig(level=self.logging_level) - logging.debug(f"Logging initialized at level {self.logging_level}") - - # infer API endpoint and construct request header - api_endpoint = api_endpoint_from_url(self.request_url) - request_header = {"Authorization": f"Bearer {self.api_key}"} - - # initialize trackers - queue_of_requests_to_retry = asyncio.Queue() - task_id_generator = task_id_generator_function() # generates integer IDs of 1, 2, 3, ... - status_tracker = StatusTracker() # single instance to track a collection of variables - next_request = None # variable to hold the next request to call - - # initialize available capacity counts - available_request_capacity = self.max_requests_per_minute - available_token_capacity = self.max_tokens_per_minute - last_update_time = time.time() - - # initialize flags - file_not_finished = True # after file is empty, we'll skip reading it - logging.debug("Initialization complete.") - - requests = self.input_prompts_list.__iter__() - - logging.debug("File opened. 
Entering main loop") - - task_list = [] - - while True: - # get next request (if one is not already waiting for capacity) - if next_request is None: - if not queue_of_requests_to_retry.empty(): - next_request = queue_of_requests_to_retry.get_nowait() - logging.debug(f"Retrying request {next_request.task_id}: {next_request}") - elif file_not_finished: - try: - # get new request - # request_json = json.loads(next(requests)) - request_json = next(requests) - - next_request = APIRequest(task_id=next(task_id_generator), - request_json=request_json, - token_consumption=num_tokens_consumed_from_request( - request_json, api_endpoint, self.token_encoding_name), - attempts_left=self.max_attempts, - metadata=request_json.pop("metadata", None)) - status_tracker.num_tasks_started += 1 - status_tracker.num_tasks_in_progress += 1 - logging.debug(f"Reading request {next_request.task_id}: {next_request}") - except StopIteration: - # if file runs out, set flag to stop reading it - logging.debug("Read file exhausted") - file_not_finished = False - - # update available capacity - current_time = time.time() - seconds_since_update = current_time - last_update_time - available_request_capacity = min( - available_request_capacity + self.max_requests_per_minute * seconds_since_update / 60.0, - self.max_requests_per_minute, - ) - available_token_capacity = min( - available_token_capacity + self.max_tokens_per_minute * seconds_since_update / 60.0, - self.max_tokens_per_minute, - ) - last_update_time = current_time - - # if enough capacity available, call API - if next_request: - next_request_tokens = next_request.token_consumption - if (available_request_capacity >= 1 and available_token_capacity >= next_request_tokens): - # update counters - available_request_capacity -= 1 - available_token_capacity -= next_request_tokens - next_request.attempts_left -= 1 - - # call API - # TODO: NOT SURE RESPONSE WILL WORK HERE - task = asyncio.create_task( - next_request.call_api( - request_url=self.request_url, - request_header=request_header, - retry_queue=queue_of_requests_to_retry, - status_tracker=status_tracker, - )) - task_list.append(task) - next_request = None # reset next_request to empty - - # print("status_tracker.num_tasks_in_progress", status_tracker.num_tasks_in_progress) - # one_task_result = task.result() - # print("one_task_result", one_task_result) - - # if all tasks are finished, break - if status_tracker.num_tasks_in_progress == 0: - break - - # main loop sleeps briefly so concurrent tasks can run - await asyncio.sleep(seconds_to_sleep_each_loop) - - # if a rate limit error was hit recently, pause to cool down - seconds_since_rate_limit_error = (time.time() - status_tracker.time_of_last_rate_limit_error) - if seconds_since_rate_limit_error < seconds_to_pause_after_rate_limit_error: - remaining_seconds_to_pause = (seconds_to_pause_after_rate_limit_error - seconds_since_rate_limit_error) - await asyncio.sleep(remaining_seconds_to_pause) - # ^e.g., if pause is 15 seconds and final limit was hit 5 seconds ago - logging.warn( - f"Pausing to cool down until {time.ctime(status_tracker.time_of_last_rate_limit_error + seconds_to_pause_after_rate_limit_error)}" - ) - - # after finishing, log final status - logging.info("""Parallel processing complete. 
About to return.""") - if status_tracker.num_tasks_failed > 0: - logging.warning(f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed.") - if status_tracker.num_rate_limit_errors > 0: - logging.warning( - f"{status_tracker.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate.") - - # asyncio wait for task_list - await asyncio.wait(task_list) - - for task in task_list: - openai_completion = task.result() - self.results.append(openai_completion) - - self.cleaned_results: List[str] = extract_context_from_results(self.results) - - -def extract_context_from_results(results: List[Any]) -> List[str]: - assistant_contents = [] - total_prompt_tokens = 0 - total_completion_tokens = 0 - - for element in results: - if element is not None: - for item in element: - if 'choices' in item: - for choice in item['choices']: - if choice['message']['role'] == 'assistant': - assistant_contents.append(choice['message']['content']) - total_prompt_tokens += item['usage']['prompt_tokens'] - total_completion_tokens += item['usage']['completion_tokens'] - # Note: I don't think the prompt_tokens or completion_tokens is working quite right... - - return assistant_contents - - -# dataclasses - - -@dataclass -class StatusTracker: - """Stores metadata about the script's progress. Only one instance is created.""" - - num_tasks_started: int = 0 - num_tasks_in_progress: int = 0 # script ends when this reaches 0 - num_tasks_succeeded: int = 0 - num_tasks_failed: int = 0 - num_rate_limit_errors: int = 0 - num_api_errors: int = 0 # excluding rate limit errors, counted above - num_other_errors: int = 0 - time_of_last_rate_limit_error: float = 0 # used to cool off after hitting rate limits - - -@dataclass -class APIRequest: - """Stores an API request's inputs, outputs, and other metadata. Contains a method to make an API call.""" - - task_id: int - request_json: dict - token_consumption: int - attempts_left: int - metadata: dict - result: list = field(default_factory=list) - - async def call_api( - self, - request_url: str, - request_header: dict, - retry_queue: asyncio.Queue, - status_tracker: StatusTracker, - ): - """Calls the OpenAI API and saves results.""" - # logging.info(f"Starting request #{self.task_id}") - error = None - try: - async with aiohttp.ClientSession() as session: - async with session.post(url=request_url, headers=request_header, json=self.request_json) as response: - response = await response.json() - if "error" in response: - logging.warning(f"Request {self.task_id} failed with error {response['error']}") - status_tracker.num_api_errors += 1 - error = response - if "Rate limit" in response["error"].get("message", ""): - status_tracker.time_of_last_rate_limit_error = time.time() - status_tracker.num_rate_limit_errors += 1 - status_tracker.num_api_errors -= 1 # rate limit errors are counted separately - - except Exception as e: # catching naked exceptions is bad practice, but in this case we'll log & save them - logging.warning(f"Request {self.task_id} failed with Exception {e}") - status_tracker.num_other_errors += 1 - error = e - if error: - self.result.append(error) - if self.attempts_left: - retry_queue.put_nowait(self) - else: - logging.error(f"Request {self.request_json} failed after all attempts. 
Saving errors: {self.result}") - data = ([self.request_json, [str(e) for e in self.result], self.metadata] - if self.metadata else [self.request_json, [str(e) for e in self.result]]) - #append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_failed += 1 - return data - else: - data = ([self.request_json, response, self.metadata] if self.metadata else [self.request_json, response] - ) # type: ignore - #append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_succeeded += 1 - # logging.debug(f"Request {self.task_id} saved to {save_filepath}") - - return data - - -# functions - - -def api_endpoint_from_url(request_url: str): - """Extract the API endpoint from the request URL.""" - if 'text-embedding-ada-002' in request_url: - return 'embeddings' - else: - match = re.search('^https://[^/]+/v\\d+/(.+)$', request_url) - return match[1] # type: ignore - - -def append_to_jsonl(data, filename: str) -> None: - """Append a json payload to the end of a jsonl file.""" - json_string = json.dumps(data) - with open(filename, "a") as f: - f.write(json_string + "\n") - - -def num_tokens_consumed_from_request( - request_json: dict, - api_endpoint: str, - token_encoding_name: str, -): - """Count the number of tokens in the request. Only supports completion and embedding requests.""" - encoding = tiktoken.get_encoding(token_encoding_name) - # if completions request, tokens = prompt + n * max_tokens - if api_endpoint.endswith("completions"): - max_tokens = request_json.get("max_tokens", 15) - n = request_json.get("n", 1) - completion_tokens = n * max_tokens - - # chat completions - if api_endpoint.startswith("chat/"): - num_tokens = 0 - for message in request_json["messages"]: - num_tokens += 4 # every message follows {role/name}\n{content}\n - for key, value in message.items(): - num_tokens += len(encoding.encode(value)) - if key == "name": # if there's a name, the role is omitted - num_tokens -= 1 # role is always required and always 1 token - num_tokens += 2 # every reply is primed with assistant - return num_tokens + completion_tokens - # normal completions - else: - prompt = request_json["prompt"] - if isinstance(prompt, str): # single prompt - prompt_tokens = len(encoding.encode(prompt)) - num_tokens = prompt_tokens + completion_tokens - return num_tokens - elif isinstance(prompt, list): # multiple prompts - prompt_tokens = sum([len(encoding.encode(p)) for p in prompt]) - num_tokens = prompt_tokens + completion_tokens * len(prompt) - return num_tokens - else: - raise TypeError('Expecting either string or list of strings for "prompt" field in completion request') - # if embeddings request, tokens = input tokens - elif api_endpoint == "embeddings": - input = request_json["input"] - if isinstance(input, str): # single input - num_tokens = len(encoding.encode(input)) - return num_tokens - elif isinstance(input, list): # multiple inputs - num_tokens = sum([len(encoding.encode(i)) for i in input]) - return num_tokens - else: - raise TypeError('Expecting either string or list of strings for "inputs" field in embedding request') - # more logic needed to support other API calls (e.g., edits, inserts, DALL-E) - else: - raise NotImplementedError(f'API endpoint "{api_endpoint}" not implemented in this script') - - -def task_id_generator_function(): - """Generate integers 0, 1, 2, and so on.""" - task_id = 0 - while True: - yield task_id - task_id += 1 - - -if __name__ == '__main__': - pass - - # run script - # if 
__name__ == "__main__": - # qdrant_client = QdrantClient( - # url=os.getenv('QDRANT_URL'), - # api_key=os.getenv('QDRANT_API_KEY'), - # ) - # vectorstore = Qdrant( - # client=qdrant_client, - # collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore - # embeddings=OpenAIEmbeddings()) # type: ignore - - # user_question = "What is the significance of Six Sigma?" - # k = 4 - # fetch_k = 200 - # found_docs = vectorstore.max_marginal_relevance_search(user_question, k=k, fetch_k=200) - - # requests = [] - # for i, doc in enumerate(found_docs): - # dictionary = { - # "model": "gpt-3.5-turbo-0613", # 4k context - # "messages": [{ - # "role": "system", - # "content": "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." - # }, { - # "role": - # "user", - # "content": - # f"What is a comprehensive summary of the given text, based on the question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points only relevant to the question, while also condensing the information into a concise and easy-to-understand format. Please ensure that the summary includes relevant details and examples that support the main ideas, while avoiding any unnecessary information or repetition. Feel free to include references, sentence fragments, keywords, or anything that could help someone learn about it, only as it relates to the given question. The length of the summary should be as short as possible, without losing relevant information.\n" - # }], - # "n": 1, - # "max_tokens": 500, - # "metadata": doc.metadata - # } - # requests.append(dictionary) - - # oai = OpenAIAPIProcessor( - # input_prompts_list=requests, - # request_url='https://api.openai.com/v1/chat/completions', - # api_key=os.getenv("OPENAI_API_KEY"), - # max_requests_per_minute=1500, - # max_tokens_per_minute=90000, - # token_encoding_name='cl100k_base', - # max_attempts=5, - # logging_level=20, - # ) - # # run script - # asyncio.run(oai.process_api_requests_from_file()) - - # assistant_contents = [] - # total_prompt_tokens = 0 - # total_completion_tokens = 0 - - # print("Results, end of main: ", oai.results) - # print("-"*50) - - # # jsonObject = json.loads(oai.results) - # for element in oai.results: - # for item in element: - # if 'choices' in item: - # for choice in item['choices']: - # if choice['message']['role'] == 'assistant': - # assistant_contents.append(choice['message']['content']) - # total_prompt_tokens += item['usage']['prompt_tokens'] - # total_completion_tokens += item['usage']['completion_tokens'] - - # print("Assistant Contents:", assistant_contents) - # print("Total Prompt Tokens:", total_prompt_tokens) - # print("Total Completion Tokens:", total_completion_tokens) - # turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002) - # print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ", turbo_total_cost) - - # gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06) - # print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ", gpt4_total_cost) - # print("GPT-4 cost premium: ", (gpt4_total_cost / turbo_total_cost), "x") - ''' - Pricing: - GPT4: - * $0.03 prompt - * $0.06 completions - 3.5-turbo: - * $0.0015 prompt - * $0.002 completions - ''' -""" -APPENDIX - 
-The example requests file at openai-cookbook/examples/data/example_requests_to_parallel_process.jsonl contains 10,000 requests to text-embedding-ada-002. - -It was generated with the following code: - -```python -import json - -filename = "data/example_requests_to_parallel_process.jsonl" -n_requests = 10_000 -jobs = [{"model": "text-embedding-ada-002", "input": str(x) + "\n"} for x in range(n_requests)] -with open(filename, "w") as f: - for job in jobs: - json_string = json.dumps(job) - f.write(json_string + "\n") -``` - -As with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically). -""" diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 3c87f04b..d72442af 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -1,10 +1,7 @@ -import gc import os import threading import time from typing import List -import requests -from threading import Thread from dotenv import load_dotenv from flask import ( @@ -15,30 +12,20 @@ make_response, request, send_from_directory, - stream_with_context, ) from flask_cors import CORS from flask_executor import Executor -from posthog import Posthog -# import ray -import sentry_sdk - -from ai_ta_backend.canvas import CanvasAPI - -from ai_ta_backend.export_data import export_convo_history_json, export_documents_json, check_s3_path_and_download -from ai_ta_backend.nomic_logging import get_nomic_map, log_convo_to_nomic, create_document_map -from ai_ta_backend.vector_database import Ingest -from ai_ta_backend.web_scrape import WebScrape, mit_course_download - -# Sentry.io error logging -sentry_sdk.init( - dsn=os.getenv("SENTRY_DSN"), - # Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring. - traces_sample_rate=1.0, - # Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions. - # We recommend adjusting this value in production. 
- profiles_sample_rate=1.0, - enable_tracing=True) +from flask_injector import FlaskInjector, RequestScope +from injector import Binder, SingletonScope + +from ai_ta_backend.database.aws import AWSStorage +from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.database.vector import VectorDatabase +from ai_ta_backend.service.export_service import ExportService +from ai_ta_backend.service.nomic_service import NomicService +from ai_ta_backend.service.posthog_service import PosthogService +from ai_ta_backend.service.retrieval_service import RetrievalService +from ai_ta_backend.service.sentry_service import SentryService app = Flask(__name__) CORS(app) @@ -49,10 +36,6 @@ # load API keys from globally-availabe .env file load_dotenv() -# ray.init() - -print("NUM ACTIVE THREADS (top of main):", threading.active_count()) - @app.route('/') def index() -> Response: @@ -69,63 +52,8 @@ def index() -> Response: return response -@app.route('/coursera', methods=['GET']) -def coursera() -> Response: - try: - course_name: str = request.args.get('course_name') # type: ignore - coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore - del ingester - - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/github', methods=['GET']) -def github() -> Response: - course_name: str = request.args.get('course_name', default='', type=str) - github_url: str = request.args.get('github_url', default='', type=str) - - if course_name == '' or github_url == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{github_url}`" - ) - - ingester = Ingest() - results = ingester.ingest_github(github_url, course_name) - del ingester - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/delete-entire-course', methods=['GET']) -def delete_entire_course() -> Response: - try: - course_name: str = request.args.get('course_name') # type: ignore - # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.delete_entire_course(course_name) # type: ignore - del ingester - - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - @app.route('/getTopContexts', methods=['GET']) -def getTopContexts() -> Response: +def getTopContexts(service: RetrievalService) -> Response: """Get most relevant contexts for a given search query. 
Return value @@ -174,197 +102,16 @@ def getTopContexts() -> Response: print("NUM ACTIVE THREADS (top of getTopContexts):", threading.active_count()) - ingester = Ingest() - found_documents = ingester.getTopContexts(search_query, course_name, token_limit) + found_documents = service.getTopContexts(search_query, course_name, token_limit) print("NUM ACTIVE THREADS (after instantiating Ingest() class in getTopContexts):", threading.active_count()) - del ingester response = jsonify(found_documents) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/get_stuffed_prompt', methods=['GET']) -def get_stuffed_prompt() -> Response: - """Get most relevant contexts for a given search query. - - ## GET arguments - course name (optional) str - A json response with TBD fields. - search_query - top_n - - Returns - ------- - String - - """ - course_name: str = request.args.get('course_name', default='', type=str) - search_query: str = request.args.get('search_query', default='', type=str) - token_limit: int = request.args.get('token_limit', default=-1, type=int) - if course_name == '' or search_query == '' or token_limit == -1: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`" - ) - - print("In /getTopContexts: ", search_query) - if search_query is None: - return jsonify({"error": "No parameter `search_query` provided. It is undefined."}) - if token_limit is None: - token_limit = 3_000 - else: - token_limit = int(token_limit) - - ingester = Ingest() - prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) - del ingester - - response = jsonify(prompt) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingest', methods=['GET']) -def ingest() -> Response: - """Recursively ingests anything from S3 filepath and below. - Pass a s3_paths filepath (not URL) into our S3 bucket. - - Ingests all files, not just PDFs. - - args: - s3_paths: str | List[str] - - Returns: - str: Success or Failure message. Failure message if any failures. TODO: email on failure. - """ - s3_paths: List[str] | str = request.args.get('s3_paths', default='') - readable_filename: List[str] | str = request.args.get('readable_filename', default='') - course_name: List[str] | str = request.args.get('course_name', default='') - base_url: List[str] | str | None = request.args.get('base_url', default=None) - url: List[str] | str | None = request.args.get('url', default=None) - - print( - f"In top of /ingest route. course: {course_name}, s3paths: {s3_paths}, readable_filename: {readable_filename}, base_url: {base_url}, url: {url}" - ) - - if course_name == '' or s3_paths == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{s3_paths}`" - ) - - print("NUM ACTIVE THREADS (top of /ingest):", threading.active_count()) - - ingester = Ingest() - if readable_filename == '': - success_fail_dict = ingester.bulk_ingest(s3_paths, course_name, base_url=base_url, url=url) - else: - success_fail_dict = ingester.bulk_ingest(s3_paths, - course_name, - readable_filename=readable_filename, - base_url=base_url, - url=url) - print(f"Bottom of /ingest route. 
success or fail dict: {success_fail_dict}") - del ingester - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingest-web-text', methods=['POST']) -def ingest_web_text() -> Response: - """Ingests web text data provided in the POST request body. - - Expects JSON data containing: - - url: The URL of the web text to ingest. - - base_url: The base URL of the web text to ingest. - - title: The title of the web text. - - content: The content of the web text. - - course_name: The name of the course associated with the web text. - - Returns: - str: Success or Failure message. Failure message if any failures. TODO: email on failure. - """ - data = request.get_json() - url: str = data.get('url', '') - base_url: str = data.get('base_url', '') - title: str = data.get('title', '') - content: str = data.get('content', '') - course_name: str = data.get('courseName', '') - - print(f"In top of /ingest-web-text. course: {course_name}, base_url: {base_url}, url: {url}") - - if course_name == '' or url == '' or title == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: course_name, url or title. Course name: `{course_name}`, url: `{url}`, content: `{content}`, title: `{title}`" - ) - - if content == '': - print(f"Content is empty. Skipping ingestion of {url}") - response = jsonify({"outcome": "success"}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - print("NUM ACTIVE THREADS (top of /ingest-web-text):", threading.active_count()) - - ingester = Ingest() - success_fail = ingester.ingest_single_web_text(course_name, base_url, url, content, title) - del ingester - - print(f"Bottom of /ingest route. success or fail dict: {success_fail}") - - response = jsonify(success_fail) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/getContextStuffedPrompt', methods=['GET']) -def getContextStuffedPrompt() -> Response: - """ - Get a stuffed prompt for a given user question and course name. - Args : - search_query (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of 20 most relevant documents. - """ - print("In /getContextStuffedPrompt") - - ingester = Ingest() - search_query: str = request.args.get('search_query', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - top_n: int = request.args.get('top_n', default=-1, type=int) - top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int) - - if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. 
Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`" - ) - - start_time = time.monotonic() - stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) - print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") - del ingester - - response = jsonify({"prompt": stuffed_prompt}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - @app.route('/getAll', methods=['GET']) -def getAll() -> Response: +def getAll(service: RetrievalService) -> Response: """Get all course materials based on the course_name """ course_name: List[str] | str = request.args.get('course_name', default='', type=str) @@ -375,9 +122,7 @@ def getAll() -> Response: 400, description=f"Missing the one required parameter: 'course_name' must be provided. Course name: `{course_name}`") - ingester = Ingest() - distinct_dicts = ingester.getAll(course_name) - del ingester + distinct_dicts = service.getAll(course_name) response = jsonify({"distinct_files": distinct_dicts}) response.headers.add('Access-Control-Allow-Origin', '*') @@ -385,7 +130,7 @@ def getAll() -> Response: @app.route('/delete', methods=['DELETE']) -def delete(): +def delete(service: RetrievalService): """ Delete a single file from all our database: S3, Qdrant, and Supabase (for now). Note, of course, we still have parts of that file in our logs. @@ -403,149 +148,18 @@ def delete(): ) start_time = time.monotonic() - ingester = Ingest() # background execution of tasks!! - executor.submit(ingester.delete_data, course_name, s3_path, source_url) + executor.submit(service.delete_data, course_name, s3_path, source_url) print(f"From {course_name}, deleted file: {s3_path}") print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds") - del ingester - # we need instant return. Delets are "best effort" assume always successful... sigh :( response = jsonify({"outcome": 'success'}) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/web-scrape', methods=['GET']) -def scrape() -> Response: - url: str = request.args.get('url', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - max_urls: int = request.args.get('max_urls', default=100, type=int) - max_depth: int = request.args.get('max_depth', default=2, type=int) - timeout: int = request.args.get('timeout', default=3, type=int) - # stay_on_baseurl = request.args.get('stay_on_baseurl', default='', type=str) - stay_on_baseurl: bool = request.args.get('stay_on_baseurl', default=True, type=lambda x: x.lower() == 'true') - depth_or_breadth: str = request.args.get('depth_or_breadth', default='breadth', type=str) - - if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'url', 'max_urls', 'max_depth', 'timeout', 'course_name', and 'stay_on_baseurl' must be provided. 
url: `{url}`, max_urls: `{max_urls}`, max_depth: `{max_depth}`, timeout: `{timeout}`, course_name: `{course_name}`, stay_on_baseurl: `{stay_on_baseurl}`" - ) - - # print all input params - print(f"Web scrape: {url}") - print(f"Max Urls: {max_urls}") - print(f"Max Depth: {max_depth}") - print(f"Stay on BaseURL: {stay_on_baseurl}") - print(f"Timeout in Seconds ⏰: {timeout}") - - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - posthog.capture('distinct_id_of_the_user', - event='web_scrape_invoked', - properties={ - 'url': url, - 'max_urls': max_urls, - 'max_depth': max_depth, - 'stay_on_baseurl': stay_on_baseurl, - 'timeout': timeout, - 'course_name': course_name, - 'depth_or_breadth': depth_or_breadth - }) - - scraper = WebScrape() - success_fail_dict = scraper.main_crawler(url, course_name, max_urls, max_depth, timeout, stay_on_baseurl, - depth_or_breadth) - del scraper - posthog.shutdown() - gc.collect() # manually invoke garbage collection, try to reduce memory on Railway $$$ - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/mit-download', methods=['GET']) -def mit_download_course() -> Response: - """ Web scraper built for - """ - url: str = request.args.get('url', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - local_dir: str = request.args.get('local_dir', default='', type=str) - - if url == '' or course_name == '' or local_dir == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. url: `{url}`, course_name: `{course_name}`, local_dir: `{local_dir}`" - ) - - success_fail = mit_course_download(url, course_name, local_dir) - - response = jsonify(success_fail) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/addCanvasUsers', methods=['GET']) -def add_canvas_users(): - """ - Add users from canvas to the course - """ - print("In /addCanvasUsers") - - canvas = CanvasAPI() - canvas_course_id: str = request.args.get('course_id') - course_name: str = request.args.get('course_name') - - success_or_failure = canvas.add_users(canvas_course_id, course_name) - - response = jsonify({"outcome": success_or_failure}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingestCanvas', methods=['GET']) -def ingest_canvas(): - """ - Ingest course content from Canvas - """ - print("made it to ingest") - canvas = CanvasAPI() - canvas_course_id: str = request.args.get('course_id') - course_name: str = request.args.get('course_name') - - # Retrieve the checkbox values from the request and create the content_ingest_dict - # Set default values to True if not provided in the request - content_ingest_dict = { - 'files': request.args.get('files', 'true').lower() == 'true', - 'pages': request.args.get('pages', 'true').lower() == 'true', - 'modules': request.args.get('modules', 'true').lower() == 'true', - 'syllabus': request.args.get('syllabus', 'true').lower() == 'true', - 'assignments': request.args.get('assignments', 'true').lower() == 'true', - 'discussions': request.args.get('discussions', 'true').lower() == 'true' - } - - if canvas_course_id == '' or course_name == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_id' and 
'course_name' must be provided. course_id: `{canvas_course_id}`, course_name: `{course_name}`" - ) - - success_or_failure = canvas.ingest_course_content(canvas_course_id, course_name, content_ingest_dict) - response = jsonify({"outcome": success_or_failure}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - @app.route('/getNomicMap', methods=['GET']) -def nomic_map(): +def nomic_map(service: NomicService): course_name: str = request.args.get('course_name', default='', type=str) map_type: str = request.args.get('map_type', default='conversation', type=str) @@ -553,7 +167,7 @@ def nomic_map(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - map_id = get_nomic_map(course_name, map_type) + map_id = service.get_nomic_map(course_name, map_type) print("nomic map\n", map_id) response = jsonify(map_id) @@ -562,14 +176,14 @@ def nomic_map(): @app.route('/createDocumentMap', methods=['GET']) -def createDocumentMap(): +def createDocumentMap(service: NomicService): course_name: str = request.args.get('course_name', default='', type=str) if course_name == '': # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - map_id = create_document_map(course_name) + map_id = service.create_document_map(course_name) response = jsonify(map_id) response.headers.add('Access-Control-Allow-Origin', '*') @@ -577,7 +191,7 @@ def createDocumentMap(): @app.route('/onResponseCompletion', methods=['POST']) -def logToNomic(): +def logToNomic(service: NomicService): data = request.get_json() course_name = data['course_name'] conversation = data['conversation'] @@ -592,14 +206,14 @@ def logToNomic(): print(f"In /onResponseCompletion for course: {course_name}") # background execution of tasks!! - response = executor.submit(log_convo_to_nomic, course_name, data) + response = executor.submit(service.log_convo_to_nomic, course_name, data) response = jsonify({'outcome': 'success'}) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/export-convo-history-csv', methods=['GET']) -def export_convo_history(): +def export_convo_history(service: ExportService): course_name: str = request.args.get('course_name', default='', type=str) from_date: str = request.args.get('from_date', default='', type=str) to_date: str = request.args.get('to_date', default='', type=str) @@ -608,7 +222,7 @@ def export_convo_history(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - export_status = export_convo_history_json(course_name, from_date, to_date) + export_status = service.export_convo_history_json(course_name, from_date, to_date) print("EXPORT FILE LINKS: ", export_status) if export_status['response'] == "No data found between the given dates.": @@ -630,7 +244,7 @@ def export_convo_history(): @app.route('/exportDocuments', methods=['GET']) -def exportDocuments(): +def exportDocuments(service: ExportService): course_name: str = request.args.get('course_name', default='', type=str) from_date: str = request.args.get('from_date', default='', type=str) to_date: str = request.args.get('to_date', default='', type=str) @@ -639,7 +253,7 @@ def exportDocuments(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. 
Course name: `{course_name}`") - export_status = export_documents_json(course_name, from_date, to_date) + export_status = service.export_documents_json(course_name, from_date, to_date) print("EXPORT FILE LINKS: ", export_status) if export_status['response'] == "No data found between the given dates.": @@ -661,7 +275,7 @@ def exportDocuments(): @app.route('/getTopContextsWithMQR', methods=['GET']) -def getTopContextsWithMQR() -> Response: +def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogService) -> Response: """ Get relevant contexts for a given search query, using Multi-query retrieval + filtering method. """ @@ -676,102 +290,32 @@ def getTopContextsWithMQR() -> Response: f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`" ) - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts_invoked', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - }) + posthog_service.capture(event_name='filter_top_contexts_invoked', + properties={ + 'user_query': search_query, + 'course_name': course_name, + 'token_limit': token_limit, + }) - ingester = Ingest() - found_documents = ingester.getTopContextsWithMQR(search_query, course_name, token_limit) - del ingester - posthog.shutdown() + found_documents = service.getTopContextsWithMQR(search_query, course_name, token_limit) response = jsonify(found_documents) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/resource-report', methods=['GET']) -def resource_report() -> Response: - """ - Print server resources. 
- # https://manpages.debian.org/bookworm/manpages-dev/getrlimit.2.en.html - """ - import resource - from resource import getrusage, RUSAGE_SELF, RUSAGE_CHILDREN - import subprocess - - print("👇👇👇👇👇👇👇👇👇 👇👇👇👇👇👇👇👇👇") - - print("NUM ACTIVE THREADS (top of /resource-report):", threading.active_count()) - try: - # result = subprocess.run(['ps', '-u', '$(whoami)', '|', 'wc', '-l'], stdout=subprocess.PIPE) - result = subprocess.run('ps -u $(whoami) | wc -l', shell=True, stdout=subprocess.PIPE) - print("Current active threads: ", result.stdout.decode('utf-8')) - except Exception as e: - print("Error executing ulimit -a: ", e) - - try: - with open('/etc/security/limits.conf', 'r') as file: - print("/etc/security/limits.conf:\n", file.read()) - except Exception as e: - print("Error reading /etc/security/limits.conf: ", e) - - try: - with open('/proc/sys/kernel/threads-max', 'r') as file: - print("/proc/sys/kernel/threads-max: ", file.read()) - except Exception as e: - print("Error reading /proc/sys/kernel/threads-max: ", e) - - # Check container or virtualization platform limits if applicable - # This is highly dependent on the specific platform and setup - # Here is an example for Docker, adjust as needed for your environment - try: - result = subprocess.run('docker stats --no-stream', shell=True, stdout=subprocess.PIPE) - print("Docker stats:\n", result.stdout.decode('utf-8')) - except Exception as e: - print("Error getting Docker stats: ", e) - - print("RLIMIT_NPROC: ", resource.getrlimit(resource.RLIMIT_NPROC)) - print("RLIMIT_AS (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_AS)]) - print("RLIMIT_DATA (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_DATA)]) - print("RLIMIT_MEMLOCK (GB): ", - [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_MEMLOCK) - ]) # The maximum address space which may be locked in memory. 
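The refactored /getTopContextsWithMQR handler above now records analytics through an injected PosthogService instead of constructing a Posthog client inline. The actual service/posthog_service.py added by this patch is not shown in this hunk, so the following is only a minimal sketch of such a wrapper, inferred from the call site `posthog_service.capture(event_name=..., properties=...)` and the previous inline `Posthog(...)` usage; the real implementation may differ.

```python
import os

from posthog import Posthog


class PosthogService:
  """Hypothetical singleton wrapper around the Posthog client (sketch only)."""

  def __init__(self):
    # Same constructor arguments the old inline code passed to Posthog.
    self.posthog = Posthog(sync_mode=True,
                           project_api_key=os.environ['POSTHOG_API_KEY'],
                           host='https://app.posthog.com')

  def capture(self, event_name: str, properties: dict):
    # Mirrors the old posthog.capture('distinct_id_of_the_user', event=..., properties=...) call.
    self.posthog.capture('distinct_id_of_the_user', event=event_name, properties=properties)
```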
- print("RLIMIT_STACK (MB): ", [limit / (1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_STACK)]) - print("getpagesize (MB): ", resource.getpagesize() / (1024 * 1024)) - - print("RUSAGE_SELF", getrusage(RUSAGE_SELF), end="\n") - print("RUSAGE_CHILDREN", getrusage(RUSAGE_CHILDREN), end="\n") - - try: - result = subprocess.run('ulimit -u', shell=True, stdout=subprocess.PIPE) - print("ulimit -u: ", result.stdout.decode('utf-8')) - except Exception as e: - print("Error executing ulimit -u: ", e) - - try: - result = subprocess.run('ulimit -a', shell=True, stdout=subprocess.PIPE) - print(f"ulimit -a:\n{result.stdout.decode('utf-8')}") - except Exception as e: - print("Error executing ulimit -a: ", e) - - try: - print("RUSAGE_THREAD: ", resource.getrlimit(resource.RUSAGE_THREAD)) - except Exception as e: - pass - # print("Error in RUSAGE_THREAD: ", e) - - print("👆👆👆👆👆👆👆👆👆 👆👆👆👆👆👆👆👆👆") - - response = jsonify({"outcome": "success"}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response +def configure(binder: Binder) -> None: + binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) + binder.bind(PosthogService, to=PosthogService, scope=SingletonScope) + binder.bind(SentryService, to=SentryService, scope=SingletonScope) + binder.bind(NomicService, to=NomicService, scope=SingletonScope) + binder.bind(ExportService, to=ExportService, scope=SingletonScope) + binder.bind(VectorDatabase, to=VectorDatabase, scope=SingletonScope) + binder.bind(SQLDatabase, to=SQLDatabase, scope=SingletonScope) + binder.bind(AWSStorage, to=AWSStorage, scope=SingletonScope) + +FlaskInjector(app=app, modules=[configure]) if __name__ == '__main__': app.run(debug=True, port=int(os.getenv("PORT", default=8000))) # nosec -- reasonable bandit error suppression diff --git a/ai_ta_backend/nomic_logging.py b/ai_ta_backend/nomic_logging.py deleted file mode 100644 index cf5bc699..00000000 --- a/ai_ta_backend/nomic_logging.py +++ /dev/null @@ -1,738 +0,0 @@ -import datetime -import os -import time - -import nomic -import numpy as np -import pandas as pd -import supabase -from langchain.embeddings import OpenAIEmbeddings -from nomic import AtlasProject, atlas -import sentry_sdk -import backoff -import json - -OPENAI_API_TYPE = "azure" - - -SUPABASE_CLIENT = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - -LOCK_EXCEPTIONS = ['Project is locked for state access! Please wait until the project is unlocked to access embeddings.', - 'Project is locked for state access! Please wait until the project is unlocked to access data.', - 'Project is currently indexing and cannot ingest new datums. Try again later.'] - -def giveup_hdlr(e): - """ - Function to handle giveup conditions in backoff decorator - Args: - e: Exception raised by the decorated function - Returns: - True if we want to stop retrying, False otherwise - """ - (e_args,) = e.args - e_str = e_args['exception'] - - print("giveup_hdlr() called with exception:", e_str) - if e_str in LOCK_EXCEPTIONS: - return False - else: - sentry_sdk.capture_exception(e) - return True - - -def backoff_hdlr(details): - """ - Function to handle backup conditions in backoff decorator. - Currently just prints the details of the backoff. 
- """ - print( - "\nBacking off {wait:0.1f} seconds after {tries} tries, calling function {target} with args {args} and kwargs {kwargs}" - .format(**details)) - - -def backoff_strategy(): - """ - Function to define retry strategy. Is usualy defined in the decorator, - but passing parameters to it is giving errors. - """ - return backoff.expo(base=10, factor=1.5) - - -@backoff.on_exception(backoff_strategy, - Exception, - max_tries=5, - raise_on_giveup=False, - giveup=giveup_hdlr, - on_backoff=backoff_hdlr) -def log_convo_to_nomic(course_name: str, conversation) -> str: - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - """ - Logs conversation to Nomic. - 1. Check if map exists for given course - 2. Check if conversation ID exists - - if yes, delete and add new data point - - if no, add new data point - 3. Keep current logic for map doesn't exist - update metadata - """ - - print(f"in log_convo_to_nomic() for course: {course_name}") - print("type of conversation:", type(conversation)) - #conversation = json.loads(conversation) - messages = conversation['conversation']['messages'] - if 'user_email' not in conversation['conversation']: - user_email = "NULL" - else: - user_email = conversation['conversation']['user_email'] - conversation_id = conversation['conversation']['id'] - - # we have to upload whole conversations - # check what the fetched data looks like - pandas df or pyarrow table - # check if conversation ID exists in Nomic, if yes fetch all data from it and delete it. - # will have current QA and historical QA from Nomic, append new data and add_embeddings() - - project_name = NOMIC_MAP_NAME_PREFIX + course_name - start_time = time.monotonic() - emoji = "" - - try: - # fetch project metadata and embbeddings - project = AtlasProject(name=project_name, add_datums_if_exists=True) - - map_metadata_df = project.maps[1].data.df # type: ignore - map_embeddings_df = project.maps[1].embeddings.latent - # create a function which returns project, data and embeddings df here - map_metadata_df['id'] = map_metadata_df['id'].astype(int) - last_id = map_metadata_df['id'].max() - - if conversation_id in map_metadata_df.values: - # store that convo metadata locally - prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] - prev_index = prev_data.index.values[0] - embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) - prev_convo = prev_data['conversation'].values[0] - prev_id = prev_data['id'].values[0] - created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') - - # delete that convo data point from Nomic, and print result - print("Deleting point from nomic:", project.delete_data([str(prev_id)])) - - # prep for new point - first_message = prev_convo.split("\n")[1].split(": ")[1] - - # select the last 2 messages and append new convo to prev convo - messages_to_be_logged = messages[-2:] - for message in messages_to_be_logged: - if message['role'] == 'user': - emoji = "🙋 " - else: - emoji = "🤖 " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # update metadata - metadata = [{ - "course": course_name, - "conversation": prev_convo, - "conversation_id": conversation_id, - "id": last_id + 1, - "user_email": 
user_email, - "first_query": first_message, - "created_at": created_at, - "modified_at": current_time - }] - else: - print("conversation_id does not exist") - - # add new data point - user_queries = [] - conversation_string = "" - - first_message = messages[0]['content'] - if isinstance(first_message, list): - first_message = first_message[0]['text'] - user_queries.append(first_message) - - for message in messages: - if message['role'] == 'user': - emoji = "🙋 " - else: - emoji = "🤖 " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - metadata = [{ - "course": course_name, - "conversation": conversation_string, - "conversation_id": conversation_id, - "id": last_id + 1, - "user_email": user_email, - "first_query": first_message, - "created_at": current_time, - "modified_at": current_time - }] - - # create embeddings - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore - embeddings = embeddings_model.embed_documents(user_queries) - - # add embeddings to the project - create a new function for this - project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) - with project.wait_for_project_lock(): - project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) - project.rebuild_maps() - - print(f"⏰ Nomic logging runtime: {(time.monotonic() - start_time):.2f} seconds") - return f"Successfully logged for {course_name}" - - except Exception as e: - if str(e) == 'You must specify a unique_id_field when creating a new project.': - print("Attempting to create Nomic map...") - result = create_nomic_map(course_name, conversation) - print("result of create_nomic_map():", result) - else: - # raising exception again to trigger backoff and passing parameters to use in create_nomic_map() - raise Exception({"exception": str(e)}) - - -def get_nomic_map(course_name: str, type: str): - - """ - Returns the variables necessary to construct an iframe of the Nomic map given a course name. - We just need the ID and URL. - Example values: - map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 - map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 - """ - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - if type.lower() == 'document': - NOMIC_MAP_NAME_PREFIX = 'Document Map for ' - else: - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - - project_name = NOMIC_MAP_NAME_PREFIX + course_name - start_time = time.monotonic() - - try: - project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) - map = project.get_map(project_name) - - print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds") - return {"map_id": f"iframe{map.id}", "map_link": map.map_link} - except Exception as e: - # Error: ValueError: You must specify a unique_id_field when creating a new project. 
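The deleted log_convo_to_nomic() above (and its replacement in NomicService later in this patch) always ends with the same upload pattern: embed the user query with OpenAIEmbeddings, then append the datum to the Atlas project while holding the project lock and rebuild the map. A condensed sketch of that pattern, using only calls that appear in the surrounding code:

```python
import os

import nomic
import numpy as np
import pandas as pd
from langchain.embeddings import OpenAIEmbeddings
from nomic import AtlasProject


def append_convo_datum(project_name: str, metadata_row: dict, user_query: str) -> None:
  """Embed one query and add it to an existing Atlas project (sketch of the pattern above)."""
  nomic.login(os.getenv('NOMIC_API_KEY'))
  embeddings = OpenAIEmbeddings(openai_api_type="azure").embed_documents([user_query])
  project = AtlasProject(name=project_name, add_datums_if_exists=True)
  with project.wait_for_project_lock():
    project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame([metadata_row]))
    project.rebuild_maps()
```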
- if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore - print("Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ", e) - else: - print("ERROR in get_nomic_map():", e) - sentry_sdk.capture_exception(e) - return {"map_id": None, "map_link": None} - - -def create_nomic_map(course_name: str, log_data: list): - """ - Creates a Nomic map for new courses and those which previously had < 20 queries. - 1. fetches supabase conversations for course - 2. appends current embeddings and metadata to it - 2. creates map if there are at least 20 queries - """ - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - - print(f"in create_nomic_map() for {course_name}") - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - - try: - # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) - response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() - data = response.data - df = pd.DataFrame(data) - - if len(data) < 19: - return None - else: - # get all queries for course and create metadata - user_queries = [] - metadata = [] - i = 1 - conversation_exists = False - - # current log details - log_messages = log_data['conversation']['messages'] # type: ignore - log_user_email = log_data['conversation']['user_email'] # type: ignore - log_conversation_id = log_data['conversation']['id'] # type: ignore - - for _index, row in df.iterrows(): - user_email = row['user_email'] - created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') - convo = row['convo'] - messages = convo['messages'] - - first_message = messages[0]['content'] - if isinstance(first_message, list): - first_message = first_message[0]['text'] - - user_queries.append(first_message) - - # create metadata for multi-turn conversation - conversation = "" - for message in messages: - # string of role: content, role: content, ... 
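get_nomic_map() above only returns the map id and link; the iframe itself is assembled client-side. A small usage sketch of the corresponding /getNomicMap route from main.py, where the base URL and course name are placeholders (port 8000 matches the app.run() default):

```python
import requests

resp = requests.get("http://localhost:8000/getNomicMap",
                    params={"course_name": "example-course", "map_type": "conversation"},
                    timeout=30)
print(resp.json())  # e.g. {"map_id": "iframe<uuid>", "map_link": "https://atlas.nomic.ai/map/..."}
```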
- if message['role'] == 'user': # type: ignore - emoji = "🙋 " - else: - emoji = "🤖 " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # append current chat to previous chat if convo already exists - if convo['id'] == log_conversation_id: - conversation_exists = True - - for m in log_messages: - if m['role'] == 'user': # type: ignore - emoji = "🙋 " - else: - emoji = "🤖 " - - if isinstance(m['content'], list): - text = m['content'][0]['text'] - else: - text = m['content'] - conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" - - # adding modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # add to metadata - metadata_row = { - "course": row['course_name'], - "conversation": conversation, - "conversation_id": convo['id'], - "id": i, - "user_email": user_email, - "first_query": first_message, - "created_at": created_at, - "modified_at": current_time - } - metadata.append(metadata_row) - i += 1 - - # add current log as a new data point if convo doesn't exist - if not conversation_exists: - user_queries.append(log_messages[0]['content']) - conversation = "" - for message in log_messages: - if message['role'] == 'user': - emoji = "🙋 " - else: - emoji = "🤖 " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # adding timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - metadata_row = { - "course": course_name, - "conversation": conversation, - "conversation_id": log_conversation_id, - "id": i, - "user_email": log_user_email, - "first_query": log_messages[0]['content'], - "created_at": current_time, - "modified_at": current_time - } - metadata.append(metadata_row) - - metadata = pd.DataFrame(metadata) - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore - embeddings = embeddings_model.embed_documents(user_queries) - - # create Atlas project - project_name = NOMIC_MAP_NAME_PREFIX + course_name - index_name = course_name + "_convo_index" - project = atlas.map_embeddings( - embeddings=np.array(embeddings), - data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete - id_field='id', - build_topic_model=True, - topic_label_field='first_query', - name=project_name, - colorable_fields=['conversation_id', 'first_query']) - project.create_index(index_name, build_topic_model=True) - return f"Successfully created Nomic map for {course_name}" - except Exception as e: - # Error: ValueError: You must specify a unique_id_field when creating a new project. - if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore - print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e) - else: - print("ERROR in create_nomic_map():", e) - sentry_sdk.capture_exception(e) - - return "failed" - -## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ## - -def create_document_map(course_name: str): - """ - This is a function which creates a document map for a given course from scratch - 1. Gets count of documents for the course - 2. If less than 20, returns a message that a map cannot be created - 3. 
If greater than 20, iteratively fetches documents in batches of 25 - 4. Prepares metadata and embeddings for nomic upload - 5. Creates a new map and uploads the data - - Args: - course_name: str - Returns: - str: success or failed - """ - print("in create_document_map()") - nomic.login(os.getenv('NOMIC_API_KEY')) - NOMIC_MAP_NAME_PREFIX = 'Document Map for ' - - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - - try: - # check if map exists - response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - return "Map already exists for this course." - - # fetch relevant document data from Supabase - response = supabase_client.table("documents").select("id", count="exact").eq("course_name", course_name).order('id', desc=False).execute() - if not response.count: - return "No documents found for this course." - - total_doc_count = response.count - print("Total number of documents in Supabase: ", total_doc_count) - - # minimum 20 docs needed to create map - if total_doc_count > 19: - - first_id = response.data[0]['id'] - combined_dfs = [] - curr_total_doc_count = 0 - doc_count = 0 - first_batch = True - - # iteratively query in batches of 25 - while curr_total_doc_count < total_doc_count: - - response = supabase_client.table("documents").select("id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( - 'id', first_id).order('id', desc=False).limit(25).execute() - df = pd.DataFrame(response.data) - combined_dfs.append(df) # list of dfs - - curr_total_doc_count += len(response.data) - doc_count += len(response.data) - - if doc_count >= 1000: # upload to Nomic every 1000 docs - - # concat all dfs from the combined_dfs list - final_df = pd.concat(combined_dfs, ignore_index=True) - - # prep data for nomic upload - embeddings, metadata = data_prep_for_doc_map(final_df) - - if first_batch: - # create a new map - print("Creating new map...") - project_name = NOMIC_MAP_NAME_PREFIX + course_name - index_name = course_name + "_doc_index" - topic_label_field = "text" - colorable_fields = ["readable_filename", "text"] - result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) - # update flag - first_batch = False - - else: - # append to existing map - print("Appending data to existing map...") - project_name = NOMIC_MAP_NAME_PREFIX + course_name - # add project lock logic here - result = append_to_map(embeddings, metadata, project_name) - - # reset variables - combined_dfs = [] - doc_count = 0 - - # set first_id for next iteration - first_id = response.data[-1]['id'] + 1 - - # upload last set of docs - final_df = pd.concat(combined_dfs, ignore_index=True) - embeddings, metadata = data_prep_for_doc_map(final_df) - project_name = NOMIC_MAP_NAME_PREFIX + course_name - if first_batch: - index_name = course_name + "_doc_index" - topic_label_field = "text" - colorable_fields = ["readable_filename", "text"] - result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) - else: - result = append_to_map(embeddings, metadata, project_name) - print("Atlas upload status: ", result) - - # log info to supabase - project = AtlasProject(name=project_name, add_datums_if_exists=True) - project_id = project.id - project.rebuild_maps() - project_info = {'course_name': course_name, 
'doc_map_id': project_id} - response = supabase_client.table("projects").insert(project_info).execute() - print("Response from supabase: ", response) - return "success" - else: - return "Cannot create a map because there are less than 20 documents in the course." - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "failed" - - -def delete_from_document_map(course_name: str, ids: list): - """ - This function is used to delete datapoints from a document map. - Currently used within the delete_data() function in vector_database.py - Args: - course_name: str - ids: list of str - """ - print("in delete_from_document_map()") - - try: - # check if project exists - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - project_id = response.data[0]['doc_map_id'] - else: - return "No document map found for this course" - - # fetch project from Nomic - project = AtlasProject(project_id=project_id, add_datums_if_exists=True) - - # delete the ids from Nomic - print("Deleting point from document map:", project.delete_data(ids)) - with project.wait_for_project_lock(): - project.rebuild_maps() - return "Successfully deleted from Nomic map" - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Error in deleting from document map: {e}" - -def log_to_document_map(data: dict): - """ - This is a function which appends new documents to an existing document map. It's called - at the end of split_and_upload() after inserting data to Supabase. - Args: - data: dict - the response data from Supabase insertion - """ - print("in add_to_document_map()") - - try: - # check if map exists - course_name = data['course_name'] - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - project_id = response.data[0]['doc_map_id'] - else: - # create a map - map_creation_result = create_document_map(course_name) - if map_creation_result != "success": - return "The project has less than 20 documents and a map cannot be created." 
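create_document_map() above pulls documents from Supabase 25 rows at a time, buffers them, and only pushes to Nomic once roughly 1000 documents have accumulated, creating the map on the first flush and appending on later ones. The control flow is easier to see with the Supabase and Nomic details stripped out; the fetch/prep/upload callables below are stand-ins for the helpers used above, not part of this patch:

```python
import pandas as pd


def upload_in_batches(fetch_batch, prep_for_nomic, create_map, append_to_map, total_count: int):
  """Buffer rows fetched 25 at a time; flush to Nomic every ~1000 documents (sketch)."""
  buffered, uploaded, buffered_count, first_id, first_batch = [], 0, 0, 0, True

  def flush():
    embeddings, metadata = prep_for_nomic(pd.concat(buffered, ignore_index=True))
    if first_batch:
      create_map(embeddings, metadata)
    else:
      append_to_map(embeddings, metadata)

  while uploaded < total_count:
    rows = fetch_batch(first_id, limit=25)  # stand-in for the id-ranged Supabase query
    if not rows:
      break
    buffered.append(pd.DataFrame(rows))
    uploaded += len(rows)
    buffered_count += len(rows)
    first_id = rows[-1]['id'] + 1
    if buffered_count >= 1000:
      flush()
      first_batch, buffered, buffered_count = False, [], 0

  if buffered:  # flush the final partial batch
    flush()
```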
- else: - # fetch project id - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - project_id = response.data[0]['doc_map_id'] - - - project = AtlasProject(project_id=project_id, add_datums_if_exists=True) - #print("Inserted data: ", data) - - embeddings = [] - metadata = [] - context_count = 0 - # prep data for nomic upload - for row in data['contexts']: - context_count += 1 - embeddings.append(row['embedding']) - metadata.append({ - "id": str(data['id']) + "_" + str(context_count), - "doc_ingested_at": data['created_at'], - "s3_path": data['s3_path'], - "url": data['url'], - "readable_filename": data['readable_filename'], - "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "text": row['text'] - }) - embeddings = np.array(embeddings) - metadata = pd.DataFrame(metadata) - print("Shape of embeddings: ", embeddings.shape) - - # append to existing map - project_name = "Document Map for " + course_name - result = append_to_map(embeddings, metadata, project_name) - - # check if project is accepting new datums - if project.is_accepting_data: - with project.wait_for_project_lock(): - project.rebuild_maps() - - # with project.wait_for_project_lock(): - # project.rebuild_maps() - return result - - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Error in appending to map: {e}" - - -def create_map(embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): - """ - Generic function to create a Nomic map from given parameters. - Args: - embeddings: np.array of embeddings - metadata: pd.DataFrame of metadata - map_name: str - index_name: str - topic_label_field: str - colorable_fields: list of str - """ - nomic.login(os.getenv('NOMIC_API_KEY')) - - try: - project = atlas.map_embeddings( - embeddings=embeddings, - data=metadata, - id_field="id", - build_topic_model=True, - name=map_name, - topic_label_field=topic_label_field, - colorable_fields=colorable_fields, - add_datums_if_exists=True - ) - project.create_index(index_name, build_topic_model=True) - return "success" - except Exception as e: - print(e) - return "Error in creating map: {e}" - - -def append_to_map(embeddings, metadata, map_name): - """ - Generic function to append new data to an existing Nomic map. - Args: - embeddings: np.array of embeddings - metadata: pd.DataFrame of Nomic upload metadata - map_name: str - """ - nomic.login(os.getenv('NOMIC_API_KEY')) - try: - project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) - with project.wait_for_project_lock(): - project.add_embeddings(embeddings=embeddings, data=metadata) - return "Successfully appended to Nomic map" - except Exception as e: - print(e) - return "Error in appending to map: {e}" - - -def data_prep_for_doc_map(df: pd.DataFrame): - """ - This function prepares embeddings and metadata for nomic upload in document map creation. 
- Args: - df: pd.DataFrame - the dataframe of documents from Supabase - Returns: - embeddings: np.array of embeddings - metadata: pd.DataFrame of metadata - """ - print("in data_prep_for_doc_map()") - - metadata = [] - embeddings = [] - texts = [] - - for index, row in df.iterrows(): - - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if row['url'] == None: - row['url'] = "" - # iterate through all contexts and create separate entries for each - context_count = 0 - for context in row['contexts']: - context_count += 1 - text_row = context['text'] - embeddings_row = context['embedding'] - - meta_row = { - "id": str(row['id']) + "_" + str(context_count), - "doc_ingested_at": row['created_at'], - "s3_path": row['s3_path'], - "url": row['url'], - "readable_filename": row['readable_filename'], - "created_at": current_time, - "text": text_row - } - - embeddings.append(embeddings_row) - metadata.append(meta_row) - texts.append(text_row) - - embeddings_np = np.array(embeddings, dtype=object) - print("Shape of embeddings: ", embeddings_np.shape) - - # check dimension if embeddings_np is (n, 1536) - if len(embeddings_np.shape) < 2: - print("Creating new embeddings...") - # embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, - # openai_api_base=os.getenv('AZURE_OPENAI_BASE'), - # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore - embeddings_model = OpenAIEmbeddings(openai_api_type="openai", - openai_api_base="https://api.openai.com/v1/", - openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore - embeddings = embeddings_model.embed_documents(texts) - - metadata = pd.DataFrame(metadata) - embeddings = np.array(embeddings) - - return embeddings, metadata - - - -if __name__ == '__main__': - pass diff --git a/ai_ta_backend/service/__init__.py b/ai_ta_backend/service/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py new file mode 100644 index 00000000..0c3718e0 --- /dev/null +++ b/ai_ta_backend/service/export_service.py @@ -0,0 +1,263 @@ +import io +import json +import os +import uuid +import zipfile +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import requests + +from ai_ta_backend.database.aws import AWSStorage +from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.service.sentry_service import SentryService +from ai_ta_backend.utils.emails import send_email + + +class ExportService: + + def __init__(self, sql: SQLDatabase, s3: AWSStorage, sentry=SentryService): + self.sql = sql + self.s3 = s3 + self.sentry = sentry + + def export_documents_json(self, course_name: str, from_date='', to_date=''): + """ + This function exports the documents to a json file. + 1. If the number of documents is greater than 1000, it calls a background task to upload the documents to S3. + 2. If the number of documents is less than 1000, it fetches the documents and zips them. + Args: + course_name (str): The name of the course. + from_date (str, optional): The start date for the data export. Defaults to ''. + to_date (str, optional): The end date for the data export. Defaults to ''. 
+ """ + + response = self.sql.getDocumentsBetweenDates(course_name, from_date, to_date, 'documents') + # add a condition to route to direct download or s3 download + if response.count > 1000: + # call background task to upload to s3 + + filename = course_name + '_' + str(uuid.uuid4()) + '_documents.zip' + s3_filepath = s3_file = f"courses/{course_name}/{filename}" + # background task of downloading data - map it with above ID + executor = ProcessPoolExecutor() + executor.submit(self.export_data_in_bg, response, "documents", course_name, s3_filepath) + return {"response": 'Download from S3', "s3_path": s3_filepath} + + else: + # Fetch data + if response.count > 0: + # batch download + total_doc_count = response.count + first_id = response.data[0]['id'] + last_id = response.data[-1]['id'] + + print("total_doc_count: ", total_doc_count) + print("first_id: ", first_id) + print("last_id: ", last_id) + + curr_doc_count = 0 + filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json' + file_path = os.path.join(os.getcwd(), filename) + + while curr_doc_count < total_doc_count: + print("Fetching data from id: ", first_id) + + response = self.sql.getDocsForIdsGte(course_name, first_id) + df = pd.DataFrame(response.data) + curr_doc_count += len(response.data) + + # writing to file + if not os.path.isfile(file_path): + df.to_json(file_path, orient='records') + else: + df.to_json(file_path, orient='records', lines=True, mode='a') + + if len(response.data) > 0: + first_id = response.data[-1]['id'] + 1 + + # Download file + try: + # zip file + zip_filename = filename.split('.')[0] + '.zip' + zip_file_path = os.path.join(os.getcwd(), zip_filename) + + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: + zipf.write(file_path, filename) + + os.remove(file_path) + return {"response": (zip_file_path, zip_filename, os.getcwd())} + except Exception as e: + print(e) + self.sentry.capture_exception(e) + return {"response": "Error downloading file."} + else: + return {"response": "No data found between the given dates."} + + def export_data_in_bg(self, response, download_type, course_name, s3_path): + """ + This function is called in export_documents_csv() to upload the documents to S3. + 1. download the documents in batches of 100 and upload them to S3. + 2. generate a pre-signed URL for the S3 file. + 3. send an email to the course admins with the pre-signed URL. + + Args: + response (dict): The response from the Supabase query. + download_type (str): The type of download - 'documents' or 'conversations'. + course_name (str): The name of the course. + s3_path (str): The S3 path where the file will be uploaded. 
+ """ + total_doc_count = response.count + first_id = response.data[0]['id'] + print("total_doc_count: ", total_doc_count) + print("pre-defined s3_path: ", s3_path) + + curr_doc_count = 0 + filename = s3_path.split('/')[-1].split('.')[0] + '.json' + file_path = os.path.join(os.getcwd(), filename) + + # download data in batches of 100 + while curr_doc_count < total_doc_count: + print("Fetching data from id: ", first_id) + response = self.sql.getAllFromTableForDownloadType(course_name, download_type, first_id) + df = pd.DataFrame(response.data) + curr_doc_count += len(response.data) + + # writing to file + if not os.path.isfile(file_path): + df.to_json(file_path, orient='records') + else: + df.to_json(file_path, orient='records', lines=True, mode='a') + + if len(response.data) > 0: + first_id = response.data[-1]['id'] + 1 + + # zip file + zip_filename = filename.split('.')[0] + '.zip' + zip_file_path = os.path.join(os.getcwd(), zip_filename) + + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: + zipf.write(file_path, filename) + + print("zip file created: ", zip_file_path) + + try: + # upload to S3 + + #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}" + s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}" + self.s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file) + + # remove local files + os.remove(file_path) + os.remove(zip_file_path) + + print("file uploaded to s3: ", s3_file) + + # generate presigned URL + s3_url = self.s3.generatePresignedUrl('get_object', os.getenv('S3_BUCKET_NAME'), s3_path, 3600) + + # get admin email IDs + headers = {"Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", "Content-Type": "application/json"} + + hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name + response = requests.get(hget_url, headers=headers) + course_metadata = response.json() + course_metadata = json.loads(course_metadata['result']) + admin_emails = course_metadata['course_admins'] + bcc_emails = [] + + # check for Kastan's email and move to bcc + if 'kvday2@illinois.edu' in admin_emails: + admin_emails.remove('kvday2@illinois.edu') + bcc_emails.append('kvday2@illinois.edu') + + # add course owner email to admin_emails + admin_emails.append(course_metadata['course_owner']) + admin_emails = list(set(admin_emails)) + print("admin_emails: ", admin_emails) + print("bcc_emails: ", bcc_emails) + + # add a check for emails, don't send email if no admin emails + if len(admin_emails) == 0: + return "No admin emails found. Email not sent." + + # send email to admins + subject = "UIUC.chat Data Export Complete for " + course_name + body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours." + email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails) + print("email_status: ", email_status) + + return "File uploaded to S3. Email sent to admins." + + except Exception as e: + print(e) + return "Error: " + str(e) + + def export_convo_history_json(self, course_name: str, from_date='', to_date=''): + """ + This function exports the conversation history to a csv file. + Args: + course_name (str): The name of the course. + from_date (str, optional): The start date for the data export. Defaults to ''. + to_date (str, optional): The end date for the data export. Defaults to ''. 
+ """ + print("Exporting conversation history to csv file...") + + response = self.sql.getDocumentsBetweenDates(course_name, from_date, to_date, 'llm-convo-monitor') + + if response.count > 1000: + # call background task to upload to s3 + filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.zip' + s3_filepath = s3_file = f"courses/{course_name}/{filename}" + # background task of downloading data - map it with above ID + executor = ProcessPoolExecutor() + executor.submit(self.export_data_in_bg, response, "conversations", course_name, s3_filepath) + return {"response": 'Download from S3', "s3_path": s3_filepath} + + # Fetch data + if response.count > 0: + print("id count greater than zero") + first_id = response.data[0]['id'] + last_id = response.data[-1]['id'] + total_count = response.count + + filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.csv' + file_path = os.path.join(os.getcwd(), filename) + curr_count = 0 + # Fetch data in batches of 25 from first_id to last_id + while curr_count < total_count: + print("Fetching data from id: ", first_id) + response = self.sql.getAllConversationsBetweenIds(course_name, first_id, last_id) + # Convert to pandas dataframe + df = pd.DataFrame(response.data) + curr_count += len(response.data) + + # Append to csv file + if not os.path.isfile(file_path): + df.to_json(file_path, orient='records', lines=True) + else: + df.to_json(file_path, orient='records', lines=True, mode='a') + + # Update first_id + if len(response.data) > 0: + first_id = response.data[-1]['id'] + 1 + print("updated first_id: ", first_id) + + # Download file + try: + # zip file + zip_filename = filename.split('.')[0] + '.zip' + zip_file_path = os.path.join(os.getcwd(), zip_filename) + + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: + zipf.write(file_path, filename) + os.remove(file_path) + + return {"response": (zip_file_path, zip_filename, os.getcwd())} + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return {"response": "Error downloading file!"} + else: + return {"response": "No data found between the given dates."} diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py new file mode 100644 index 00000000..973f17f5 --- /dev/null +++ b/ai_ta_backend/service/nomic_service.py @@ -0,0 +1,727 @@ +import datetime +import os +import time + +import backoff +import nomic +import numpy as np +import pandas as pd +from injector import inject +from langchain.embeddings import OpenAIEmbeddings +from nomic import AtlasProject, atlas + +from ai_ta_backend.service.sentry_service import SentryService + +LOCK_EXCEPTIONS = [ + 'Project is locked for state access! Please wait until the project is unlocked to access embeddings.', + 'Project is locked for state access! Please wait until the project is unlocked to access data.', + 'Project is currently indexing and cannot ingest new datums. Try again later.' +] + + +def giveup_hdlr(e): + """ + Function to handle giveup conditions in backoff decorator + Args: + e: Exception raised by the decorated function + Returns: + True if we want to stop retrying, False otherwise + """ + (e_args,) = e.args + e_str = e_args['exception'] + + print("giveup_hdlr() called with exception:", e_str) + if e_str in LOCK_EXCEPTIONS: + return False + else: + # self.sentry.capture_exception(e) + return True + + +def backoff_hdlr(details): + """ + Function to handle backup conditions in backoff decorator. 
+ Currently just prints the details of the backoff. + """ + print( + "\nBacking off {wait:0.1f} seconds after {tries} tries, calling function {target} with args {args} and kwargs {kwargs}" + .format(**details)) + + +def backoff_strategy(): + """ + Function to define retry strategy. Is usualy defined in the decorator, + but passing parameters to it is giving errors. + """ + return backoff.expo(base=10, factor=1.5) + + +class NomicService(): + + @inject + def __init__(self, sentry: SentryService): + nomic.login(os.getenv('NOMIC_API_KEY')) + self.sentry = sentry + + @backoff.on_exception(backoff_strategy, + Exception, + max_tries=5, + raise_on_giveup=False, + giveup=giveup_hdlr, + on_backoff=backoff_hdlr) + def log_convo_to_nomic(self, course_name: str, conversation) -> str: + # nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + """ + Logs conversation to Nomic. + 1. Check if map exists for given course + 2. Check if conversation ID exists + - if yes, delete and add new data point + - if no, add new data point + 3. Keep current logic for map doesn't exist - update metadata + """ + + print(f"in log_convo_to_nomic() for course: {course_name}") + print("type of conversation:", type(conversation)) + #conversation = json.loads(conversation) + messages = conversation['conversation']['messages'] + if 'user_email' not in conversation['conversation']: + user_email = "NULL" + else: + user_email = conversation['conversation']['user_email'] + conversation_id = conversation['conversation']['id'] + + # we have to upload whole conversations + # check what the fetched data looks like - pandas df or pyarrow table + # check if conversation ID exists in Nomic, if yes fetch all data from it and delete it. 
+ # will have current QA and historical QA from Nomic, append new data and add_embeddings() + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + emoji = "" + + try: + # fetch project metadata and embbeddings + project = AtlasProject(name=project_name, add_datums_if_exists=True) + + map_metadata_df = project.maps[1].data.df # type: ignore + map_embeddings_df = project.maps[1].embeddings.latent + # create a function which returns project, data and embeddings df here + map_metadata_df['id'] = map_metadata_df['id'].astype(int) + last_id = map_metadata_df['id'].max() + + if conversation_id in map_metadata_df.values: + # store that convo metadata locally + prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] + prev_index = prev_data.index.values[0] + embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) + prev_convo = prev_data['conversation'].values[0] + prev_id = prev_data['id'].values[0] + created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') + + # delete that convo data point from Nomic, and print result + print("Deleting point from nomic:", project.delete_data([str(prev_id)])) + + # prep for new point + first_message = prev_convo.split("\n")[1].split(": ")[1] + + # select the last 2 messages and append new convo to prev convo + messages_to_be_logged = messages[-2:] + for message in messages_to_be_logged: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # update metadata + metadata = [{ + "course": course_name, + "conversation": prev_convo, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + }] + else: + print("conversation_id does not exist") + + # add new data point + user_queries = [] + conversation_string = "" + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + user_queries.append(first_message) + + for message in messages: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata = [{ + "course": course_name, + "conversation": conversation_string, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": current_time, + "modified_at": current_time + }] + + # create embeddings + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = embeddings_model.embed_documents(user_queries) + + # add embeddings to the project - create a new function for this + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) + project.rebuild_maps() + + print(f"⏰ Nomic logging runtime: {(time.monotonic() - 
start_time):.2f} seconds") + return f"Successfully logged for {course_name}" + + except Exception as e: + if str(e) == 'You must specify a unique_id_field when creating a new project.': + print("Attempting to create Nomic map...") + result = self.create_nomic_map(course_name, conversation) + print("result of create_nomic_map():", result) + else: + # raising exception again to trigger backoff and passing parameters to use in create_nomic_map() + raise Exception({"exception": str(e)}) + + def get_nomic_map(self, course_name: str, type: str): + """ + Returns the variables necessary to construct an iframe of the Nomic map given a course name. + We just need the ID and URL. + Example values: + map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + """ + # nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + if type.lower() == 'document': + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + else: + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + + try: + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + map = project.get_map(project_name) + + print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds") + return {"map_id": f"iframe{map.id}", "map_link": map.map_link} + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. + if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print( + "Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ", + e) + else: + print("ERROR in get_nomic_map():", e) + self.sentry.capture_exception(e) + return {"map_id": None, "map_link": None} + + def create_nomic_map(self, course_name: str, log_data: list): + """ + Creates a Nomic map for new courses and those which previously had < 20 queries. + 1. fetches supabase conversations for course + 2. appends current embeddings and metadata to it + 2. 
creates map if there are at least 20 queries + """ + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + print(f"in create_nomic_map() for {course_name}") + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) + response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() + data = response.data + df = pd.DataFrame(data) + + if len(data) < 19: + return None + else: + # get all queries for course and create metadata + user_queries = [] + metadata = [] + i = 1 + conversation_exists = False + + # current log details + log_messages = log_data['conversation']['messages'] # type: ignore + log_user_email = log_data['conversation']['user_email'] # type: ignore + log_conversation_id = log_data['conversation']['id'] # type: ignore + + for _index, row in df.iterrows(): + user_email = row['user_email'] + created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') + convo = row['convo'] + messages = convo['messages'] + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + + user_queries.append(first_message) + + # create metadata for multi-turn conversation + conversation = "" + for message in messages: + # string of role: content, role: content, ... + if message['role'] == 'user': # type: ignore + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # append current chat to previous chat if convo already exists + if convo['id'] == log_conversation_id: + conversation_exists = True + + for m in log_messages: + if m['role'] == 'user': # type: ignore + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(m['content'], list): + text = m['content'][0]['text'] + else: + text = m['content'] + conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" + + # adding modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # add to metadata + metadata_row = { + "course": row['course_name'], + "conversation": conversation, + "conversation_id": convo['id'], + "id": i, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + } + metadata.append(metadata_row) + i += 1 + + # add current log as a new data point if convo doesn't exist + if not conversation_exists: + user_queries.append(log_messages[0]['content']) + conversation = "" + for message in log_messages: + if message['role'] == 'user': + emoji = "🙋 " + else: + emoji = "🤖 " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # adding timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata_row = { + "course": course_name, + "conversation": conversation, + "conversation_id": log_conversation_id, + "id": i, + "user_email": log_user_email, + "first_query": log_messages[0]['content'], + "created_at": current_time, + 
"modified_at": current_time + } + metadata.append(metadata_row) + + metadata = pd.DataFrame(metadata) + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = embeddings_model.embed_documents(user_queries) + + # create Atlas project + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_convo_index" + project = atlas.map_embeddings( + embeddings=np.array(embeddings), + data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete + id_field='id', + build_topic_model=True, + topic_label_field='first_query', + name=project_name, + colorable_fields=['conversation_id', 'first_query']) + project.create_index(index_name, build_topic_model=True) + return f"Successfully created Nomic map for {course_name}" + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. + if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e) + else: + print("ERROR in create_nomic_map():", e) + self.sentry.capture_exception(e) + + return "failed" + + ## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ## + + def create_document_map(self, course_name: str): + """ + This is a function which creates a document map for a given course from scratch + 1. Gets count of documents for the course + 2. If less than 20, returns a message that a map cannot be created + 3. If greater than 20, iteratively fetches documents in batches of 25 + 4. Prepares metadata and embeddings for nomic upload + 5. Creates a new map and uploads the data + + Args: + course_name: str + Returns: + str: success or failed + """ + print("in create_document_map()") + # nomic.login(os.getenv('NOMIC_API_KEY')) + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # check if map exists + response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + return "Map already exists for this course." + + # fetch relevant document data from Supabase + response = supabase_client.table("documents").select("id", + count="exact").eq("course_name", + course_name).order('id', + desc=False).execute() + if not response.count: + return "No documents found for this course." 
+ + total_doc_count = response.count + print("Total number of documents in Supabase: ", total_doc_count) + + # minimum 20 docs needed to create map + if total_doc_count > 19: + + first_id = response.data[0]['id'] + combined_dfs = [] + curr_total_doc_count = 0 + doc_count = 0 + first_batch = True + + # iteratively query in batches of 25 + while curr_total_doc_count < total_doc_count: + + response = supabase_client.table("documents").select( + "id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(25).execute() + df = pd.DataFrame(response.data) + combined_dfs.append(df) # list of dfs + + curr_total_doc_count += len(response.data) + doc_count += len(response.data) + + if doc_count >= 1000: # upload to Nomic every 1000 docs + + # concat all dfs from the combined_dfs list + final_df = pd.concat(combined_dfs, ignore_index=True) + + # prep data for nomic upload + embeddings, metadata = self.data_prep_for_doc_map(final_df) + + if first_batch: + # create a new map + print("Creating new map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = self.create_map(embeddings, metadata, project_name, index_name, topic_label_field, + colorable_fields) + # update flag + first_batch = False + + else: + # append to existing map + print("Appending data to existing map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + # add project lock logic here + result = self.append_to_map(embeddings, metadata, project_name) + + # reset variables + combined_dfs = [] + doc_count = 0 + + # set first_id for next iteration + first_id = response.data[-1]['id'] + 1 + + # upload last set of docs + final_df = pd.concat(combined_dfs, ignore_index=True) + embeddings, metadata = self.data_prep_for_doc_map(final_df) + project_name = NOMIC_MAP_NAME_PREFIX + course_name + if first_batch: + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = self.create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) + else: + result = self.append_to_map(embeddings, metadata, project_name) + print("Atlas upload status: ", result) + + # log info to supabase + project = AtlasProject(name=project_name, add_datums_if_exists=True) + project_id = project.id + project.rebuild_maps() + project_info = {'course_name': course_name, 'doc_map_id': project_id} + response = supabase_client.table("projects").insert(project_info).execute() + print("Response from supabase: ", response) + return "success" + else: + return "Cannot create a map because there are less than 20 documents in the course." + except Exception as e: + print(e) + self.sentry.capture_exception(e) + return "failed" + + def delete_from_document_map(self, project_id: str, ids: list): + """ + This function is used to delete datapoints from a document map. 
+ Currently used within the delete_data() function in vector_database.py + Args: + course_name: str + ids: list of str + """ + print("in delete_from_document_map()") + + try: + # fetch project from Nomic + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + + # delete the ids from Nomic + print("Deleting point from document map:", project.delete_data(ids)) + with project.wait_for_project_lock(): + project.rebuild_maps() + return "Successfully deleted from Nomic map" + except Exception as e: + print(e) + self.sentry.capture_exception(e) + return "Error in deleting from document map: {e}" + + # If this needs to be uncommented, make sure to move the supabase call to the respective service + # def log_to_document_map(self, data: dict): + # """ + # This is a function which appends new documents to an existing document map. It's called + # at the end of split_and_upload() after inserting data to Supabase. + # Args: + # data: dict - the response data from Supabase insertion + # """ + # print("in add_to_document_map()") + + # try: + # # check if map exists + # course_name = data['course_name'] + # response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + # if response.data: + # project_id = response.data[0]['doc_map_id'] + # else: + # # create a map + # map_creation_result = self.create_document_map(course_name) + # if map_creation_result != "success": + # return "The project has less than 20 documents and a map cannot be created." + # else: + # # fetch project id + # response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + # project_id = response.data[0]['doc_map_id'] + + # project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + # #print("Inserted data: ", data) + + # embeddings = [] + # metadata = [] + # context_count = 0 + # # prep data for nomic upload + # for row in data['contexts']: + # context_count += 1 + # embeddings.append(row['embedding']) + # metadata.append({ + # "id": str(data['id']) + "_" + str(context_count), + # "doc_ingested_at": data['created_at'], + # "s3_path": data['s3_path'], + # "url": data['url'], + # "readable_filename": data['readable_filename'], + # "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + # "text": row['text'] + # }) + # embeddings = np.array(embeddings) + # metadata = pd.DataFrame(metadata) + # print("Shape of embeddings: ", embeddings.shape) + + # # append to existing map + # project_name = "Document Map for " + course_name + # result = self.append_to_map(embeddings, metadata, project_name) + + # # check if project is accepting new datums + # if project.is_accepting_data: + # with project.wait_for_project_lock(): + # project.rebuild_maps() + + # # with project.wait_for_project_lock(): + # # project.rebuild_maps() + # return result + + # except Exception as e: + # print(e) + # self.sentry.capture_exception(e) + # return "Error in appending to map: {e}" + + def create_map(self, embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): + """ + Generic function to create a Nomic map from given parameters. 
+ Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + map_name: str + index_name: str + topic_label_field: str + colorable_fields: list of str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + + try: + project = atlas.map_embeddings(embeddings=embeddings, + data=metadata, + id_field="id", + build_topic_model=True, + name=map_name, + topic_label_field=topic_label_field, + colorable_fields=colorable_fields, + add_datums_if_exists=True) + project.create_index(index_name, build_topic_model=True) + return "success" + except Exception as e: + print(e) + return "Error in creating map: {e}" + + def append_to_map(self, embeddings, metadata, map_name): + """ + Generic function to append new data to an existing Nomic map. + Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of Nomic upload metadata + map_name: str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + try: + project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=embeddings, data=metadata) + return "Successfully appended to Nomic map" + except Exception as e: + print(e) + return "Error in appending to map: {e}" + + def data_prep_for_doc_map(self, df: pd.DataFrame): + """ + This function prepares embeddings and metadata for nomic upload in document map creation. + Args: + df: pd.DataFrame - the dataframe of documents from Supabase + Returns: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + """ + print("in data_prep_for_doc_map()") + + metadata = [] + embeddings = [] + texts = [] + + for index, row in df.iterrows(): + + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if row['url'] == None: + row['url'] = "" + # iterate through all contexts and create separate entries for each + context_count = 0 + for context in row['contexts']: + context_count += 1 + text_row = context['text'] + embeddings_row = context['embedding'] + + meta_row = { + "id": str(row['id']) + "_" + str(context_count), + "doc_ingested_at": row['created_at'], + "s3_path": row['s3_path'], + "url": row['url'], + "readable_filename": row['readable_filename'], + "created_at": current_time, + "text": text_row + } + + embeddings.append(embeddings_row) + metadata.append(meta_row) + texts.append(text_row) + + embeddings_np = np.array(embeddings, dtype=object) + print("Shape of embeddings: ", embeddings_np.shape) + + # check dimension if embeddings_np is (n, 1536) + if len(embeddings_np.shape) < 2: + print("Creating new embeddings...") + # embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, + # openai_api_base=os.getenv('AZURE_OPENAI_BASE'), + # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore + embeddings_model = OpenAIEmbeddings(openai_api_type="openai", + openai_api_base="https://api.openai.com/v1/", + openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore + embeddings = embeddings_model.embed_documents(texts) + + metadata = pd.DataFrame(metadata) + embeddings = np.array(embeddings) + + return embeddings, metadata diff --git a/ai_ta_backend/service/posthog_service.py b/ai_ta_backend/service/posthog_service.py new file mode 100644 index 00000000..87f41d9a --- /dev/null +++ b/ai_ta_backend/service/posthog_service.py @@ -0,0 +1,18 @@ +import os + +from injector import inject +from posthog import Posthog + + +class PosthogService: + + @inject + def __init__(self): + self.posthog = Posthog( + sync_mode=True, + project_api_key=os.environ["POSTHOG_API_KEY"], + 
host="https://app.posthog.com", + ) + + def capture(self, event_name, properties): + self.posthog.capture("distinct_id_of_the_user", event=event_name, properties=properties) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py new file mode 100644 index 00000000..b6e03f05 --- /dev/null +++ b/ai_ta_backend/service/retrieval_service.py @@ -0,0 +1,414 @@ +import inspect +import os +import time +import traceback +from typing import Dict, List, Union + +import openai +from injector import inject +from langchain import hub +from langchain.chat_models import AzureChatOpenAI +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.load import dumps, loads +from langchain.schema import Document + +from ai_ta_backend.database.aws import AWSStorage +from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.database.vector import VectorDatabase +from ai_ta_backend.service.nomic_service import NomicService +from ai_ta_backend.service.posthog_service import PosthogService +from ai_ta_backend.service.sentry_service import SentryService +from ai_ta_backend.utils_tokenization import count_tokens_and_cost + +OPENAI_API_TYPE = "azure" # "openai" or "azure" + + +class RetrievalService: + """ + Contains all methods for business logic of the retrieval service. + """ + + @inject + def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, posthog: PosthogService, + sentry: SentryService, nomicService: NomicService): + self.vdb = vdb + self.sqlDb = sqlDb + self.aws = aws + self.sentry = sentry + self.posthog = posthog + self.nomicService = nomicService + + openai.api_key = os.getenv("OPENAI_API_KEY") + + self.embeddings = OpenAIEmbeddings( + model='text-embedding-ada-002', + openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore + openai_api_type=OPENAI_API_TYPE, + openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore + openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore + ) + + self.llm = AzureChatOpenAI( + temperature=0, + deployment_name=os.getenv("AZURE_OPENAI_ENGINE"), # type:ignore + openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore + openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore + openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore + openai_api_type=OPENAI_API_TYPE, + ) + + def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: + """Here's a summary of the work. + + /GET arguments + course name (optional) str: A json response with TBD fields. + + Returns + JSON: A json response with TBD fields. See main.py:getTopContexts docs. + or + String: An error message with traceback. + """ + try: + start_time_overall = time.monotonic() + + found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) + + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" + # count tokens at start and end, then also count each context. + token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + + search_query) # type: ignore + + valid_docs = [] + num_tokens = 0 + for doc in found_docs: + doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + print( + f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {doc.metadata['readable_filename']}" + ) + if token_counter + num_tokens <= token_limit: + token_counter += num_tokens + valid_docs.append(doc) + else: + # filled our token size, time to return + break + + print(f"Total tokens used: {token_counter}. Docs used: {len(valid_docs)} of {len(found_docs)} docs retrieved") + print(f"Course: {course_name} ||| search_query: {search_query}") + print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") + if len(valid_docs) == 0: + return [] + + self.posthog.capture( + event_name="success_get_top_contexts_OG", + properties={ + "user_query": search_query, + "course_name": course_name, + "token_limit": token_limit, + "total_tokens_used": token_counter, + "total_contexts_used": len(valid_docs), + "total_unique_docs_retrieved": len(found_docs), + "getTopContext_total_latency_sec": time.monotonic() - start_time_overall, + }, + ) + + return self.format_for_json(valid_docs) + except Exception as e: + # return full traceback to front end + # err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.print_exc} \n{e}" # type: ignore + traceback.print_exc() + print(err) + self.sentry.capture_exception(e) + return err + + def getAll( + self, + course_name: str, + ): + """Get all course materials based on course name. + Args: + course_name (as uploaded on supabase) + Returns: + list of dictionaries with distinct s3 path, readable_filename and course_name, url, base_url. 
+ """ + + response = self.sqlDb.getAllMaterialsForCourse(course_name) + + data = response.data + unique_combinations = set() + distinct_dicts = [] + + for item in data: + combination = (item['s3_path'], item['readable_filename'], item['course_name'], item['url'], item['base_url']) + if combination not in unique_combinations: + unique_combinations.add(combination) + distinct_dicts.append(item) + + return distinct_dicts + + def delete_data(self, course_name: str, s3_path: str, source_url: str): + """Delete file from S3, Qdrant, and Supabase.""" + print(f"Deleting data for course {course_name}") + # add delete from doc map logic here + try: + # Delete file from S3 + bucket_name = os.getenv('S3_BUCKET_NAME') + if bucket_name is None: + raise ValueError("S3_BUCKET_NAME environment variable is not set") + + identifier_key, identifier_value = ("s3_path", s3_path) if s3_path else ("url", source_url) + print(f"Deleting {identifier_value} from S3, Qdrant, and Supabase using {identifier_key}") + + # Delete from S3 + if identifier_key == "s3_path": + self.delete_from_s3(bucket_name, s3_path) + + # Delete from Qdrant + self.delete_from_qdrant(identifier_key, identifier_value) + + # Delete from Nomic and Supabase + self.delete_from_nomic_and_supabase(course_name, identifier_key, identifier_value) + + return "Success" + except Exception as e: + err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + self.sentry.capture_exception(e) + return err + + def delete_from_s3(self, bucket_name: str, s3_path: str): + try: + self.aws.delete_file(bucket_name, s3_path) + except Exception as e: + print("Error in deleting file from s3:", e) + self.sentry.capture_exception(e) + + def delete_from_qdrant(self, identifier_key: str, identifier_value: str): + try: + self.vdb.delete_data(os.environ['QDRANT_COLLECTION_NAME'], identifier_key, identifier_value) + except Exception as e: + if "timed out" in str(e): + # Timed out is fine. Still deletes. + pass + else: + print("Error in deleting file from Qdrant:", e) + self.sentry.capture_exception(e) + + def getTopContextsWithMQR(self, + search_query: str, + course_name: str, + token_limit: int = 4_000) -> Union[List[Dict], str]: + """ + New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding. + 1. Generate multiple queries based on the input search query. + 2. Retrieve relevant docs for each query. + 3. Filter the relevant docs based on the user query and pass them to the rank fusion step. + 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. + 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. + """ + return 'fail' + + # try: + # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS + # start_time_overall = time.monotonic() + # mq_start_time = time.monotonic() + + # # 1. GENERATE MULTIPLE QUERIES + # generate_queries = ( + # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | + # (lambda x: list(filter(None, x))) # filter out non-empty strings + # ) + + # generated_queries = generate_queries.invoke({"original_query": search_query}) + # print("generated_queries", generated_queries) + + # # 2. VECTOR SEARCH FOR EACH QUERY + # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, + # course_name=course_name, + # top_n=top_n_per_query) + + # # 3. 
RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. + # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) + # found_docs = [doc for doc, score in found_docs] + # print(f"Num docs after re-ranking: {len(found_docs)}") + # if len(found_docs) == 0: + # return [] + # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") + + # # 4. FILTER DOCS + # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) + # if len(filtered_docs) == 0: + # return [] + + # # 5. TOP DOC CONTEXT PADDING // parent document retriever + # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) + # print(f"Number of final docs after context padding: {len(final_docs)}") + + # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + + # search_query) # type: ignore + + # valid_docs = [] + # num_tokens = 0 + # for doc in final_docs: + # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" + # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") + # if token_counter + num_tokens <= token_limit: + # token_counter += num_tokens + # valid_docs.append(doc) + # else: + # # filled our token size, time to return + # break + + # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") + # print(f"Course: {course_name} ||| search_query: {search_query}") + # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") + + # if len(valid_docs) == 0: + # return [] + + # self.posthog.capture('distinct_id_of_the_user', + # event='filter_top_contexts_succeeded', + # properties={ + # 'user_query': search_query, + # 'course_name': course_name, + # 'token_limit': token_limit, + # 'total_tokens_used': token_counter, + # 'total_contexts_used': len(valid_docs), + # 'total_unique_docs_retrieved': len(found_docs), + # }) + + # return self.format_for_json_mqr(valid_docs) + # except Exception as e: + # # return full traceback to front end + # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + # print(err) + # sentry_sdk.capture_exception(e) + # return err + + def format_for_json_mqr(self, found_docs) -> List[Dict]: + """ + Same as format_for_json, but for the new MQR pipeline. 
+ """ + for found_doc in found_docs: + if "pagenumber" not in found_doc.keys(): + print("found no pagenumber") + found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] + + contexts = [ + { + 'text': doc['text'], + 'readable_filename': doc['readable_filename'], + 'course_name ': doc['course_name'], + 's3_path': doc['s3_path'], + 'pagenumber': doc['pagenumber'], + 'url': doc['url'], # wouldn't this error out? + 'base_url': doc['base_url'], + } for doc in found_docs + ] + + return contexts + + def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, identifier_value: str): + try: + response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + data = response.data[0] # single record fetched + nomic_ids_to_delete = [str(data['id']) + "_" + str(i) for i in range(1, len(data['contexts']) + 1)] + + # delete from Nomic + # check if project exists + response = self.sqlDb.getProjectsMapForCourse(course_name) + if response.data: + project_id = response.data[0]['doc_map_id'] + else: + return "No document map found for this course" + res = self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) + + # delete from Supabase + self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + except Exception as e: + print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) + self.sentry.capture_exception(e) + + def vector_search(self, search_query, course_name): + top_n = 80 + # EMBED + openai_start_time = time.monotonic() + print("OPENAI_API_TYPE", OPENAI_API_TYPE) + user_query_embedding = self.embeddings.embed_query(search_query) + openai_embedding_latency = time.monotonic() - openai_start_time + + # SEARCH + self.posthog.capture( + event_name="vector_search_invoked", + properties={ + "user_query": search_query, + "course_name": course_name, + }, + ) + qdrant_start_time = time.monotonic() + search_results = self.vdb.vector_search(search_query, course_name, user_query_embedding, top_n) + + found_docs: list[Document] = [] + for d in search_results: + try: + metadata = d.payload + page_content = metadata["page_content"] + del metadata["page_content"] + if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore + # aiding in the database migration... + metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore + + found_docs.append(Document(page_content=page_content, metadata=metadata)) # type: ignore + except Exception as e: + print(f"Error in vector_search(), for course: `{course_name}`. Error: {e}") + self.sentry.capture_exception(e) + + self.posthog.capture( + event_name="vector_search_succeded", + properties={ + "user_query": search_query, + "course_name": course_name, + "qdrant_latency_sec": time.monotonic() - qdrant_start_time, + "openai_embedding_latency_sec": openai_embedding_latency, + }, + ) + # print("found_docs", found_docs) + return found_docs + + def format_for_json(self, found_docs: List[Document]) -> List[Dict]: + """Formatting only. + {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 
20', 'text': 'In Assembly language, the code does that...'},]} + + Args: + found_docs (List[Document]): _description_ + + Raises: + Exception: _description_ + + Returns: + List[Dict]: _description_ + """ + for found_doc in found_docs: + if "pagenumber" not in found_doc.metadata.keys(): + print("found no pagenumber") + found_doc.metadata["pagenumber"] = found_doc.metadata["pagenumber_or_timestamp"] + + contexts = [ + { + "text": doc.page_content, + "readable_filename": doc.metadata["readable_filename"], + "course_name ": doc.metadata["course_name"], + "s3_path": doc.metadata["s3_path"], + "pagenumber": doc.metadata["pagenumber"], # this because vector db schema is older... + # OPTIONAL PARAMS... + "url": doc.metadata.get("url"), # wouldn't this error out? + "base_url": doc.metadata.get("base_url"), + } for doc in found_docs + ] + + return contexts diff --git a/ai_ta_backend/service/sentry_service.py b/ai_ta_backend/service/sentry_service.py new file mode 100644 index 00000000..53b780b0 --- /dev/null +++ b/ai_ta_backend/service/sentry_service.py @@ -0,0 +1,22 @@ +import os + +import sentry_sdk +from injector import inject + + +class SentryService: + + @inject + def __init__(self, dsn: str): + # Sentry.io error logging + sentry_sdk.init( + dsn=os.getenv("SENTRY_DSN"), + # Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring. + traces_sample_rate=1.0, + # Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions. + # We recommend adjusting this value in production. + profiles_sample_rate=1.0, + enable_tracing=True) + + def capture_exception(self, exception: Exception): + sentry_sdk.capture_exception(exception) diff --git a/ai_ta_backend/utils/__init__.py b/ai_ta_backend/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py similarity index 96% rename from ai_ta_backend/context_parent_doc_padding.py rename to ai_ta_backend/utils/context_parent_doc_padding.py index 5c095b0b..8521d99e 100644 --- a/ai_ta_backend/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -7,8 +7,8 @@ import supabase DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE'] -SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], - supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore +# SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], +# supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore def context_parent_doc_padding(found_docs, search_query, course_name): diff --git a/ai_ta_backend/utils/emails.py b/ai_ta_backend/utils/emails.py new file mode 100644 index 00000000..4312a35d --- /dev/null +++ b/ai_ta_backend/utils/emails.py @@ -0,0 +1,38 @@ +import os +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + + +def send_email(subject: str, body_text: str, sender: str, receipients: list, bcc_receipients: list): + """ + Send an email using the AWS SES service + :param subject: The subject of the email + :param body_text: The body of the email + :param sender: The email address of the sender + :param receipients: A list of email addresses to send the email to + :param bcc_receipients: A list of email addresses to send the email to as BCC + :return: A string indicating the result of the email send operation + + """ + # Create message content + message = MIMEMultipart("alternative") + 
message["Subject"] = subject + message["From"] = sender + message["To"] = ", ".join(receipients) + + if len(bcc_receipients) > 0: + message["Bcc"] = ", ".join(bcc_receipients) + + # Add plain text part + part1 = MIMEText(body_text, "plain") + message.attach(part1) + + # Add additional parts for HTML, attachments, etc. (optional) + + # Connect to SMTP server + with smtplib.SMTP_SSL(os.getenv('SES_HOST'), os.getenv('SES_PORT')) as server: # type: ignore + server.login(os.getenv('USERNAME_SMTP'), os.getenv('PASSWORD_SMTP')) # type: ignore + server.sendmail(sender, receipients + bcc_receipients, message.as_string()) + + return "Email sent successfully!" diff --git a/ai_ta_backend/filtering_contexts.py b/ai_ta_backend/utils/filtering_contexts.py similarity index 100% rename from ai_ta_backend/filtering_contexts.py rename to ai_ta_backend/utils/filtering_contexts.py diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py deleted file mode 100644 index fcfc1887..00000000 --- a/ai_ta_backend/vector_database.py +++ /dev/null @@ -1,803 +0,0 @@ -import asyncio -import inspect -import os -import time -import traceback -from typing import Dict, List, Union - -import boto3 -import openai -import sentry_sdk -import supabase -from langchain import hub -from langchain.chat_models import AzureChatOpenAI -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.load import dumps, loads -from langchain.schema import Document -from langchain.schema.output_parser import StrOutputParser -from langchain.vectorstores import Qdrant -from posthog import Posthog -from qdrant_client import QdrantClient, models - -from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding -from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor - -# from ai_ta_backend.filtering_contexts import filter_top_contexts -from ai_ta_backend.nomic_logging import delete_from_document_map -from ai_ta_backend.utils_tokenization import count_tokens_and_cost - -MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation") -OPENAI_API_TYPE = "azure" # "openai" or "azure" - - -class Ingest(): - """ - Contains all methods for building and using vector databases. - """ - - def __init__(self): - """ - Initialize AWS S3, Qdrant, and Supabase. 
- """ - openai.api_key = os.getenv("OPENAI_API_KEY") - - # vector DB - self.qdrant_client = QdrantClient( - url=os.getenv('QDRANT_URL'), - api_key=os.getenv('QDRANT_API_KEY'), - ) - - self.vectorstore = Qdrant(client=self.qdrant_client, - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - embeddings=OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)) - - # S3 - self.s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Create a Supabase client - self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) - - self.llm = AzureChatOpenAI( - temperature=0, - deployment_name=os.getenv('AZURE_OPENAI_ENGINE'), #type:ignore - openai_api_base=os.getenv('AZURE_OPENAI_ENDPOINT'), #type:ignore - openai_api_key=os.getenv('AZURE_OPENAI_KEY'), #type:ignore - openai_api_version=os.getenv('OPENAI_API_VERSION'), #type:ignore - openai_api_type=OPENAI_API_TYPE) - - self.posthog = Posthog(sync_mode=True, - project_api_key=os.environ['POSTHOG_API_KEY'], - host='https://app.posthog.com') - - return None - - def __del__(self): - # Gracefully shutdown the Posthog client -- this was a main cause of dangling threads. - # Since I changed Posthog to be sync, no need to shutdown. - # try: - # self.posthog.shutdown() - # except Exception as e: - # print("Failed to shutdown PostHog. Probably fine. Error: ", e) - try: - self.qdrant_client.close() - except Exception as e: - print("Failed to shutdown Qdrant. Probably fine. Error: ", e) - try: - del self.supabase_client - except Exception as e: - print("Failed delete supabase_client. Probably fine. Error: ", e) - try: - del self.s3_client - except Exception as e: - print("Failed to delete s3_client. Probably fine. Error: ", e) - - def delete_entire_course(self, course_name: str): - """Delete entire course. - - Delete materials from S3, Supabase SQL, Vercel KV, and QDrant vector DB - Args: - course_name (str): _description_ - """ - print(f"Deleting entire course: {course_name}") - try: - # Delete file from S3 - print("Deleting from S3") - objects_to_delete = self.s3_client.list_objects(Bucket=os.getenv('S3_BUCKET_NAME'), - Prefix=f'courses/{course_name}/') - for object in objects_to_delete['Contents']: - self.s3_client.delete_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=object['Key']) - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - pass - - try: - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. 
King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. 
It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), - print("deleting from qdrant") - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="course_name", - match=models.MatchValue(value=course_name), - ), - ]), - ) - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - pass - - try: - # Delete from Supabase - print("deleting from supabase") - response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 'course_name', course_name).execute() - print("supabase response: ", response) - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - # todo: delete from Vercel KV to fully make the coure not exist. Last db to delete from (as of now, Aug 15) - - def delete_data(self, course_name: str, s3_path: str, source_url: str): - """Delete file from S3, Qdrant, and Supabase.""" - print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}") - # add delete from doc map logic here - try: - # Delete file from S3 - bucket_name = os.getenv('S3_BUCKET_NAME') - - # Delete files by S3 path - if s3_path: - try: - self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) - except Exception as e: - print("Error in deleting file from s3:", e) - sentry_sdk.capture_exception(e) - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training ... 
- try: - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="s3_path", - match=models.MatchValue(value=s3_path), - ), - ]), - ) - except Exception as e: - if "timed out" in str(e): - # Timed out is fine. Still deletes. - # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 - pass - else: - print("Error in deleting file from Qdrant:", e) - sentry_sdk.capture_exception(e) - try: - # delete from Nomic - response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq( - 's3_path', s3_path).eq('course_name', course_name).execute() - data = response.data[0] #single record fetched - nomic_ids_to_delete = [] - context_count = len(data['contexts']) - for i in range(1, context_count + 1): - nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) - - # delete from Nomic - res = delete_from_document_map(course_name, nomic_ids_to_delete) - except Exception as e: - print("Error in deleting file from Nomic:", e) - sentry_sdk.capture_exception(e) - - try: - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 's3_path', s3_path).eq('course_name', course_name).execute() - except Exception as e: - print("Error in deleting file from supabase:", e) - sentry_sdk.capture_exception(e) - - # Delete files by their URL identifier - elif source_url: - try: - # Delete from Qdrant - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="url", - match=models.MatchValue(value=source_url), - ), - ]), - ) - except Exception as e: - if "timed out" in str(e): - # Timed out is fine. Still deletes. - # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 - pass - else: - print("Error in deleting file from Qdrant:", e) - sentry_sdk.capture_exception(e) - try: - # delete from Nomic - response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq( - 'url', source_url).eq('course_name', course_name).execute() - data = response.data[0] #single record fetched - nomic_ids_to_delete = [] - context_count = len(data['contexts']) - for i in range(1, context_count + 1): - nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) - - # delete from Nomic - res = delete_from_document_map(course_name, nomic_ids_to_delete) - except Exception as e: - print("Error in deleting file from Nomic:", e) - sentry_sdk.capture_exception(e) - - try: - # delete from Supabase - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 'url', source_url).eq('course_name', course_name).execute() - except Exception as e: - print("Error in deleting file from supabase:", e) - sentry_sdk.capture_exception(e) - - # Delete from Supabase - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def getAll( - self, - course_name: str, - ): - """Get all course materials based on course name. - Args: - course_name (as uploaded on supabase) - Returns: - list of dictionaries with distinct s3 path, readable_filename and course_name, url, base_url. 
- """ - - response = self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select( - 'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute() - - data = response.data - unique_combinations = set() - distinct_dicts = [] - - for item in data: - combination = (item['s3_path'], item['readable_filename'], item['course_name'], item['url'], item['base_url']) - if combination not in unique_combinations: - unique_combinations.add(combination) - distinct_dicts.append(item) - - return distinct_dicts - - def vector_search(self, search_query, course_name): - top_n = 80 - # EMBED - openai_start_time = time.monotonic() - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - user_query_embedding = o.embed_query(search_query) - openai_embedding_latency = time.monotonic() - openai_start_time - - # SEARCH - myfilter = models.Filter(must=[ - models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), - ]) - self.posthog.capture('distinct_id_of_the_user', - event='vector_search_invoked', - properties={ - 'user_query': search_query, - 'course_name': course_name, - }) - qdrant_start_time = time.monotonic() - search_results = self.qdrant_client.search( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - query_filter=myfilter, - with_vectors=False, - query_vector=user_query_embedding, - limit=top_n, # Return n closest points - - # In a system with high disk latency, the re-scoring step may become a bottleneck: https://qdrant.tech/documentation/guides/quantization/ - search_params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False))) - - found_docs: list[Document] = [] - for d in search_results: - try: - metadata = d.payload - page_content = metadata['page_content'] - del metadata['page_content'] - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore - # aiding in the database migration... - metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore - - found_docs.append(Document(page_content=page_content, metadata=metadata)) # type: ignore - except Exception as e: - print(f"Error in vector_search(), for course: `{course_name}`. Error: {e}") - sentry_sdk.capture_exception(e) - - self.posthog.capture('distinct_id_of_the_user', - event='vector_search_succeded', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'qdrant_latency_sec': time.monotonic() - qdrant_start_time, - 'openai_embedding_latency_sec': openai_embedding_latency - }) - # print("found_docs", found_docs) - return found_docs - - def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: - """Here's a summary of the work. - - /GET arguments - course name (optional) str: A json response with TBD fields. - - Returns - JSON: A json response with TBD fields. See main.py:getTopContexts docs. - or - String: An error message with traceback. - """ - try: - start_time_overall = time.monotonic() - - found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) 
and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - - valid_docs = [] - num_tokens = 0 - for doc in found_docs: - doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - print( - f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. 📄 File: {doc.metadata['readable_filename']}" - ) - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(doc) - else: - # filled our token size, time to return - break - - print(f"Total tokens used: {token_counter}. Docs used: {len(valid_docs)} of {len(found_docs)} docs retrieved") - print(f"Course: {course_name} ||| search_query: {search_query}") - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - if len(valid_docs) == 0: - return [] - - self.posthog.capture('distinct_id_of_the_user', - event='success_get_top_contexts_OG', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - 'total_tokens_used': token_counter, - 'total_contexts_used': len(valid_docs), - 'total_unique_docs_retrieved': len(found_docs), - 'getTopContext_total_latency_sec': time.monotonic() - start_time_overall, - }) - - return self.format_for_json(valid_docs) - except Exception as e: - # return full traceback to front end - err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def batch_vector_search(self, search_queries: List[str], course_name: str, top_n: int = 50): - """ - Perform a similarity search for all the generated queries at once. 
- """ - start_time = time.monotonic() - - from qdrant_client.http import models as rest - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - # Prepare the filter for the course name - myfilter = rest.Filter(must=[ - rest.FieldCondition(key='course_name', match=rest.MatchValue(value=course_name)), - ]) - - # Prepare the search requests - search_requests = [] - for query in search_queries: - user_query_embedding = o.embed_query(query) - search_requests.append( - rest.SearchRequest(vector=user_query_embedding, - filter=myfilter, - limit=top_n, - with_payload=True, - params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False)))) - - # Perform the batch search - search_results = self.qdrant_client.search_batch( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - requests=search_requests, - ) - # process search results - found_docs: list[list[Document]] = [] - for result in search_results: - docs = [] - for doc in result: - try: - metadata = doc.payload - page_content = metadata['page_content'] - del metadata['page_content'] - - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): - metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] - - docs.append(Document(page_content=page_content, metadata=metadata)) - except Exception: - print(traceback.print_exc()) - found_docs.append(docs) - - print(f"⏰ Qdrant Batch Search runtime: {(time.monotonic() - start_time):.2f} seconds") - return found_docs - - def reciprocal_rank_fusion(self, results: list[list], k=60): - """ - Since we have multiple queries, and n documents returned per query, we need to go through all the results - and collect the documents with the highest overall score, as scored by qdrant similarity matching. - """ - fused_scores = {} - count = 0 - unique_count = 0 - for docs in results: - # Assumes the docs are returned in sorted order of relevance - count += len(docs) - for rank, doc in enumerate(docs): - doc_str = dumps(doc) - if doc_str not in fused_scores: - fused_scores[doc_str] = 0 - unique_count += 1 - fused_scores[doc_str] += 1 / (rank + k) - # Uncomment for debugging - # previous_score = fused_scores[doc_str] - #print(f"Change score for doc: {doc_str}, previous score: {previous_score}, updated score: {fused_scores[doc_str]} ") - print(f"Total number of documents in rank fusion: {count}") - print(f"Total number of unique documents in rank fusion: {unique_count}") - reranked_results = [ - (loads(doc), score) for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True) - ] - return reranked_results - - def getTopContextsWithMQR(self, - search_query: str, - course_name: str, - token_limit: int = 4_000) -> Union[List[Dict], str]: - """ - New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding. - 1. Generate multiple queries based on the input search query. - 2. Retrieve relevant docs for each query. - 3. Filter the relevant docs based on the user query and pass them to the rank fusion step. - 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. - 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. - """ - return 'fail' - - # try: - # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS - # start_time_overall = time.monotonic() - # mq_start_time = time.monotonic() - - # # 1. 
GENERATE MULTIPLE QUERIES - # generate_queries = ( - # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | - # (lambda x: list(filter(None, x))) # filter out non-empty strings - # ) - - # generated_queries = generate_queries.invoke({"original_query": search_query}) - # print("generated_queries", generated_queries) - - # # 2. VECTOR SEARCH FOR EACH QUERY - # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, - # course_name=course_name, - # top_n=top_n_per_query) - - # # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. - # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) - # found_docs = [doc for doc, score in found_docs] - # print(f"Num docs after re-ranking: {len(found_docs)}") - # if len(found_docs) == 0: - # return [] - # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") - - # # 4. FILTER DOCS - # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) - # if len(filtered_docs) == 0: - # return [] - - # # 5. TOP DOC CONTEXT PADDING // parent document retriever - # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) - # print(f"Number of final docs after context padding: {len(final_docs)}") - - # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" - # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - # search_query) # type: ignore - - # valid_docs = [] - # num_tokens = 0 - # for doc in final_docs: - # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" - # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") - # if token_counter + num_tokens <= token_limit: - # token_counter += num_tokens - # valid_docs.append(doc) - # else: - # # filled our token size, time to return - # break - - # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") - # print(f"Course: {course_name} ||| search_query: {search_query}") - # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") - - # if len(valid_docs) == 0: - # return [] - - # self.posthog.capture('distinct_id_of_the_user', - # event='filter_top_contexts_succeeded', - # properties={ - # 'user_query': search_query, - # 'course_name': course_name, - # 'token_limit': token_limit, - # 'total_tokens_used': token_counter, - # 'total_contexts_used': len(valid_docs), - # 'total_unique_docs_retrieved': len(found_docs), - # }) - - # return self.format_for_json_mqr(valid_docs) - # except Exception as e: - # # return full traceback to front end - # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - # print(err) - # sentry_sdk.capture_exception(e) - # return err - - def format_for_json_mqr(self, found_docs) -> List[Dict]: - """ - Same as format_for_json, but for the new MQR pipeline. - """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.keys(): - print("found no pagenumber") - found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] - - contexts = [ - { - 'text': doc['text'], - 'readable_filename': doc['readable_filename'], - 'course_name ': doc['course_name'], - 's3_path': doc['s3_path'], - 'pagenumber': doc['pagenumber'], - 'url': doc['url'], # wouldn't this error out? - 'base_url': doc['base_url'], - } for doc in found_docs - ] - - return contexts - - def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: - """ - Get a stuffed prompt for a given user question and course name. - Args: - user_question (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of top_n most relevant documents. - """ - # MMR with metadata filtering based on course_name - vec_start_time = time.monotonic() - found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search) - print( - f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds" - ) - - requests = [] - for doc in found_docs: - print("doc", doc) - dictionary = { - "model": "gpt-3.5-turbo", - "messages": [{ - "role": - "system", - "content": - "You are a factual summarizer of partial documents. 
Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." - }, { - "role": - "user", - "content": - f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.", - }], - "n": 1, - "max_tokens": 600, - "metadata": doc.metadata - } - requests.append(dictionary) - - oai = OpenAIAPIProcessor( - input_prompts_list=requests, - request_url='https://api.openai.com/v1/chat/completions', - api_key=os.getenv("OPENAI_API_KEY"), - max_requests_per_minute=1500, - max_tokens_per_minute=90000, - token_encoding_name='cl100k_base', # nosec -- reasonable bandit error suppression - max_attempts=5, - logging_level=20) - - chain_start_time = time.monotonic() - asyncio.run(oai.process_api_requests_from_file()) - results: list[str] = oai.results - print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds") - - print(f"Cleaned results: {oai.cleaned_results}") - - all_texts = "" - separator = '---' # between each context - token_counter = 0 #keeps track of tokens in each summarization - max_tokens = 7_500 #limit, will keep adding text to string until 8000 tokens reached. - for i, text in enumerate(oai.cleaned_results): - if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'): - # no useful text, it replied with a summary of "None" - continue - if text is not None: - if "pagenumber" not in results[i][-1].keys(): # type: ignore - results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') # type: ignore - num_tokens, prompt_cost = count_tokens_and_cost(text) # type: ignore - if token_counter + num_tokens > max_tokens: - print(f"Total tokens yet in loop {i} is {num_tokens}") - break # Stop building the string if it exceeds the maximum number of tokens - token_counter += num_tokens - filename = str(results[i][-1].get('readable_filename', '')) # type: ignore - pagenumber_or_timestamp = str(results[i][-1].get('pagenumber', '')) # type: ignore - pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else '' - doc = f"Document : filename: {filename}" + pagenumber - summary = f"\nSummary: {text}" - all_texts += doc + summary + '\n' + separator + '\n' - - stuffed_prompt = """Please answer the following question. -Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant. -It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know. -Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. -That said, be practical and really do your best, and don't let caution get too much in the way of being useful. 
-To help answer the question, here's a few passages of high quality documents:\n{all_texts} -Now please respond to my question: {user_question}""" - - # "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. \nHere's a few passages of the high quality 'your documents':\n" - - return stuffed_prompt - - def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str: - """ - Returns - String: A fully formatted prompt string. - """ - try: - top_n = 90 - start_time_overall = time.monotonic() - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - user_query_embedding = o.embed_documents(search_query)[0] # type: ignore - myfilter = models.Filter(must=[ - models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), - ]) - - found_docs = self.qdrant_client.search( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - query_filter=myfilter, - with_vectors=False, - query_vector=user_query_embedding, - limit=top_n # Return 5 closest points - ) - print("Search results: ", found_docs) - if len(found_docs) == 0: - return search_query - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - - # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - valid_docs = [] - for d in found_docs: - if d.payload is not None: - if "pagenumber" not in d.payload.keys(): - d.payload["pagenumber"] = d.payload["pagenumber_or_timestamp"] - - doc_string = f"---\nDocument: {d.payload['readable_filename']}{', page: ' + str(d.payload['pagenumber']) if d.payload['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - # print(f"Page: {d.payload.get('page_content', ' '*100)[:100]}...") - print( - f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, prompt cost of chunk: {prompt_cost}. 
📄 File: {d.payload.get('readable_filename', '')}" - ) - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append( - Document(page_content=d.payload.get('page_content', ''), metadata=d.payload)) - else: - continue - - # Convert the valid_docs to full prompt - separator = '---\n' # between each context - context_text = separator.join( - f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber']) if d.metadata['pagenumber'] else ''}\n{d.page_content}\n" - for d in valid_docs) - - # Create the stuffedPrompt - stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query) - - TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') # type: ignore - print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") - print("total docs: ", len(found_docs)) - print("num docs used: ", len(valid_docs)) - - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - return stuffedPrompt - except Exception as e: - # return full traceback to front end - err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def format_for_json(self, found_docs: List[Document]) -> List[Dict]: - """Formatting only. - {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 20', 'text': 'In Assembly language, the code does that...'},]} - - Args: - found_docs (List[Document]): _description_ - - Raises: - Exception: _description_ - - Returns: - List[Dict]: _description_ - """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.metadata.keys(): - print("found no pagenumber") - found_doc.metadata['pagenumber'] = found_doc.metadata['pagenumber_or_timestamp'] - - contexts = [ - { - 'text': doc.page_content, - 'readable_filename': doc.metadata['readable_filename'], - 'course_name ': doc.metadata['course_name'], - 's3_path': doc.metadata['s3_path'], - 'pagenumber': doc.metadata['pagenumber'], # this because vector db schema is older... - # OPTIONAL PARAMS... - 'url': doc.metadata.get('url'), # wouldn't this error out? 
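-            # (No: .get() returns None when 'url' is missing from the metadata, so this line won't raise.)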
- 'base_url': doc.metadata.get('base_url'), - } for doc in found_docs - ] - - return contexts - - -if __name__ == '__main__': - pass diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py deleted file mode 100644 index e341657c..00000000 --- a/ai_ta_backend/web_scrape.py +++ /dev/null @@ -1,664 +0,0 @@ -import mimetypes -import os -import re -import shutil -import time -import uuid -from tempfile import NamedTemporaryFile -from typing import List, Optional -from zipfile import ZipFile - -import boto3 # type: ignore -import requests -import supabase -from bs4 import BeautifulSoup - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.vector_database import Ingest - - -class WebScrape(): - - def __init__(self) -> None: - - # S3 - self.s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Create a Supabase client - self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) - - self.ingester = Ingest() - - self.url_contents = [] - self.invalid_urls = [] - self.existing_urls = [] - self.max_urls = 0 - self.original_amount = 0 - self.supa_urls = 0 - self.queue = {} - - return None - - def get_file_extension(self, filename): - match = re.search(r'\.([a-zA-Z0-9]+)$', filename) - valid_filetypes = list(mimetypes.types_map.keys()) - valid_filetypes = valid_filetypes + ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx'] - if match: - filetype = "." + match.group(1) - if filetype in valid_filetypes: - return filetype - else: - return '.html' - else: - return '.html' - - def valid_url(self, url): - """ - Returns the URL and it's content if it's good, otherwise returns false. Prints the status code. 
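-    Returns a (url, content, filetype) tuple on success, or (False, False, False) when the URL cannot be fetched.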
- """ - try: - response = requests.get(url, allow_redirects=True, timeout=20) - - redirect_loop_counter = 0 - while response.status_code == 301: - # Check for permanent redirect - if redirect_loop_counter > 3: - print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌") - return (False, False, False) - redirect_url = response.headers['Location'] - response = requests.head(redirect_url) - redirect_loop_counter += 1 - if response.status_code == 200: - filetype = self.get_file_extension(response.url) - print("file extension:", filetype) - if filetype == '.html': - content = BeautifulSoup(response.content, "html.parser") - if " average: - # print("Too many repeated urls, exiting web scraper") - # return True - # else: - # return False - - def count_hard_stop_len(self): - count = len(self.url_contents) - if self.url_contents != []: - print("📈📈 Counted URLs", count, "out of", self.original_amount, "📈📈") - if count > self.original_amount: - print("Too many repeated urls, exiting web scraper") - return True - else: - return False - - def check_and_ingest(self, url: str, course_name: str, timeout: int, base_url_on: str): - if url not in self.invalid_urls and url not in self.existing_urls: - second_url, content, filetype = self.valid_url(url) - else: - print("This URL is invalid or already existing in the database") - self.existing_urls.append((url)) - return '', '', '' - - if second_url: - time.sleep(timeout) - url_content = (second_url, content, filetype) - if self.check_file_not_exists(url_content): - path_name = self.title_path_name(url_content) - self.url_contents.append(url_content) - self.existing_urls.append(url_content) - # url_contents = remove_duplicates(url_contents, _existing_urls) - self.ingest_file(url_content, course_name, path_name, base_url_on) - print("✅✅ Scraped:", second_url, "✅✅") - self.max_urls -= 1 - else: - print("This URL is already existing in the database") - self.existing_urls.append((second_url, content, filetype)) - else: - self.invalid_urls.append(url) - print("This URL is invalid") - - return url, content, filetype - - def scrape_user_provided_page(self, url: str, course_name: str, timeout: int, base: str): - urls = [] - url, content, filetype = self.check_and_ingest(url, course_name, timeout, base) - - if url: - if filetype == '.html': - try: - body = content.find("body") - header = content.find("head") - footer = content.find("footer") - nav = content.find("nav") - except Exception as e: - print("Error:", e) - body = "" - header = "" - # Check for 403 Forbidden urls - try: - if content.title.string.lower() == "403 forbidden" or content.title.string.lower( - ) == 'page not found': # type: ignore - print("403 Forbidden") - self.invalid_urls.append(url) - else: - pass - except Exception as e: - print("Error:", e) - pass - if body != "" and header != "": - urls = self.find_urls(body, base, urls) # type: ignore - urls = self.find_urls(header, base, urls) # type: ignore - self.invalid_urls.append(self.find_urls(footer, base)) # type: ignore - self.invalid_urls.append(self.find_urls(nav, base)) # type: ignore - else: - urls = self.find_urls(content, base, urls) # type: ignore - - return urls - - def non_user_provided_page_urls(self, url: str, base: str, soup, filetype: str): - urls = [] - if filetype == '.html': - try: - body = soup.find("body") - header = soup.find("head") - footer = soup.find("footer") - nav = soup.find("nav") - except Exception as e: - print("Error:", e) - body = "" - header = "" - - # Check for 403 Forbidden urls 
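-      # (Relies on the page <title> text; the try/except below guards against pages with no <title> tag.)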
- try: - if soup.title.string.lower() == "403 forbidden" or soup.title.string.lower( - ) == 'page not found': # type: ignore - print("403 Forbidden") - self.invalid_urls.append(url) - else: - pass - except Exception as e: - print("Error:", e) - pass - if body != "" and header != "": - urls = self.find_urls(body, base, urls) - urls = self.find_urls(header, base, urls) - self.invalid_urls.append(self.find_urls(footer, base)) # type: ignore - self.invalid_urls.append(self.find_urls(nav, base)) # type: ignore - else: - urls = self.find_urls(soup, base, urls) - - return urls - - # def depth_crawler(self, url:str, course_name:str, max_depth:int=3, timeout:int=1, base_url_on:str=None, _depth:int=0, _soup=None, _filetype:str=None): # type: ignore - # '''Function gets titles of urls and the urls themselves''' - # # Prints the depth of the current search - # print("depth: ", _depth) - # if base_url_on: - # base_url_on = str(base_url_on) - - # # Create a base site for incomplete hrefs - # base = self.base_url(url) - # if base == "": - # raise ValueError("This URL is invalid") - - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - - # try: - # if _soup: - # urls = self.non_user_provided_page_urls(url, base, _soup, _filetype) - # else: - # urls = self.scrape_user_provided_page(url, course_name, timeout, base) - # except ValueError as e: - # raise e - - # temp_urls = [] - # # We grab content out of these urls - # try: - # for url in urls: - # if self.max_urls > 0: - # if base_url_on: - # if url.startswith(base): - # new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - # if new_url: - # temp_urls.append((new_url, content, filetype)) - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - # if new_url: - # temp_urls.append((new_url, content, filetype)) - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # print("Max URLs reached") - # raise ValueError("Max URLs reached") - # except ValueError as e: - # print("Error:", e) - - # # recursively go through crawler until we reach the max amount of urls. 
- # for url in temp_urls: - # if self.max_urls > 0: - # if _depth < max_depth: - # self.depth_crawler(url[0], course_name, max_depth, timeout, base_url_on, _depth+1, url[1], url[2]) - # print(self.max_urls, "urls left") - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # print("Depth exceeded:", _depth+1, "out of", max_depth) - # break - # else: - # print("Max urls reached") - # break - - # return None - - def breadth_crawler(self, - url: str, - course_name: str, - timeout: int = 1, - base_url_on: str = None, - max_depth: int = 3, - base_option: bool = False): # type: ignore - depth = 0 - if base_url_on: - base_url_on = str(base_url_on) - - # Create a base site for incomplete hrefs - base = self.base_url(url) - if base == "": - raise ValueError("This URL is invalid") - - self.queue[depth] = self.scrape_user_provided_page(url, course_name, timeout, base) - self.queue[depth + 1] = [] - print("queue", self.queue) - print("len", len(self.queue[depth]), len(self.queue[depth + 1])) - - while self.count_hard_stop_len() is False: - print("queue", len(self.queue[depth]), len(self.queue[depth + 1])) - - if self.queue[depth] == []: - depth += 1 - print("depth:", depth) - self.queue[depth + 1] = [] - if depth > max_depth: - print("Depth exceeded:", depth, "out of", max_depth) - raise ValueError("Depth exceeded") - - if self.queue[depth] == []: - print("queue is empty") - raise ValueError("Queue is empty") - - url = self.queue[depth].pop(0) - if self.max_urls > 0: - if depth <= max_depth: - if base_url_on: - if self.base_requirements(url, base_url_on): - print("url", url) - print("requirements", self.base_requirements(url, base_url_on)) - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - self.queue[depth + 1] += self.non_user_provided_page_urls(new_url, base, content, filetype) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - if base_option: - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - self.queue[depth + 1] += self.non_user_provided_page_urls(new_url, base, content, filetype) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - print("Depth exceeded:", depth + 1, "out of", max_depth) - break - else: - print("Max URLs reached") - raise ValueError("Max URLs reached") - - return None - - def main_crawler(self, - url: str, - course_name: str, - max_urls: int = 100, - max_depth: int = 3, - timeout: int = 1, - stay_on_baseurl: bool = True, - depth_or_breadth: str = 'breadth'): - """ - Crawl a site and scrape its content and PDFs, then upload the data to S3 and ingest it. - - Args: - url (str): The URL of the site to crawl. - course_name (str): The name of the course to associate with the crawled data. - max_urls (int, optional): The maximum number of URLs to crawl. Defaults to 100. - max_depth (int, optional): The maximum depth of URLs to crawl. Defaults to 3. - timeout (int, optional): The number of seconds to wait between requests. Defaults to 1. 
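-        stay_on_baseurl (bool, optional): If True, only crawl URLs that share the starting URL's base. Defaults to True.
-        depth_or_breadth (str, optional): Crawl strategy, either 'depth' or 'breadth'. Defaults to 'breadth'.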
- - Returns: - None - """ - print("\n") - max_urls = int(max_urls) - max_depth = int(max_depth) - timeout = int(timeout) - stay_on_baseurl = bool(stay_on_baseurl) - self.max_urls = max_urls - self.original_amount = max_urls - if stay_on_baseurl: - base_url_str = self.base_url(url) - print(base_url_str) - else: - base_url_str = '' - - # Check for GitHub repository coming soon - if is_github_repo(url): - print("Begin Ingesting GitHub page") - results = self.ingester.ingest_github(url, course_name) - print("Finished ingesting GitHub page") - return results - else: - try: - print("Gathering existing urls from Supabase") - urls = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url').eq( - 'course_name', course_name).execute() # type: ignore - - if urls.data == []: - self.existing_urls = [] - else: - self.existing_urls = [] - for row in urls.data: - # whole = '' - # for text in row['contexts']: - # whole += text['text'] - self.existing_urls.append((row['url'], 'whole', 'supa')) - print("Finished gathering existing urls from Supabase") - except Exception as e: - print("Error:", e) - print("Could not gather existing urls from Supabase") - self.existing_urls = [] - try: - print("Begin Ingesting Web page") - self.supa_urls = len(self.existing_urls) - if depth_or_breadth.lower() == 'depth': - self.depth_crawler(url=url, - course_name=course_name, - max_depth=max_depth, - timeout=timeout, - base_url_on=base_url_str) - elif depth_or_breadth.lower() == 'breadth': - self.breadth_crawler(url=url, - course_name=course_name, - timeout=timeout, - base_url_on=base_url_str, - max_depth=max_depth) - else: - raise ValueError("Invalid depth_or_breadth argument") - except ValueError as e: - print("Error:", e) - - if len(self.url_contents) < self.original_amount: - print("Max URLS not reached, returning all urls found:", len(self.url_contents), "out of", self.original_amount) - elif len(self.url_contents) == self.original_amount: - print("Max URLS reached:", len(self.url_contents), "out of", self.original_amount) - else: - print("Exceeded Max URLS, found:", len(self.url_contents), "out of", self.original_amount) - print(len(self.url_contents), "urls found") - print(f"Successfully uploaded files to s3: {len(self.url_contents)}") - print("Finished /web-scrape") - - -def is_github_repo(url): - # Split the URL by '?' to ignore any parameters - base_url = url.split('?')[0] - - # The regular expression now allows for optional 'http', 'https', and 'www' prefixes. - # It also accounts for optional trailing slashes. - # The pattern is also case-insensitive. 
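-  # For example, it matches 'github.com/user/repo' and 'https://www.github.com/User/Repo/',
-  # but not 'https://github.com/user/repo/issues/1' (extra path segment) or URLs on other hosts.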
- pattern = re.compile(r'^(https?://)?(www\.)?github\.com/[^/?]+/[^/?]+/?$', re.IGNORECASE) - - # The function returns True or False based on whether the pattern matches the base_url - return base_url if pattern.match(base_url) else None - - -def mit_course_download(url: str, course_name: str, local_dir: str): - """ - Download an MIT course using its url - """ - ingester = Ingest() - if url.endswith("download"): - pass - else: - url = url + "download" - - r = requests.get(url) - soup = BeautifulSoup(r.text, "html.parser") - - zip = '' - for ref in soup.find_all("a"): - if ref.attrs['href'].endswith("zip"): - zip = ref.attrs['href'] - - site = zip - print('site', site) - r = requests.get(url=site, stream=True) - - zip_file = local_dir + ".zip" - - try: - with open(zip_file, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - print("course downloaded!") - except Exception as e: - print("Error:", e, site) - - with ZipFile(zip_file, 'r') as zObject: - zObject.extractall(path=local_dir) - - shutil.move(local_dir + "/" + "robots.txt", local_dir + "/static_resources") - s3_paths = upload_data_files_to_s3(course_name, local_dir + "/static_resources") - success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore - - shutil.move(zip_file, local_dir) - shutil.rmtree(local_dir) - del ingester - print("Finished Ingest") - return success_fail - - -if __name__ == '__main__': - pass From f0542a80acdfa80fb3fcb4d314ef4034e4f272d8 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 12:33:46 -0600 Subject: [PATCH 02/15] Move utils_tokenization to utils --- ai_ta_backend/database/sql.py | 1 + ai_ta_backend/service/retrieval_service.py | 2 +- ai_ta_backend/{ => utils}/utils_tokenization.py | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename ai_ta_backend/{ => utils}/utils_tokenization.py (100%) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index fbd2035a..89b58d64 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -1,5 +1,6 @@ import os +import supabase from injector import inject diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index b6e03f05..4f12ea8b 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -18,7 +18,7 @@ from ai_ta_backend.service.nomic_service import NomicService from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.sentry_service import SentryService -from ai_ta_backend.utils_tokenization import count_tokens_and_cost +from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost OPENAI_API_TYPE = "azure" # "openai" or "azure" diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils/utils_tokenization.py similarity index 100% rename from ai_ta_backend/utils_tokenization.py rename to ai_ta_backend/utils/utils_tokenization.py From f6a787e3203be633b052059057eb087cf3da9829 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 12:35:54 -0600 Subject: [PATCH 03/15] Add Flask-Injector to dependencies --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c3cfe6f4..848c10d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ Flask==3.0.0 flask-cors==4.0.0 +Flask-Injector==0.15.0 gunicorn==21.2.0 protobuf==4.25.0 aiohttp==3.8.6 From a14cc442ecf81f3877f3bd568f13e8192cffff8a Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 
Mar 2024 13:30:06 -0600 Subject: [PATCH 04/15] Added executors for async operations --- ai_ta_backend/executors/flask_executor.py | 23 +++++++++++++ .../executors/process_pool_executor.py | 33 +++++++++++++++++++ .../executors/thread_pool_executor.py | 31 +++++++++++++++++ ai_ta_backend/main.py | 23 ++++++++++--- 4 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 ai_ta_backend/executors/flask_executor.py create mode 100644 ai_ta_backend/executors/process_pool_executor.py create mode 100644 ai_ta_backend/executors/thread_pool_executor.py diff --git a/ai_ta_backend/executors/flask_executor.py b/ai_ta_backend/executors/flask_executor.py new file mode 100644 index 00000000..b9a78540 --- /dev/null +++ b/ai_ta_backend/executors/flask_executor.py @@ -0,0 +1,23 @@ +from flask_executor import Executor +from injector import inject + + +class ExecutorInterface: + + def submit(self, fn, *args, **kwargs): + raise NotImplementedError + + +class FlaskExecutorAdapter(ExecutorInterface): + """ + Adapter for Flask Executor, suitable for I/O-bound tasks that benefit from asynchronous execution. + Use this executor for tasks that involve waiting for I/O operations (e.g., network requests, file I/O), + where the overhead of creating new threads or processes is justified by the time spent waiting. + """ + + @inject + def __init__(self, executor: Executor): + self.executor = executor + + def submit(self, fn, *args, **kwargs): + return self.executor.submit(fn, *args, **kwargs) diff --git a/ai_ta_backend/executors/process_pool_executor.py b/ai_ta_backend/executors/process_pool_executor.py new file mode 100644 index 00000000..08464017 --- /dev/null +++ b/ai_ta_backend/executors/process_pool_executor.py @@ -0,0 +1,33 @@ +from concurrent.futures import ProcessPoolExecutor + +from injector import inject + + +class ProcessPoolExecutorInterface: + + def submit(self, fn, *args, **kwargs): + raise NotImplementedError + + +class ProcessPoolExecutorAdapter(ProcessPoolExecutorInterface): + """ + Adapter for Python's ProcessPoolExecutor, suitable for CPU-bound tasks that benefit from parallel execution. + Use this executor for tasks that require significant computation and can be efficiently parallelized across multiple CPUs. + Not for I/O-bound tasks like database queries, file I/O, or network requests, as the overhead of creating and managing processes can outweigh the benefits. + + This executor is ideal for scenarios where the task execution time would significantly benefit from being distributed + across multiple processes, thereby bypassing the GIL (Global Interpreter Lock) and utilizing multiple CPU cores. + + Note: ProcessPoolExecutor is best used with tasks that are relatively heavy and can be executed independently of each other. + """ + + def __init__(self, max_workers=None): + self.executor = ProcessPoolExecutor(max_workers=max_workers) + + def submit(self, fn, *args, **kwargs): + raise NotImplementedError( + "ProcessPoolExecutorAdapter does not support 'submit' directly due to its nature. Use 'map' or other methods as needed." 
+ ) + + def map(self, fn, *iterables, timeout=None, chunksize=1): + return self.executor.map(fn, *iterables, timeout=timeout, chunksize=chunksize) diff --git a/ai_ta_backend/executors/thread_pool_executor.py b/ai_ta_backend/executors/thread_pool_executor.py new file mode 100644 index 00000000..124ac2b3 --- /dev/null +++ b/ai_ta_backend/executors/thread_pool_executor.py @@ -0,0 +1,31 @@ +from concurrent.futures import ThreadPoolExecutor + +from injector import inject + + +class ThreadPoolExecutorInterface: + + def submit(self, fn, *args, **kwargs): + raise NotImplementedError + + +class ThreadPoolExecutorAdapter(ThreadPoolExecutorInterface): + """ + Adapter for Python's ThreadPoolExecutor, suitable for I/O-bound tasks that can be performed concurrently. + Use this executor for tasks that are largely waiting on I/O operations, such as database queries or file reads, + where the GIL (Global Interpreter Lock) does not become a bottleneck. + + Not for CPU-bound tasks like heavy computation, as the GIL would prevent true parallel execution. + + This executor is particularly useful when you want more control over the number of concurrent threads + than what Flask Executor provides, or when you're not working within a Flask application context. + """ + + def __init__(self, max_workers=None): + self.executor = ThreadPoolExecutor(max_workers=max_workers) + + def submit(self, fn, *args, **kwargs): + return self.executor.submit(fn, *args, **kwargs) + + def map(self, fn, *iterables, timeout=None, chunksize=1): + return self.executor.map(fn, *iterables, timeout=timeout, chunksize=chunksize) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index d72442af..c5d9cb46 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -21,6 +21,18 @@ from ai_ta_backend.database.aws import AWSStorage from ai_ta_backend.database.sql import SQLDatabase from ai_ta_backend.database.vector import VectorDatabase +from ai_ta_backend.executors.flask_executor import ( + ExecutorInterface, + FlaskExecutorAdapter, +) +from ai_ta_backend.executors.process_pool_executor import ( + ProcessPoolExecutorAdapter, + ProcessPoolExecutorInterface, +) +from ai_ta_backend.executors.thread_pool_executor import ( + ThreadPoolExecutorAdapter, + ThreadPoolExecutorInterface, +) from ai_ta_backend.service.export_service import ExportService from ai_ta_backend.service.nomic_service import NomicService from ai_ta_backend.service.posthog_service import PosthogService @@ -130,7 +142,7 @@ def getAll(service: RetrievalService) -> Response: @app.route('/delete', methods=['DELETE']) -def delete(service: RetrievalService): +def delete(service: RetrievalService, flaskExecutor: ExecutorInterface): """ Delete a single file from all our database: S3, Qdrant, and Supabase (for now). Note, of course, we still have parts of that file in our logs. @@ -149,7 +161,7 @@ def delete(service: RetrievalService): start_time = time.monotonic() # background execution of tasks!! - executor.submit(service.delete_data, course_name, s3_path, source_url) + flaskExecutor.submit(service.delete_data, course_name, s3_path, source_url) print(f"From {course_name}, deleted file: {s3_path}") print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds") # we need instant return. Delets are "best effort" assume always successful... 
sigh :( @@ -191,7 +203,7 @@ def createDocumentMap(service: NomicService): @app.route('/onResponseCompletion', methods=['POST']) -def logToNomic(service: NomicService): +def logToNomic(service: NomicService, flaskExecutor: ExecutorInterface): data = request.get_json() course_name = data['course_name'] conversation = data['conversation'] @@ -206,7 +218,7 @@ def logToNomic(service: NomicService): print(f"In /onResponseCompletion for course: {course_name}") # background execution of tasks!! - response = executor.submit(service.log_convo_to_nomic, course_name, data) + response = flaskExecutor.submit(service.log_convo_to_nomic, course_name, data) response = jsonify({'outcome': 'success'}) response.headers.add('Access-Control-Allow-Origin', '*') return response @@ -313,6 +325,9 @@ def configure(binder: Binder) -> None: binder.bind(VectorDatabase, to=VectorDatabase, scope=SingletonScope) binder.bind(SQLDatabase, to=SQLDatabase, scope=SingletonScope) binder.bind(AWSStorage, to=AWSStorage, scope=SingletonScope) + binder.bind(ExecutorInterface, to=FlaskExecutorAdapter(executor), scope=SingletonScope) + binder.bind(ThreadPoolExecutorInterface, to=ThreadPoolExecutorAdapter, scope=SingletonScope) + binder.bind(ProcessPoolExecutorInterface, to=ProcessPoolExecutorAdapter, scope=SingletonScope) FlaskInjector(app=app, modules=[configure]) From ad220a675ef182550317cdc3e5ca0a8ce8845e58 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 14:51:38 -0600 Subject: [PATCH 05/15] Adding injection to ExportService __init__, and add SQLDatabase injection to document map functions in NomicService --- ai_ta_backend/database/sql.py | 9 ++++++--- ai_ta_backend/service/export_service.py | 4 +++- ai_ta_backend/service/nomic_service.py | 26 +++++++++++-------------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index 89b58d64..bf750aa3 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -78,6 +78,9 @@ def getAllConversationsBetweenIds(self, course_name: str, first_id: int, last_id return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte( 'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute() - def getDocsForIdsGte(self, course_name: str, first_id: int): - return self.supabase_client.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order( - 'id', desc=False).limit(100).execute() + def getDocsForIdsGte(self, course_name: str, first_id: int, fields: str = "*", limit: int = 100): + return self.supabase_client.table("documents").select(fields).eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(limit).execute() + + def insertProjectInfo(self, project_info): + return self.supabase_client.table("projects").insert(project_info).execute() diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index 0c3718e0..79bf9d74 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -7,6 +7,7 @@ import pandas as pd import requests +from injector import inject from ai_ta_backend.database.aws import AWSStorage from ai_ta_backend.database.sql import SQLDatabase @@ -16,6 +17,7 @@ class ExportService: + @inject def __init__(self, sql: SQLDatabase, s3: AWSStorage, sentry=SentryService): self.sql = sql self.s3 = s3 @@ -257,7 +259,7 @@ def export_convo_history_json(self, course_name: str, from_date='', 
to_date=''): return {"response": (zip_file_path, zip_filename, os.getcwd())} except Exception as e: print(e) - sentry_sdk.capture_exception(e) + self.sentry.capture_exception(e) return {"response": "Error downloading file!"} else: return {"response": "No data found between the given dates."} diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index 973f17f5..5b6c4e38 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -10,6 +10,7 @@ from langchain.embeddings import OpenAIEmbeddings from nomic import AtlasProject, atlas +from ai_ta_backend.database.sql import SQLDatabase from ai_ta_backend.service.sentry_service import SentryService LOCK_EXCEPTIONS = [ @@ -59,9 +60,10 @@ def backoff_strategy(): class NomicService(): @inject - def __init__(self, sentry: SentryService): + def __init__(self, sentry: SentryService, sql: SQLDatabase): nomic.login(os.getenv('NOMIC_API_KEY')) self.sentry = sentry + self.sql = sql @backoff.on_exception(backoff_strategy, Exception, @@ -424,22 +426,16 @@ def create_document_map(self, course_name: str): # nomic.login(os.getenv('NOMIC_API_KEY')) NOMIC_MAP_NAME_PREFIX = 'Document Map for ' - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - try: # check if map exists - response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + + response = self.sql.getProjectsMapForCourse(course_name) if response.data: return "Map already exists for this course." # fetch relevant document data from Supabase - response = supabase_client.table("documents").select("id", - count="exact").eq("course_name", - course_name).order('id', - desc=False).execute() + response = self.sql.getDocumentsBetweenDates(course_name, '', '', "documents") + if not response.count: return "No documents found for this course." 
@@ -458,9 +454,9 @@ def create_document_map(self, course_name: str): # iteratively query in batches of 25 while curr_total_doc_count < total_doc_count: - response = supabase_client.table("documents").select( - "id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( - 'id', first_id).order('id', desc=False).limit(25).execute() + response = self.sql.getDocsForIdsGte(course_name, first_id, + "id, created_at, s3_path, url, readable_filename, contexts", 25) + df = pd.DataFrame(response.data) combined_dfs.append(df) # list of dfs @@ -519,7 +515,7 @@ def create_document_map(self, course_name: str): project_id = project.id project.rebuild_maps() project_info = {'course_name': course_name, 'doc_map_id': project_id} - response = supabase_client.table("projects").insert(project_info).execute() + response = self.sql.insertProjectInfo(project_info) print("Response from supabase: ", response) return "success" else: From 484b2a257b6581f2450349de8f83643ea001b556 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 17:55:45 -0600 Subject: [PATCH 06/15] Fix sentry service instantiation issue in ExportService constructor --- ai_ta_backend/service/export_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index 79bf9d74..61469ec4 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -18,7 +18,7 @@ class ExportService: @inject - def __init__(self, sql: SQLDatabase, s3: AWSStorage, sentry=SentryService): + def __init__(self, sql: SQLDatabase, s3: AWSStorage, sentry: SentryService): self.sql = sql self.s3 = s3 self.sentry = sentry From 016e48bdb06e3328e4b4db456706b30c74d96ada Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 16:19:57 -0800 Subject: [PATCH 07/15] Clean up env vars and minor type errors --- .env.template | 2 +- ai_ta_backend/beam/ingest.py | 21 +++++++++---------- ai_ta_backend/database/sql.py | 21 +++++++++---------- ai_ta_backend/service/export_service.py | 13 +++++++----- ai_ta_backend/service/nomic_service.py | 2 +- ai_ta_backend/service/retrieval_service.py | 8 +++---- .../utils/context_parent_doc_padding.py | 2 +- ai_ta_backend/utils/utils_tokenization.py | 1 + 8 files changed, 36 insertions(+), 34 deletions(-) diff --git a/.env.template b/.env.template index ba04c704..b007d62b 100644 --- a/.env.template +++ b/.env.template @@ -5,7 +5,7 @@ SUPABASE_READ_ONLY= SUPABASE_JWT_SECRET= MATERIALS_SUPABASE_TABLE=uiuc_chatbot -NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents +SUPABASE_DOCUMENTS_TABLE=documents # QDRANT QDRANT_COLLECTION_NAME=uiuc-chatbot diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 0aaf7d58..77f74de6 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -959,7 +959,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): } response = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore + os.getenv('SUPABASE_DOCUMENTS_TABLE')).insert(document).execute() # type: ignore # add to Nomic document map if len(response.data) > 0: @@ -988,7 +988,7 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any] For given metadata, fetch docs from Supabase based on S3 path or URL. If docs exists, concatenate the texts and compare with current texts, if same, return True. 
""" - doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '') + doc_table = os.getenv('SUPABASE_DOCUMENTS_TABLE', '') course_name = metadatas[0]['course_name'] incoming_s3_path = metadatas[0]['s3_path'] url = metadatas[0]['url'] @@ -1087,8 +1087,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): try: # delete from Nomic response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq( - 's3_path', s3_path).eq('course_name', course_name).execute() + os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq('s3_path', s3_path).eq( + 'course_name', course_name).execute() data = response.data[0] #single record fetched nomic_ids_to_delete = [] context_count = len(data['contexts']) @@ -1102,8 +1102,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): sentry_sdk.capture_exception(e) try: - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 's3_path', s3_path).eq('course_name', course_name).execute() + self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq( + 'course_name', course_name).execute() except Exception as e: print("Error in deleting file from supabase:", e) sentry_sdk.capture_exception(e) @@ -1131,9 +1131,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): sentry_sdk.capture_exception(e) try: # delete from Nomic - response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq( - 'url', source_url).eq('course_name', course_name).execute() + response = self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, url, contexts").eq( + 'url', source_url).eq('course_name', course_name).execute() data = response.data[0] #single record fetched nomic_ids_to_delete = [] context_count = len(data['contexts']) @@ -1148,8 +1147,8 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): try: # delete from Supabase - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 'url', source_url).eq('course_name', course_name).execute() + self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('url', source_url).eq( + 'course_name', course_name).execute() except Exception as e: print("Error in deleting file from supabase:", e) sentry_sdk.capture_exception(e) diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index bf750aa3..b8d4579c 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -13,26 +13,25 @@ def __init__(self, db_url: str): supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) def getAllMaterialsForCourse(self, course_name: str): - return self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select( - 'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute() + return self.supabase_client.table( + os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq( + 'course_name', course_name).execute() def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str): - return self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq( - 's3_path', s3_path).eq('course_name', course_name).execute() + return 
self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq( + 's3_path', s3_path).eq('course_name', course_name).execute() def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): - return self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq(key, value).eq( - 'course_name', course_name).execute() + return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq( + key, value).eq('course_name', course_name).execute() def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str): - return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq(key, value).eq( + return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq(key, value).eq( 'course_name', course_name).execute() def deleteMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str): - return self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 's3_path', s3_path).eq('course_name', course_name).execute() + return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq( + 'course_name', course_name).execute() def getProjectsMapForCourse(self, course_name: str): return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index 61469ec4..ad095d79 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -148,7 +148,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path): #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}" s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}" - self.s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file) + self.s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file) # remove local files os.remove(file_path) @@ -157,12 +157,15 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path): print("file uploaded to s3: ", s3_file) # generate presigned URL - s3_url = self.s3.generatePresignedUrl('get_object', os.getenv('S3_BUCKET_NAME'), s3_path, 3600) + s3_url = self.s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600) # get admin email IDs - headers = {"Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", "Content-Type": "application/json"} + headers = { + "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}", + "Content-Type": "application/json" + } - hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name + hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name response = requests.get(hget_url, headers=headers) course_metadata = response.json() course_metadata = json.loads(course_metadata['result']) @@ -187,7 +190,7 @@ def export_data_in_bg(self, response, download_type, course_name, s3_path): # send email to admins subject = "UIUC.chat Data Export Complete for " + course_name body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours." 
- email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails) + email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails) print("email_status: ", email_status) return "File uploaded to S3. Email sent to admins." diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index 5b6c4e38..45a4074f 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -71,7 +71,7 @@ def __init__(self, sentry: SentryService, sql: SQLDatabase): raise_on_giveup=False, giveup=giveup_hdlr, on_backoff=backoff_hdlr) - def log_convo_to_nomic(self, course_name: str, conversation) -> str: + def log_convo_to_nomic(self, course_name: str, conversation) -> str | None: # nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' """ diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 4f12ea8b..92174ba4 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -75,8 +75,8 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + - search_query) # type: ignore + token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + # type: ignore + search_query) valid_docs = [] num_tokens = 0 @@ -357,8 +357,8 @@ def vector_search(self, search_query, course_name): for d in search_results: try: metadata = d.payload - page_content = metadata["page_content"] - del metadata["page_content"] + page_content = metadata["page_content"] # type: ignore + del metadata["page_content"] # type: ignore if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore # aiding in the database migration... 
metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index 8521d99e..e015b018 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -6,7 +6,7 @@ import supabase -DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE'] +DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], # supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore diff --git a/ai_ta_backend/utils/utils_tokenization.py b/ai_ta_backend/utils/utils_tokenization.py index 7070ea7f..956cc196 100644 --- a/ai_ta_backend/utils/utils_tokenization.py +++ b/ai_ta_backend/utils/utils_tokenization.py @@ -9,6 +9,7 @@ def count_tokens_and_cost( completion: str = '', openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: """ + # TODO: improve w/ extra tokens used by model: https://github.com/openai/openai-cookbook/blob/d00e9a48a63739f5b038797594c81c8bb494fc09/examples/How_to_count_tokens_with_tiktoken.ipynb Returns the number of tokens in a text string. Only the first parameter is required, a string of text to measure. The completion and model name are optional. From 3017603122eab99b23a4fe4e055f9a6a2814f1de Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 16:27:50 -0800 Subject: [PATCH 08/15] Clean up trunk check recommendation --- ai_ta_backend/beam/ingest.py | 2 +- ai_ta_backend/executors/process_pool_executor.py | 2 -- ai_ta_backend/executors/thread_pool_executor.py | 2 -- ai_ta_backend/public_api/uiuc_chat_api.py | 3 ++- ai_ta_backend/service/export_service.py | 5 ++--- ai_ta_backend/service/nomic_service.py | 4 ++-- ai_ta_backend/service/retrieval_service.py | 4 +--- ai_ta_backend/utils/context_parent_doc_padding.py | 2 -- 8 files changed, 8 insertions(+), 16 deletions(-) diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 77f74de6..5a1569e0 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -22,7 +22,7 @@ import pytesseract import sentry_sdk import supabase -from beam import App, QueueDepthAutoscaler, RequestLatencyAutoscaler, Runtime +from beam import App, QueueDepthAutoscaler, Runtime # RequestLatencyAutoscaler, from bs4 import BeautifulSoup from git.repo import Repo from langchain.document_loaders import ( diff --git a/ai_ta_backend/executors/process_pool_executor.py b/ai_ta_backend/executors/process_pool_executor.py index 08464017..81b4860c 100644 --- a/ai_ta_backend/executors/process_pool_executor.py +++ b/ai_ta_backend/executors/process_pool_executor.py @@ -1,7 +1,5 @@ from concurrent.futures import ProcessPoolExecutor -from injector import inject - class ProcessPoolExecutorInterface: diff --git a/ai_ta_backend/executors/thread_pool_executor.py b/ai_ta_backend/executors/thread_pool_executor.py index 124ac2b3..0b40b5db 100644 --- a/ai_ta_backend/executors/thread_pool_executor.py +++ b/ai_ta_backend/executors/thread_pool_executor.py @@ -1,7 +1,5 @@ from concurrent.futures import ThreadPoolExecutor -from injector import inject - class ThreadPoolExecutorInterface: diff --git a/ai_ta_backend/public_api/uiuc_chat_api.py b/ai_ta_backend/public_api/uiuc_chat_api.py index 33029990..ee21d666 100644 --- a/ai_ta_backend/public_api/uiuc_chat_api.py +++ b/ai_ta_backend/public_api/uiuc_chat_api.py @@ -1,5 +1,6 @@ -import 
requests import json + +import requests """ # Example usage diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index ad095d79..6eb889c2 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -1,4 +1,3 @@ -import io import json import os import uuid @@ -40,7 +39,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''): # call background task to upload to s3 filename = course_name + '_' + str(uuid.uuid4()) + '_documents.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" + s3_filepath = f"courses/{course_name}/{filename}" # background task of downloading data - map it with above ID executor = ProcessPoolExecutor() executor.submit(self.export_data_in_bg, response, "documents", course_name, s3_filepath) @@ -214,7 +213,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''): if response.count > 1000: # call background task to upload to s3 filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" + s3_filepath = f"courses/{course_name}/{filename}" # background task of downloading data - map it with above ID executor = ProcessPoolExecutor() executor.submit(self.export_data_in_bg, response, "conversations", course_name, s3_filepath) diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index 45a4074f..feae1473 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -677,10 +677,10 @@ def data_prep_for_doc_map(self, df: pd.DataFrame): embeddings = [] texts = [] - for index, row in df.iterrows(): + for _index, row in df.iterrows(): current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if row['url'] == None: + if row['url'] is None: row['url'] = "" # iterate through all contexts and create separate entries for each context_count = 0 diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 92174ba4..9d39146c 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -6,10 +6,8 @@ import openai from injector import inject -from langchain import hub from langchain.chat_models import AzureChatOpenAI from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.load import dumps, loads from langchain.schema import Document from ai_ta_backend.database.aws import AWSStorage @@ -326,7 +324,7 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, project_id = response.data[0]['doc_map_id'] else: return "No document map found for this course" - res = self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) + self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) # delete from Supabase self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) diff --git a/ai_ta_backend/utils/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py index e015b018..fc0ba19c 100644 --- a/ai_ta_backend/utils/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -4,8 +4,6 @@ from functools import partial from multiprocessing import Manager -import supabase - DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] # SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], # 
supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore From 81fc4ef1e42ceb3d10b9ab1b904fa258e3120c9a Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 17:07:51 -0800 Subject: [PATCH 09/15] Reduce workers from 6 to 3, should be more than enough with reduced responsibilities (no /ingest) --- ai_ta_backend/main.py | 3 ++- run.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index c5d9cb46..69cd246f 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -59,7 +59,8 @@ def index() -> Response: Returns: JSON: _description_ """ - response = jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + response = jsonify( + {"hi there, this is a 404": "Welcome to UIUC.chat backend 🚅 Read the docs here: https://docs.uiuc.chat/ "}) response.headers.add('Access-Control-Allow-Origin', '*') return response diff --git a/run.sh b/run.sh index 0d77691a..f73f2369 100755 --- a/run.sh +++ b/run.sh @@ -5,4 +5,4 @@ # 200 MB object store memory.. necessary to statically allocate or will crash in Railway env restrictions. # ray start --head --num-cpus 6 --object-store-memory 300000000 export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend -exec gunicorn --workers=6 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 +exec gunicorn --workers=3 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 From 5951fe43fc477f8e2fce87965f37a57ef88dc9c9 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Thu, 7 Mar 2024 19:15:00 -0600 Subject: [PATCH 10/15] Update OpenAI API type to be fetched from environment variable --- ai_ta_backend/service/nomic_service.py | 4 ++-- ai_ta_backend/service/retrieval_service.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index feae1473..f9d33a59 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -196,7 +196,7 @@ def log_convo_to_nomic(self, course_name: str, conversation) -> str | None: }] # create embeddings - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings_model = OpenAIEmbeddings(openai_api_type=os.environ['OPENAI_API_TYPE']) embeddings = embeddings_model.embed_documents(user_queries) # add embeddings to the project - create a new function for this @@ -380,7 +380,7 @@ def create_nomic_map(self, course_name: str, log_data: list): metadata.append(metadata_row) metadata = pd.DataFrame(metadata) - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings_model = OpenAIEmbeddings(openai_api_type=os.environ['OPENAI_API_TYPE']) embeddings = embeddings_model.embed_documents(user_queries) # create Atlas project diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 9d39146c..e51123f5 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -18,8 +18,6 @@ from ai_ta_backend.service.sentry_service import SentryService from ai_ta_backend.utils.utils_tokenization import count_tokens_and_cost -OPENAI_API_TYPE = "azure" # "openai" or "azure" - class RetrievalService: """ @@ -41,7 +39,7 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, pos self.embeddings = OpenAIEmbeddings( model='text-embedding-ada-002', openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore - 
openai_api_type=OPENAI_API_TYPE, + openai_api_type=os.environ['OPENAI_API_TYPE'], openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore ) @@ -52,7 +50,7 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, pos openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore - openai_api_type=OPENAI_API_TYPE, + openai_api_type=os.environ['OPENAI_API_TYPE'], ) def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: @@ -336,7 +334,7 @@ def vector_search(self, search_query, course_name): top_n = 80 # EMBED openai_start_time = time.monotonic() - print("OPENAI_API_TYPE", OPENAI_API_TYPE) + print("OPENAI_API_TYPE", os.environ['OPENAI_API_TYPE']) user_query_embedding = self.embeddings.embed_query(search_query) openai_embedding_latency = time.monotonic() - openai_start_time From ae006940f1035be145f11162efee4217b3621d81 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 17:18:04 -0800 Subject: [PATCH 11/15] Reduce threads from 1_000 to 100. more sensible --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index f73f2369..49d79372 100755 --- a/run.sh +++ b/run.sh @@ -5,4 +5,4 @@ # 200 MB object store memory.. necessary to statically allocate or will crash in Railway env restrictions. # ray start --head --num-cpus 6 --object-store-memory 300000000 export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend -exec gunicorn --workers=3 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 +exec gunicorn --workers=3 --threads=100 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 From 75251d1e2c1c0d577c9d54b84f795e8ea22f5daa Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Thu, 7 Mar 2024 17:37:06 -0800 Subject: [PATCH 12/15] Update posthog /getTopContexts name in so we can track impovements from DI --- ai_ta_backend/main.py | 1 - ai_ta_backend/service/retrieval_service.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 69cd246f..ef311b6c 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -101,7 +101,6 @@ def getTopContexts(service: RetrievalService) -> Response: Exception Testing how exceptions are handled. """ - print("In getRopContexts in Main()") search_query: str = request.args.get('search_query', default='', type=str) course_name: str = request.args.get('course_name', default='', type=str) token_limit: int = request.args.get('token_limit', default=3000, type=int) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index e51123f5..91583323 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -97,7 +97,7 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = return [] self.posthog.capture( - event_name="success_get_top_contexts_OG", + event_name="getTopContexts_success_DI", properties={ "user_query": search_query, "course_name": course_name, @@ -204,7 +204,7 @@ def getTopContextsWithMQR(self, 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. 
""" - return 'fail' + raise NotImplementedError("Method deprecated for performance reasons. Hope to bring back soon.") # try: # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS @@ -334,7 +334,6 @@ def vector_search(self, search_query, course_name): top_n = 80 # EMBED openai_start_time = time.monotonic() - print("OPENAI_API_TYPE", os.environ['OPENAI_API_TYPE']) user_query_embedding = self.embeddings.embed_query(search_query) openai_embedding_latency = time.monotonic() - openai_start_time From f5a36fc36a1c432f8e3519cae383bb479557ffd5 Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Fri, 8 Mar 2024 11:33:24 -0600 Subject: [PATCH 13/15] Add new method to SQLDatabase, update environment variable usage, some debugging logs --- ai_ta_backend/database/aws.py | 6 ++-- ai_ta_backend/database/sql.py | 3 ++ ai_ta_backend/database/vector.py | 6 ++-- ai_ta_backend/service/nomic_service.py | 17 +++++------- ai_ta_backend/service/retrieval_service.py | 32 ++++++++++++++-------- 5 files changed, 36 insertions(+), 28 deletions(-) diff --git a/ai_ta_backend/database/aws.py b/ai_ta_backend/database/aws.py index 1b2f63dc..68e61b68 100644 --- a/ai_ta_backend/database/aws.py +++ b/ai_ta_backend/database/aws.py @@ -11,8 +11,8 @@ def __init__(self): # S3 self.s3_client = boto3.client( 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'], + aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'], ) def upload_file(self, file_path: str, bucket_name: str, object_name: str): @@ -22,7 +22,7 @@ def download_file(self, object_name: str, bucket_name: str, file_path: str): self.s3_client.download_file(bucket_name, object_name, file_path) def delete_file(self, bucket_name: str, s3_path: str): - self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) + return self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) def generatePresignedUrl(self, object: str, bucket_name: str, s3_path: str, expiration: int = 3600): # generate presigned URL diff --git a/ai_ta_backend/database/sql.py b/ai_ta_backend/database/sql.py index b8d4579c..a9819657 100644 --- a/ai_ta_backend/database/sql.py +++ b/ai_ta_backend/database/sql.py @@ -83,3 +83,6 @@ def getDocsForIdsGte(self, course_name: str, first_id: int, fields: str = "*", l def insertProjectInfo(self, project_info): return self.supabase_client.table("projects").insert(project_info).execute() + + def getAllFromLLMConvoMonitor(self, course_name: str): + return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() diff --git a/ai_ta_backend/database/vector.py b/ai_ta_backend/database/vector.py index 7ac67eda..d22fc6ca 100644 --- a/ai_ta_backend/database/vector.py +++ b/ai_ta_backend/database/vector.py @@ -20,8 +20,8 @@ def __init__(self): """ # vector DB self.qdrant_client = QdrantClient( - url=os.getenv('QDRANT_URL'), - api_key=os.getenv('QDRANT_API_KEY'), + url=os.environ['QDRANT_URL'], + api_key=os.environ['QDRANT_API_KEY'], ) self.vectorstore = Qdrant(client=self.qdrant_client, @@ -50,7 +50,7 @@ def delete_data(self, collection_name: str, key: str, value: str): """ Delete data from the vector database. 
""" - self.qdrant_client.delete( + return self.qdrant_client.delete( collection_name=collection_name, points_selector=models.Filter(must=[ models.FieldCondition( diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index f9d33a59..2e660de0 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -61,7 +61,7 @@ class NomicService(): @inject def __init__(self, sentry: SentryService, sql: SQLDatabase): - nomic.login(os.getenv('NOMIC_API_KEY')) + nomic.login(os.environ['NOMIC_API_KEY']) self.sentry = sentry self.sql = sql @@ -258,18 +258,15 @@ def create_nomic_map(self, course_name: str, log_data: list): 2. appends current embeddings and metadata to it 2. creates map if there are at least 20 queries """ - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + nomic.login(os.environ['NOMIC_API_KEY']) # login during start of flask app NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' print(f"in create_nomic_map() for {course_name}") - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore try: # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) - response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() + + response = self.sql.getAllFromLLMConvoMonitor(course_name) data = response.data df = pd.DataFrame(data) @@ -627,7 +624,7 @@ def create_map(self, embeddings, metadata, map_name, index_name, topic_label_fie topic_label_field: str colorable_fields: list of str """ - nomic.login(os.getenv('NOMIC_API_KEY')) + nomic.login(os.environ['NOMIC_API_KEY']) try: project = atlas.map_embeddings(embeddings=embeddings, @@ -652,7 +649,7 @@ def append_to_map(self, embeddings, metadata, map_name): metadata: pd.DataFrame of Nomic upload metadata map_name: str """ - nomic.login(os.getenv('NOMIC_API_KEY')) + nomic.login(os.environ['NOMIC_API_KEY']) try: project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) with project.wait_for_project_lock(): @@ -714,7 +711,7 @@ def data_prep_for_doc_map(self, df: pd.DataFrame): # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore embeddings_model = OpenAIEmbeddings(openai_api_type="openai", openai_api_base="https://api.openai.com/v1/", - openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore + openai_api_key=os.environ['VLADS_OPENAI_KEY']) embeddings = embeddings_model.embed_documents(texts) metadata = pd.DataFrame(metadata) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 91583323..3f13c311 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -34,22 +34,22 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, pos self.posthog = posthog self.nomicService = nomicService - openai.api_key = os.getenv("OPENAI_API_KEY") + openai.api_key = os.environ["OPENAI_API_KEY"] self.embeddings = OpenAIEmbeddings( model='text-embedding-ada-002', - openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore + openai_api_base=os.environ["AZURE_OPENAI_ENDPOINT"], openai_api_type=os.environ['OPENAI_API_TYPE'], - openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore - openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore + 
openai_api_key=os.environ["AZURE_OPENAI_KEY"], + openai_api_version=os.environ["OPENAI_API_VERSION"], ) self.llm = AzureChatOpenAI( temperature=0, - deployment_name=os.getenv("AZURE_OPENAI_ENGINE"), # type:ignore - openai_api_base=os.getenv("AZURE_OPENAI_ENDPOINT"), # type:ignore - openai_api_key=os.getenv("AZURE_OPENAI_KEY"), # type:ignore - openai_api_version=os.getenv("OPENAI_API_VERSION"), # type:ignore + deployment_name=os.environ["AZURE_OPENAI_ENGINE"], + openai_api_base=os.environ["AZURE_OPENAI_ENDPOINT"], + openai_api_key=os.environ["AZURE_OPENAI_KEY"], + openai_api_version=os.environ["OPENAI_API_VERSION"], openai_api_type=os.environ['OPENAI_API_TYPE'], ) @@ -150,7 +150,7 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): # add delete from doc map logic here try: # Delete file from S3 - bucket_name = os.getenv('S3_BUCKET_NAME') + bucket_name = os.environ['S3_BUCKET_NAME'] if bucket_name is None: raise ValueError("S3_BUCKET_NAME environment variable is not set") @@ -176,14 +176,18 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): def delete_from_s3(self, bucket_name: str, s3_path: str): try: - self.aws.delete_file(bucket_name, s3_path) + print("Deleting from S3") + response = self.aws.delete_file(bucket_name, s3_path) + print(f"AWS response: {response}") except Exception as e: print("Error in deleting file from s3:", e) self.sentry.capture_exception(e) def delete_from_qdrant(self, identifier_key: str, identifier_value: str): try: - self.vdb.delete_data(os.environ['QDRANT_COLLECTION_NAME'], identifier_key, identifier_value) + print("Deleting from Qdrant") + response = self.vdb.delete_data(os.environ['QDRANT_COLLECTION_NAME'], identifier_key, identifier_value) + print(f"Qdrant response: {response}") except Exception as e: if "timed out" in str(e): # Timed out is fine. Still deletes. 
@@ -311,7 +315,9 @@ def format_for_json_mqr(self, found_docs) -> List[Dict]: def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, identifier_value: str): try: + print(f"Deleting from Nomic and Supabase for {course_name} using {identifier_key}: {identifier_value}") response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + print(f"Trying to delete materials: {response}") data = response.data[0] # single record fetched nomic_ids_to_delete = [str(data['id']) + "_" + str(i) for i in range(1, len(data['contexts']) + 1)] @@ -325,7 +331,9 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) # delete from Supabase - self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + print(f"Deleting from Supabase for {course_name} using {identifier_key}: {identifier_value}") + response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + print(f"Deleted from sql: {response}") except Exception as e: print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) self.sentry.capture_exception(e) From 8a17e3d7acec3abe1d0fc18950b66ff88583964a Mon Sep 17 00:00:00 2001 From: rohanmarwaha Date: Fri, 8 Mar 2024 14:26:57 -0600 Subject: [PATCH 14/15] Fix deletion bug in retrieval_service to check for materials before deleting --- ai_ta_backend/service/retrieval_service.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 3f13c311..3d73a82b 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -317,23 +317,22 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, try: print(f"Deleting from Nomic and Supabase for {course_name} using {identifier_key}: {identifier_value}") response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) - print(f"Trying to delete materials: {response}") + if not response.data: + raise Exception(f"No materials found for {course_name} using {identifier_key}: {identifier_value}") data = response.data[0] # single record fetched nomic_ids_to_delete = [str(data['id']) + "_" + str(i) for i in range(1, len(data['contexts']) + 1)] # delete from Nomic # check if project exists response = self.sqlDb.getProjectsMapForCourse(course_name) - if response.data: - project_id = response.data[0]['doc_map_id'] - else: - return "No document map found for this course" + if not response.data: + raise Exception(f"No document map found for this course: {course_name}") + project_id = response.data[0]['doc_map_id'] self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) # delete from Supabase print(f"Deleting from Supabase for {course_name} using {identifier_key}: {identifier_value}") response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) - print(f"Deleted from sql: {response}") except Exception as e: print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) self.sentry.capture_exception(e) From d6976fe6e017d526b07b98366b4b269ed27e7e28 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 12 Mar 2024 10:50:24 -0600 Subject: [PATCH 15/15] Remove ffmpeg and 
tesseract-ocr, no ingest here anymore --- railway.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/railway.json b/railway.json index 56b39dac..d6d92535 100644 --- a/railway.json +++ b/railway.json @@ -10,11 +10,9 @@ "python -m venv --copies /opt/venv && . /opt/venv/bin/activate", "pip install pip==23.3.1", "pip install -r requirements.txt" - ], - "aptPkgs": ["ffmpeg", "tesseract-ocr"] + ] }, "setup": { - "aptPkgs": ["libcap-dev", "libgl1"], "nixPkgs": ["python310", "gcc"] } }
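The patches above inject services such as SQLDatabase, AWSStorage and SentryService into constructors, but the binder configuration itself lives in main.py and is not part of this excerpt. The sketch below is therefore only an assumption about how a typical python-injector wiring for such a service graph looks; the class bodies are simplified stand-ins rather than the repository's implementations.

from injector import Injector, Module, inject, singleton


class SQLDatabase:
  # stand-in for the Supabase-backed SQL layer
  pass


class SentryService:
  # stand-in for the sentry_sdk wrapper
  pass


class ExportService:

  @inject
  def __init__(self, sql: SQLDatabase, sentry: SentryService):
    # the type annotations are what the injector resolves against
    self.sql = sql
    self.sentry = sentry


class AppModule(Module):

  def configure(self, binder):
    # one shared instance per worker process, as you would scope a DB client
    binder.bind(SQLDatabase, to=SQLDatabase, scope=singleton)
    binder.bind(SentryService, to=SentryService, scope=singleton)


if __name__ == "__main__":
  injector = Injector([AppModule()])
  export_service = injector.get(ExportService)
  print(type(export_service.sql).__name__, type(export_service.sentry).__name__)

This is also why the one-character fix in PATCH 06 matters: with "sentry=SentryService" the parameter has a default value and no type annotation, so the injector never supplies an instance and self.sentry ends up holding the class object itself, whereas "sentry: SentryService" gives the injector a type it can resolve and bind.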