From e277c39423da0f79334a45361748f3e157cb64a4 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 13:08:14 -0500
Subject: [PATCH 1/4] fixed s3_path issues in cropwizard-1.5

---
 ai_ta_backend/service/export_service.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index 3ceaabb7..ce60c23c 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -223,7 +223,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # upload to S3
     #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
-    s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
+    s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
     s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
 
     # remove local files
@@ -234,6 +234,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # generate presigned URL
    s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
+    #print("s3_url: ", s3_url)
 
     # get admin email IDs
     headers = {
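The bug PATCH 1/4 fixes is a key mismatch: the export zip was uploaded under a key built from the local zip filename, while the presigned URL was generated for s3_path, so the emailed link pointed at a key that was never written. Below is a minimal sketch of the invariant, using boto3 directly since the series never shows the repo's AWSStorage wrapper; the upload_and_presign helper name and the direct boto3 calls are assumptions, not the repo's actual code:

```python
# Upload the zipped export and presign the *same* key. Deriving both names
# from s3_path (as the patch does) keeps upload and download in sync.
import os

import boto3


def upload_and_presign(zip_file_path: str, course_name: str, s3_path: str) -> str:
  s3 = boto3.client('s3')
  bucket = os.environ['S3_BUCKET_NAME']

  # Key derived from s3_path, not from the local zip filename.
  s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
  s3.upload_file(zip_file_path, bucket, s3_file)

  return s3.generate_presigned_url(
      'get_object',
      Params={'Bucket': bucket, 'Key': s3_file},
      ExpiresIn=3600,  # seconds, i.e. 1 hour; the email body promises 48 hours (172800 s)
  )
```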
From 148e6da8386496f7eb6d0a224f582f365969a9f8 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 16:43:17 -0500
Subject: [PATCH 2/4] changed json to jsonl

---
 ai_ta_backend/service/export_service.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index ce60c23c..dde6b206 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -59,7 +59,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
       print("last_id: ", last_id)
 
       curr_doc_count = 0
-      filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json'
+      filename = course_name + '_' + str(uuid.uuid4()) + '_documents.jsonl'
       file_path = os.path.join(os.getcwd(), filename)
 
       while curr_doc_count < total_doc_count:
@@ -71,7 +71,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
 
         # writing to file
         if not os.path.isfile(file_path):
-          df.to_json(file_path, orient='records')
+          df.to_json(file_path, orient='records', lines=True)
         else:
           df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -126,7 +126,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
     last_id = response.data[-1]['id']
     total_count = response.count
 
-    filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.json'
+    filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.jsonl'
     file_path = os.path.join(os.getcwd(), filename)
     curr_count = 0
     # Fetch data in batches of 25 from first_id to last_id
@@ -191,7 +191,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
   print("pre-defined s3_path: ", s3_path)
 
   curr_doc_count = 0
-  filename = s3_path.split('/')[-1].split('.')[0] + '.json'
+  filename = s3_path.split('/')[-1].split('.')[0] + '.jsonl'
   file_path = os.path.join(os.getcwd(), filename)
 
   # download data in batches of 100
@@ -203,7 +203,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # writing to file
     if not os.path.isfile(file_path):
-      df.to_json(file_path, orient='records')
+      df.to_json(file_path, orient='records', lines=True)
     else:
       df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -267,8 +267,8 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
-    print("email_status: ", email_status)
+    # email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
+    # print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
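PATCH 2/4 is subtler than a rename: pandas only accepts mode='a' in to_json when orient='records' and lines=True, and a file whose first batch was written as a JSON array cannot be parsed once JSON Lines batches are appended to it. Hence the first write gains lines=True too. A runnable sketch of the batched-append pattern, assuming pandas >= 2.0 (where to_json gained the mode parameter); the filename and toy batches are illustrative:

```python
# Batched export in the style of export_data_in_bg: the first batch creates
# the file, later batches append. Every write uses JSON Lines so the result
# is one well-formed record per line.
import pandas as pd

batch1 = pd.DataFrame([{'id': 1, 'text': 'alpha'}, {'id': 2, 'text': 'beta'}])
batch2 = pd.DataFrame([{'id': 3, 'text': 'gamma'}])

file_path = 'demo_documents.jsonl'  # illustrative name
batch1.to_json(file_path, orient='records', lines=True)            # creates the file
batch2.to_json(file_path, orient='records', lines=True, mode='a')  # appends cleanly

# JSON Lines round-trips with read_json(lines=True):
print(pd.read_json(file_path, lines=True))  # three rows
```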
From f57f65e0387cc43ef537802d6555f726bea1f1d3 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 16:44:46 -0500
Subject: [PATCH 3/4] uncommented send_email()

---
 ai_ta_backend/service/export_service.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index dde6b206..3e4925af 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -267,8 +267,8 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    # email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
-    # print("email_status: ", email_status)
+    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
+    print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
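PATCH 3/4 re-enables the send_email() helper that PATCH 2/4 had commented out. Its implementation is not part of this series; the sketch below is one plausible shape matching the call site's (subject, body, sender, recipients, bcc) arguments, built on stdlib smtplib with assumed SMTP_* environment variables. It is an illustration of the interface, not the repo's actual helper:

```python
# A plausible send_email() matching the call
# send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails).
# SMTP_HOST / SMTP_PORT / SMTP_USERNAME / SMTP_PASSWORD are assumptions.
import os
import smtplib
from email.message import EmailMessage


def send_email(subject: str, body_text: str, sender: str,
               recipients: list[str], bcc_recipients: list[str]) -> str:
  msg = EmailMessage()
  msg['Subject'] = subject
  msg['From'] = sender
  msg['To'] = ', '.join(recipients)
  if bcc_recipients:
    msg['Bcc'] = ', '.join(bcc_recipients)
  msg.set_content(body_text)

  with smtplib.SMTP_SSL(os.environ['SMTP_HOST'],
                        int(os.environ.get('SMTP_PORT', '465'))) as server:
    server.login(os.environ['SMTP_USERNAME'], os.environ['SMTP_PASSWORD'])
    server.send_message(msg)
  return "Email sent successfully!"
```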
From 4e89610d05b560888238f4087ebd4609cd1d7b9c Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Tue, 19 Mar 2024 16:43:50 -0700
Subject: [PATCH 4/4] HOTFIX: Improve subject line for data exports, clean up logs

---
 ai_ta_backend/main.py                     |  4 ----
 ai_ta_backend/service/export_service.py   | 24 ++++++++++++------------
 ai_ta_backend/utils/filtering_contexts.py |  5 -----
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index ef311b6c..77bfeea5 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -1,5 +1,4 @@
 import os
-import threading
 import time
 
 from typing import List
@@ -112,10 +111,7 @@ def getTopContexts(service: RetrievalService) -> Response:
         f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`"
     )
 
-  print("NUM ACTIVE THREADS (top of getTopContexts):", threading.active_count())
-
   found_documents = service.getTopContexts(search_query, course_name, token_limit)
-  print("NUM ACTIVE THREADS (after instantiating Ingest() class in getTopContexts):", threading.active_count())
 
   response = jsonify(found_documents)
   response.headers.add('Access-Control-Allow-Origin', '*')
diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index 3e4925af..6e2ef021 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -43,8 +43,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
       # background task of downloading data - map it with above ID
       executor = ProcessPoolExecutor()
       executor.submit(export_data_in_bg, response, "documents", course_name, s3_filepath)
-      return {"response": 'Download from S3',
-              "s3_path": s3_filepath}
+      return {"response": 'Download from S3', "s3_path": s3_filepath}
 
     else:
       # Fetch data
@@ -96,8 +95,6 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
     else:
       return {"response": "No data found between the given dates."}
 
-
-
   def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
     """
     This function exports the conversation history to a csv file.
@@ -169,6 +166,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
 
 # Encountered pickling error while running the background task. So, moved the function outside the class.
+
 def export_data_in_bg(response, download_type, course_name, s3_path):
   """
   This function is called in export_documents_csv() to upload the documents to S3.
@@ -184,7 +182,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
   """
   s3 = AWSStorage()
   sql = SQLDatabase()
-
+
   total_doc_count = response.count
   first_id = response.data[0]['id']
   print("total_doc_count: ", total_doc_count)
@@ -203,7 +201,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # writing to file
     if not os.path.isfile(file_path):
-      df.to_json(file_path, orient='records', lines=True) 
+      df.to_json(file_path, orient='records', lines=True)
     else:
       df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -237,10 +235,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
     #print("s3_url: ", s3_url)
 
     # get admin email IDs
-    headers = {
-        "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
-        "Content-Type": "application/json"
-    }
+    headers = {"Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}", "Content-Type": "application/json"}
 
     hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
     response = requests.get(hget_url, headers=headers)
@@ -265,7 +260,12 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
       return "No admin emails found. Email not sent."
 
     # send email to admins
-    subject = "UIUC.chat Data Export Complete for " + course_name
+    if download_type == "documents":
+      subject = "UIUC.chat Documents Export Complete for " + course_name
+    elif download_type == "conversations":
+      subject = "UIUC.chat Conversation History Export Complete for " + course_name
+    else:
+      subject = "UIUC.chat Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
     email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
     print("email_status: ", email_status)
@@ -274,4 +274,4 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
   except Exception as e:
     print(e)
-    return "Error: " + str(e)
\ No newline at end of file
+    return "Error: " + str(e)
diff --git a/ai_ta_backend/utils/filtering_contexts.py b/ai_ta_backend/utils/filtering_contexts.py
index 476df3d0..03deede0 100644
--- a/ai_ta_backend/utils/filtering_contexts.py
+++ b/ai_ta_backend/utils/filtering_contexts.py
@@ -137,8 +137,6 @@
 # langsmith_prompt_obj = filter_unrelated_contexts_zephyr
 # posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com')
 
-# print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count())
-
 # max_concurrency = min(100, len(contexts))
 # print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency)
 # print("Num contexts to filter:", len(contexts))
@@ -153,14 +151,11 @@
 # timeout=timeout,
 # fetch_local=False)
 
-# print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count())
 # # Cleanup
 # for task in in_progress:
 # ray.cancel(task)
 # results = ray.get(done_tasks)
-# print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count())
 # ray.kill(actor)
-# print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count())
 
 # best_contexts_to_keep = [
 # r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion'])
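PATCH 4/4 preserves the comment explaining why export_data_in_bg sits at module level: "Encountered pickling error while running the background task." ProcessPoolExecutor must pickle whatever it submits in order to ship it to a worker process, and pickling a bound method pickles its instance too, which fails as soon as the instance holds unpicklable resources such as open connections. A self-contained illustration; the ExportService class and its file-handle attribute here are invented for the demo, not the repo's real service:

```python
# Module-level functions pickle by reference to their module path, so they
# can be submitted to a ProcessPoolExecutor. Bound methods drag their whole
# instance into the pickle and fail if any attribute is unpicklable.
import pickle
from concurrent.futures import ProcessPoolExecutor


def module_level_task(x):  # picklable: referenced by module path
  return x * 2


class ExportService:

  def __init__(self):
    self._conn = open(__file__)  # stand-in for a DB/S3 client; not picklable

  def method_task(self, x):
    return x * 2


if __name__ == '__main__':
  svc = ExportService()
  try:
    pickle.dumps(svc.method_task)  # pickling the bound method pickles svc too
  except TypeError as e:
    print("bound method not picklable:", e)

  with ProcessPoolExecutor() as executor:
    print(executor.submit(module_level_task, 21).result())  # 42
```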