From e277c39423da0f79334a45361748f3e157cb64a4 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 13:08:14 -0500
Subject: [PATCH 1/4] fixed s3_path issues in cropwizard-1.5

---
 ai_ta_backend/service/export_service.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index 3ceaabb7..ce60c23c 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -223,7 +223,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # upload to S3
     #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
-    s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
+    s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
     s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
 
     # remove local files
@@ -234,6 +234,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # generate presigned URL
    s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
+    #print("s3_url: ", s3_url)
 
     # get admin email IDs
     headers = {
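The bug PATCH 1/4 fixes is a key mismatch: the export zip was uploaded under a key built from the local zip filename, while the presigned URL was generated for s3_path, so the emailed link pointed at a key that was never written. Below is a minimal sketch of the invariant, using boto3 directly since the series never shows the repo's AWSStorage wrapper; the upload_and_presign helper name and the direct boto3 calls are assumptions, not the repo's actual code:

```python
# Upload the zipped export and presign the *same* key. Deriving both names
# from s3_path (as the patch does) keeps upload and download in sync.
import os

import boto3


def upload_and_presign(zip_file_path: str, course_name: str, s3_path: str) -> str:
  s3 = boto3.client('s3')
  bucket = os.environ['S3_BUCKET_NAME']

  # Key derived from s3_path, not from the local zip filename.
  s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
  s3.upload_file(zip_file_path, bucket, s3_file)

  return s3.generate_presigned_url(
      'get_object',
      Params={'Bucket': bucket, 'Key': s3_file},
      ExpiresIn=3600,  # seconds, i.e. 1 hour; the email body promises 48 hours (172800 s)
  )
```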
From 148e6da8386496f7eb6d0a224f582f365969a9f8 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 16:43:17 -0500
Subject: [PATCH 2/4] changed json to jsonl

---
 ai_ta_backend/service/export_service.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index ce60c23c..dde6b206 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -59,7 +59,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
       print("last_id: ", last_id)
 
       curr_doc_count = 0
-      filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json'
+      filename = course_name + '_' + str(uuid.uuid4()) + '_documents.jsonl'
       file_path = os.path.join(os.getcwd(), filename)
 
       while curr_doc_count < total_doc_count:
@@ -71,7 +71,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
 
         # writing to file
         if not os.path.isfile(file_path):
-          df.to_json(file_path, orient='records')
+          df.to_json(file_path, orient='records', lines=True)
         else:
           df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -126,7 +126,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
     last_id = response.data[-1]['id']
     total_count = response.count
 
-    filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.json'
+    filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.jsonl'
     file_path = os.path.join(os.getcwd(), filename)
     curr_count = 0
     # Fetch data in batches of 25 from first_id to last_id
@@ -191,7 +191,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
   print("pre-defined s3_path: ", s3_path)
 
   curr_doc_count = 0
-  filename = s3_path.split('/')[-1].split('.')[0] + '.json'
+  filename = s3_path.split('/')[-1].split('.')[0] + '.jsonl'
   file_path = os.path.join(os.getcwd(), filename)
 
   # download data in batches of 100
@@ -203,7 +203,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # writing to file
     if not os.path.isfile(file_path):
-      df.to_json(file_path, orient='records')
+      df.to_json(file_path, orient='records', lines=True)
     else:
       df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -267,8 +267,8 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
-    print("email_status: ", email_status)
+    # email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
+    # print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
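PATCH 2/4 is subtler than a rename: pandas only accepts mode='a' in to_json when orient='records' and lines=True, and a file whose first batch was written as a JSON array cannot be parsed once JSON Lines batches are appended to it. Hence the first write gains lines=True too. A runnable sketch of the batched-append pattern, assuming pandas >= 2.0 (where to_json gained the mode parameter); the filename and toy batches are illustrative:

```python
# Batched export in the style of export_data_in_bg: the first batch creates
# the file, later batches append. Every write uses JSON Lines so the result
# is one well-formed record per line.
import pandas as pd

batch1 = pd.DataFrame([{'id': 1, 'text': 'alpha'}, {'id': 2, 'text': 'beta'}])
batch2 = pd.DataFrame([{'id': 3, 'text': 'gamma'}])

file_path = 'demo_documents.jsonl'  # illustrative name
batch1.to_json(file_path, orient='records', lines=True)            # creates the file
batch2.to_json(file_path, orient='records', lines=True, mode='a')  # appends cleanly

# JSON Lines round-trips with read_json(lines=True):
print(pd.read_json(file_path, lines=True))  # three rows
```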
From f57f65e0387cc43ef537802d6555f726bea1f1d3 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Tue, 19 Mar 2024 16:44:46 -0500
Subject: [PATCH 3/4] uncommented send_email()

---
 ai_ta_backend/service/export_service.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index dde6b206..3e4925af 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -267,8 +267,8 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # send email to admins
     subject = "UIUC.chat Data Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-    # email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
-    # print("email_status: ", email_status)
+    email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
+    print("email_status: ", email_status)
 
     return "File uploaded to S3. Email sent to admins."
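PATCH 3/4 re-enables the send_email() helper that PATCH 2/4 had commented out. Its implementation is not part of this series; the sketch below is one plausible shape matching the call site's (subject, body, sender, recipients, bcc) arguments, built on stdlib smtplib with assumed SMTP_* environment variables. It is an illustration of the interface, not the repo's actual helper:

```python
# A plausible send_email() matching the call
# send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails).
# SMTP_HOST / SMTP_PORT / SMTP_USERNAME / SMTP_PASSWORD are assumptions.
import os
import smtplib
from email.message import EmailMessage


def send_email(subject: str, body_text: str, sender: str,
               recipients: list[str], bcc_recipients: list[str]) -> str:
  msg = EmailMessage()
  msg['Subject'] = subject
  msg['From'] = sender
  msg['To'] = ', '.join(recipients)
  if bcc_recipients:
    msg['Bcc'] = ', '.join(bcc_recipients)
  msg.set_content(body_text)

  with smtplib.SMTP_SSL(os.environ['SMTP_HOST'],
                        int(os.environ.get('SMTP_PORT', '465'))) as server:
    server.login(os.environ['SMTP_USERNAME'], os.environ['SMTP_PASSWORD'])
    server.send_message(msg)
  return "Email sent successfully!"
```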
From 4e89610d05b560888238f4087ebd4609cd1d7b9c Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Tue, 19 Mar 2024 16:43:50 -0700
Subject: [PATCH 4/4] HOTFIX: Improve subject line for data exports, clean up logs

---
 ai_ta_backend/main.py                     |  4 ----
 ai_ta_backend/service/export_service.py   | 24 ++++++++++++------------
 ai_ta_backend/utils/filtering_contexts.py |  5 -----
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
index ef311b6c..77bfeea5 100644
--- a/ai_ta_backend/main.py
+++ b/ai_ta_backend/main.py
@@ -1,5 +1,4 @@
 import os
-import threading
 import time
 
 from typing import List
@@ -112,10 +111,7 @@ def getTopContexts(service: RetrievalService) -> Response:
         f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`"
     )
 
-  print("NUM ACTIVE THREADS (top of getTopContexts):", threading.active_count())
-
   found_documents = service.getTopContexts(search_query, course_name, token_limit)
-  print("NUM ACTIVE THREADS (after instantiating Ingest() class in getTopContexts):", threading.active_count())
 
   response = jsonify(found_documents)
   response.headers.add('Access-Control-Allow-Origin', '*')
diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
index 3e4925af..6e2ef021 100644
--- a/ai_ta_backend/service/export_service.py
+++ b/ai_ta_backend/service/export_service.py
@@ -43,8 +43,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
       # background task of downloading data - map it with above ID
       executor = ProcessPoolExecutor()
       executor.submit(export_data_in_bg, response, "documents", course_name, s3_filepath)
-      return {"response": 'Download from S3',
-              "s3_path": s3_filepath}
+      return {"response": 'Download from S3', "s3_path": s3_filepath}
 
     else:
       # Fetch data
@@ -96,8 +95,6 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
     else:
       return {"response": "No data found between the given dates."}
 
-
-
   def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
     """
     This function exports the conversation history to a csv file.
@@ -169,6 +166,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
 
 # Encountered pickling error while running the background task. So, moved the function outside the class.
+
 def export_data_in_bg(response, download_type, course_name, s3_path):
   """
   This function is called in export_documents_csv() to upload the documents to S3.
@@ -184,7 +182,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
   """
   s3 = AWSStorage()
   sql = SQLDatabase()
-
+
   total_doc_count = response.count
   first_id = response.data[0]['id']
   print("total_doc_count: ", total_doc_count)
@@ -203,7 +201,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # writing to file
     if not os.path.isfile(file_path):
-      df.to_json(file_path, orient='records', lines=True) 
+      df.to_json(file_path, orient='records', lines=True)
     else:
       df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -237,10 +235,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
     #print("s3_url: ", s3_url)
 
     # get admin email IDs
-    headers = {
-        "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
-        "Content-Type": "application/json"
-    }
+    headers = {"Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}", "Content-Type": "application/json"}
 
     hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
     response = requests.get(hget_url, headers=headers)
@@ -265,7 +260,12 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
       return "No admin emails found. Email not sent."
 
     # send email to admins
-    subject = "UIUC.chat Data Export Complete for " + course_name
+    if download_type == "documents":
+      subject = "UIUC.chat Documents Export Complete for " + course_name
+    elif download_type == "conversations":
+      subject = "UIUC.chat Conversation History Export Complete for " + course_name
+    else:
+      subject = "UIUC.chat Export Complete for " + course_name
     body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
     email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
     print("email_status: ", email_status)
@@ -274,4 +274,4 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
   except Exception as e:
     print(e)
-    return "Error: " + str(e)
\ No newline at end of file
+    return "Error: " + str(e)
diff --git a/ai_ta_backend/utils/filtering_contexts.py b/ai_ta_backend/utils/filtering_contexts.py
index 476df3d0..03deede0 100644
--- a/ai_ta_backend/utils/filtering_contexts.py
+++ b/ai_ta_backend/utils/filtering_contexts.py
@@ -137,8 +137,6 @@
 # langsmith_prompt_obj = filter_unrelated_contexts_zephyr
 # posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com')
 
-# print("NUM ACTIVE THREADS (top of filtering_contexts):", threading.active_count())
-
 # max_concurrency = min(100, len(contexts))
 # print("max_concurrency is max of 100, or len(contexts), whichever is less ---- Max concurrency:", max_concurrency)
 # print("Num contexts to filter:", len(contexts))
@@ -153,14 +151,11 @@
 # timeout=timeout,
 # fetch_local=False)
 
-# print("NUM ACTIVE THREADS (before cleanup filtering_contexts):", threading.active_count())
 # # Cleanup
 # for task in in_progress:
 # ray.cancel(task)
 # results = ray.get(done_tasks)
-# print("NUM ACTIVE THREADS (before kill filtering_contexts):", threading.active_count())
 # ray.kill(actor)
-# print("NUM ACTIVE THREADS (after kill filtering_contexts):", threading.active_count())
 
 # best_contexts_to_keep = [
 # r['context'] for r in results if r and 'context' in r and 'completion' in r and parse_result(r['completion'])
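PATCH 4/4 preserves the comment explaining why export_data_in_bg sits at module level: "Encountered pickling error while running the background task." ProcessPoolExecutor must pickle whatever it submits in order to ship it to a worker process, and pickling a bound method pickles its instance too, which fails as soon as the instance holds unpicklable resources such as open connections. A self-contained illustration; the ExportService class and its file-handle attribute here are invented for the demo, not the repo's real service:

```python
# Module-level functions pickle by reference to their module path, so they
# can be submitted to a ProcessPoolExecutor. Bound methods drag their whole
# instance into the pickle and fail if any attribute is unpicklable.
import pickle
from concurrent.futures import ProcessPoolExecutor


def module_level_task(x):  # picklable: referenced by module path
  return x * 2


class ExportService:

  def __init__(self):
    self._conn = open(__file__)  # stand-in for a DB/S3 client; not picklable

  def method_task(self, x):
    return x * 2


if __name__ == '__main__':
  svc = ExportService()
  try:
    pickle.dumps(svc.method_task)  # pickling the bound method pickles svc too
  except TypeError as e:
    print("bound method not picklable:", e)

  with ProcessPoolExecutor() as executor:
    print(executor.submit(module_level_task, 21).result())  # 42
```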