Merge pull request #235 from UIUC-Chatbot/s3-download

Fix S3 path issues in data export: write exports as JSON Lines (.jsonl) and upload the zip under the basename of the pre-defined s3_path
UIUC-Chatbot · Mar 19, 2024 · c9e77a5 · c9e77a5
2 parents f4cd916 + f57f65e
commit c9e77a5
Showing 1 changed file with 7 additions and 6 deletions.
diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py
@@ -59,7 +59,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
         print("last_id: ", last_id)
 
         curr_doc_count = 0
-        filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json'
+        filename = course_name + '_' + str(uuid.uuid4()) + '_documents.jsonl'
         file_path = os.path.join(os.getcwd(), filename)
 
         while curr_doc_count < total_doc_count:
@@ -71,7 +71,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
 
           # writing to file
           if not os.path.isfile(file_path):
-            df.to_json(file_path, orient='records')
+            df.to_json(file_path, orient='records', lines=True)
           else:
             df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -126,7 +126,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
       last_id = response.data[-1]['id']
       total_count = response.count
 
-      filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.json'
+      filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.jsonl'
       file_path = os.path.join(os.getcwd(), filename)
       curr_count = 0
       # Fetch data in batches of 25 from first_id to last_id
@@ -191,7 +191,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
   print("pre-defined s3_path: ", s3_path)
 
   curr_doc_count = 0
-  filename = s3_path.split('/')[-1].split('.')[0] + '.json'
+  filename = s3_path.split('/')[-1].split('.')[0] + '.jsonl'
   file_path = os.path.join(os.getcwd(), filename)
 
   # download data in batches of 100
@@ -203,7 +203,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # writing to file
     if not os.path.isfile(file_path):
-       df.to_json(file_path, orient='records')
+       df.to_json(file_path, orient='records', lines=True)
     else:
       df.to_json(file_path, orient='records', lines=True, mode='a')
 
@@ -223,7 +223,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
     # upload to S3
 
     #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
-    s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
+    s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
     s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
 
     # remove local files
@@ -234,6 +234,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
 
     # generate presigned URL
     s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
+    #print("s3_url: ", s3_url)
 
     # get admin email IDs
     headers = {