diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py index 3ceaabb7..3e4925af 100644 --- a/ai_ta_backend/service/export_service.py +++ b/ai_ta_backend/service/export_service.py @@ -59,7 +59,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''): print("last_id: ", last_id) curr_doc_count = 0 - filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json' + filename = course_name + '_' + str(uuid.uuid4()) + '_documents.jsonl' file_path = os.path.join(os.getcwd(), filename) while curr_doc_count < total_doc_count: @@ -71,7 +71,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''): # writing to file if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') + df.to_json(file_path, orient='records', lines=True) else: df.to_json(file_path, orient='records', lines=True, mode='a') @@ -126,7 +126,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''): last_id = response.data[-1]['id'] total_count = response.count - filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.json' + filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.jsonl' file_path = os.path.join(os.getcwd(), filename) curr_count = 0 # Fetch data in batches of 25 from first_id to last_id @@ -191,7 +191,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path): print("pre-defined s3_path: ", s3_path) curr_doc_count = 0 - filename = s3_path.split('/')[-1].split('.')[0] + '.json' + filename = s3_path.split('/')[-1].split('.')[0] + '.jsonl' file_path = os.path.join(os.getcwd(), filename) # download data in batches of 100 @@ -203,7 +203,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path): # writing to file if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') + df.to_json(file_path, orient='records', lines=True) else: df.to_json(file_path, orient='records', lines=True, mode='a') @@ -223,7 +223,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path): # upload to S3 #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}" - s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}" + s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}" s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file) # remove local files @@ -234,6 +234,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path): # generate presigned URL s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600) + #print("s3_url: ", s3_url) # get admin email IDs headers = {