Skip to content

Commit

Permalink
Merge pull request #235 from UIUC-Chatbot/s3-download
Browse files Browse the repository at this point in the history
Fix for S3 path issues in data export
  • Loading branch information
star-nox authored Mar 19, 2024
2 parents f4cd916 + f57f65e commit c9e77a5
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions ai_ta_backend/service/export_service.py
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
print("last_id: ", last_id)

curr_doc_count = 0
-filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json'
+filename = course_name + '_' + str(uuid.uuid4()) + '_documents.jsonl'
file_path = os.path.join(os.getcwd(), filename)

while curr_doc_count < total_doc_count:
@@ -71,7 +71,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):

# writing to file
if not os.path.isfile(file_path):
-df.to_json(file_path, orient='records')
+df.to_json(file_path, orient='records', lines=True)
else:
df.to_json(file_path, orient='records', lines=True, mode='a')

@@ -126,7 +126,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
last_id = response.data[-1]['id']
total_count = response.count

-filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.json'
+filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.jsonl'
file_path = os.path.join(os.getcwd(), filename)
curr_count = 0
# Fetch data in batches of 25 from first_id to last_id
@@ -191,7 +191,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
print("pre-defined s3_path: ", s3_path)

curr_doc_count = 0
-filename = s3_path.split('/')[-1].split('.')[0] + '.json'
+filename = s3_path.split('/')[-1].split('.')[0] + '.jsonl'
file_path = os.path.join(os.getcwd(), filename)

# download data in batches of 100
@@ -203,7 +203,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):

# writing to file
if not os.path.isfile(file_path):
-df.to_json(file_path, orient='records')
+df.to_json(file_path, orient='records', lines=True)
else:
df.to_json(file_path, orient='records', lines=True, mode='a')

@@ -223,7 +223,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):
# upload to S3

#s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
-s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
+s3_file = f"courses/{course_name}/{os.path.basename(s3_path)}"
s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)

# remove local files
@@ -234,6 +234,7 @@ def export_data_in_bg(response, download_type, course_name, s3_path):

# generate presigned URL
s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
+#print("s3_url: ", s3_url)

# get admin email IDs
headers = {
Expand Down

0 comments on commit c9e77a5

Please sign in to comment.