Commit: moved async download outside the class
Showing 2 changed files with 113 additions and 105 deletions.
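The key change: the background export task is now submitted to the `ProcessPoolExecutor` as a module-level function rather than a bound method.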
```diff
@@ -42,7 +42,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
         s3_filepath = f"courses/{course_name}/{filename}"
         # background task of downloading data - map it with above ID
         executor = ProcessPoolExecutor()
-        executor.submit(self.export_data_in_bg, response, "documents", course_name, s3_filepath)
+        executor.submit(export_data_in_bg, response, "documents", course_name, s3_filepath)
         return {"response": 'Download from S3',
                 "s3_path": s3_filepath}
```
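This one-line change is the heart of the commit. `ProcessPoolExecutor.submit` pickles the callable and its arguments to ship them to a worker process; a bound method like `self.export_data_in_bg` drags the entire instance with it, and an instance holding unpicklable state (database connections, boto3 clients, locks) raises a pickling error. A module-level function is pickled by name alone. A minimal sketch of the failure mode, not taken from this repository:

```python
from concurrent.futures import ProcessPoolExecutor
import threading

class Exporter:

    def __init__(self):
        # stand-in for unpicklable state such as a DB connection or boto3 client
        self.lock = threading.Lock()

    def export_in_bg(self, payload):
        return payload.upper()

def export_in_bg(payload):
    # module-level: pickled by qualified name, no instance attached
    return payload.upper()

if __name__ == '__main__':
    with ProcessPoolExecutor() as executor:
        # Submitting the bound method fails once the task is pickled for the worker:
        # executor.submit(Exporter().export_in_bg, "docs").result()
        # -> TypeError: cannot pickle '_thread.lock' object
        future = executor.submit(export_in_bg, "docs")
        print(future.result())  # DOCS
```

The rest of the diff deletes the method from the class and applies the same substitution to the conversations export: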
```diff
@@ -96,108 +96,7 @@ def export_documents_json(self, course_name: str, from_date='', to_date=''):
         else:
             return {"response": "No data found between the given dates."}
 
-    def export_data_in_bg(self, response, download_type, course_name, s3_path):
-        """
-        This function is called in export_documents_csv() to upload the documents to S3.
-        1. download the documents in batches of 100 and upload them to S3.
-        2. generate a pre-signed URL for the S3 file.
-        3. send an email to the course admins with the pre-signed URL.
-        Args:
-            response (dict): The response from the Supabase query.
-            download_type (str): The type of download - 'documents' or 'conversations'.
-            course_name (str): The name of the course.
-            s3_path (str): The S3 path where the file will be uploaded.
-        """
-        total_doc_count = response.count
-        first_id = response.data[0]['id']
-        print("total_doc_count: ", total_doc_count)
-        print("pre-defined s3_path: ", s3_path)
-
-        curr_doc_count = 0
-        filename = s3_path.split('/')[-1].split('.')[0] + '.json'
-        file_path = os.path.join(os.getcwd(), filename)
-
-        # download data in batches of 100
-        while curr_doc_count < total_doc_count:
-            print("Fetching data from id: ", first_id)
-            response = self.sql.getAllFromTableForDownloadType(course_name, download_type, first_id)
-            df = pd.DataFrame(response.data)
-            curr_doc_count += len(response.data)
-
-            # writing to file
-            if not os.path.isfile(file_path):
-                df.to_json(file_path, orient='records')
-            else:
-                df.to_json(file_path, orient='records', lines=True, mode='a')
-
-            if len(response.data) > 0:
-                first_id = response.data[-1]['id'] + 1
-
-        # zip file
-        zip_filename = filename.split('.')[0] + '.zip'
-        zip_file_path = os.path.join(os.getcwd(), zip_filename)
-
-        with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
-            zipf.write(file_path, filename)
-
-        print("zip file created: ", zip_file_path)
-
-        try:
-            # upload to S3
-            #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
-            s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
-            self.s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
-
-            # remove local files
-            os.remove(file_path)
-            os.remove(zip_file_path)
-
-            print("file uploaded to s3: ", s3_file)
-
-            # generate presigned URL
-            s3_url = self.s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
-
-            # get admin email IDs
-            headers = {
-                "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
-                "Content-Type": "application/json"
-            }
-
-            hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
-            response = requests.get(hget_url, headers=headers)
-            course_metadata = response.json()
-            course_metadata = json.loads(course_metadata['result'])
-            admin_emails = course_metadata['course_admins']
-            bcc_emails = []
-
-            # check for Kastan's email and move to bcc
-            if '[email protected]' in admin_emails:
-                admin_emails.remove('[email protected]')
-                bcc_emails.append('[email protected]')
-
-            # add course owner email to admin_emails
-            admin_emails.append(course_metadata['course_owner'])
-            admin_emails = list(set(admin_emails))
-            print("admin_emails: ", admin_emails)
-            print("bcc_emails: ", bcc_emails)
-
-            # add a check for emails, don't send email if no admin emails
-            if len(admin_emails) == 0:
-                return "No admin emails found. Email not sent."
-
-            # send email to admins
-            subject = "UIUC.chat Data Export Complete for " + course_name
-            body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
-            email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
-            print("email_status: ", email_status)
-
-            return "File uploaded to S3. Email sent to admins."
-
-        except Exception as e:
-            print(e)
-            return "Error: " + str(e)
-
-
     def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
         """
@@ -217,7 +116,7 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
         s3_filepath = f"courses/{course_name}/{filename}"
         # background task of downloading data - map it with above ID
         executor = ProcessPoolExecutor()
-        executor.submit(self.export_data_in_bg, response, "conversations", course_name, s3_filepath)
+        executor.submit(export_data_in_bg, response, "conversations", course_name, s3_filepath)
         return {"response": 'Download from S3', "s3_path": s3_filepath}
 
         # Fetch data
```
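The deleted method then reappears at the end of the file as a module-level function. It drops the `self` parameter and constructs its own `AWSStorage` and `SQLDatabase` clients in place of the former `self.s3` and `self.sql`: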
```diff
@@ -266,3 +165,112 @@ def export_convo_history_json(self, course_name: str, from_date='', to_date=''):
             return {"response": "Error downloading file!"}
         else:
             return {"response": "No data found between the given dates."}
+
+
+# Encountered a pickling error while running the background task, so the function
+# was moved outside the class.
+def export_data_in_bg(response, download_type, course_name, s3_path):
+    """
+    Called from export_documents_json() and export_convo_history_json() to upload
+    the exported data to S3.
+    1. Download the records in batches of 100 and write them to a local file.
+    2. Zip the file, upload it to S3, and generate a pre-signed URL for it.
+    3. Send an email with the pre-signed URL to the course admins.
+    Args:
+        response (dict): The response from the Supabase query.
+        download_type (str): The type of download - 'documents' or 'conversations'.
+        course_name (str): The name of the course.
+        s3_path (str): The S3 path where the file will be uploaded.
+    """
+    s3 = AWSStorage()
+    sql = SQLDatabase()
+
+    total_doc_count = response.count
+    first_id = response.data[0]['id']
+    print("total_doc_count: ", total_doc_count)
+    print("pre-defined s3_path: ", s3_path)
+
+    curr_doc_count = 0
+    filename = s3_path.split('/')[-1].split('.')[0] + '.json'
+    file_path = os.path.join(os.getcwd(), filename)
+
+    # download data in batches of 100
+    while curr_doc_count < total_doc_count:
+        print("Fetching data from id: ", first_id)
+        response = sql.getAllFromTableForDownloadType(course_name, download_type, first_id)
+        df = pd.DataFrame(response.data)
+        curr_doc_count += len(response.data)
+
+        # writing to file
+        if not os.path.isfile(file_path):
+            df.to_json(file_path, orient='records')
+        else:
+            df.to_json(file_path, orient='records', lines=True, mode='a')
+
+        if len(response.data) > 0:
+            first_id = response.data[-1]['id'] + 1
```
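One quirk in the loop above: the first batch is written as a single JSON array (`orient='records'`), while every later batch is appended as JSON Lines (`lines=True, mode='a'`), so the file ends up mixing two formats. A sketch of a consistent alternative, not part of this commit (pandas only accepts `mode='a'` together with `lines=True`):

```python
import pandas as pd

def append_batch(df: pd.DataFrame, file_path: str) -> None:
    # One JSON object per line for every batch, so the whole file stays valid
    # JSON Lines and can be re-read with pd.read_json(file_path, lines=True).
    df.to_json(file_path, orient='records', lines=True, mode='a')
```

The diff continues with the zip-and-upload step: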
```diff
+    # zip file
+    zip_filename = filename.split('.')[0] + '.zip'
+    zip_file_path = os.path.join(os.getcwd(), zip_filename)
+
+    with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
+        zipf.write(file_path, filename)
+
+    print("zip file created: ", zip_file_path)
+
+    try:
+        # upload to S3
+        #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}"
+        s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}"
+        s3.upload_file(zip_file_path, os.environ['S3_BUCKET_NAME'], s3_file)
+
+        # remove local files
+        os.remove(file_path)
+        os.remove(zip_file_path)
+
+        print("file uploaded to s3: ", s3_file)
+
+        # generate presigned URL
+        s3_url = s3.generatePresignedUrl('get_object', os.environ['S3_BUCKET_NAME'], s3_path, 3600)
```
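`AWSStorage.generatePresignedUrl` is not shown in this diff. Assuming it wraps boto3's `generate_presigned_url`, it would look roughly like the sketch below; the parameter order is inferred from the call above. Note that 3600 seconds is one hour, and that the URL is generated for `s3_path` (the `.json` key) rather than the `.zip` key that was just uploaded.

```python
import boto3

def generate_presigned_url(client_method: str, bucket: str, key: str, expiration: int) -> str:
    # Signs a time-limited URL; the recipient needs no AWS credentials to use it.
    s3_client = boto3.client('s3')
    return s3_client.generate_presigned_url(
        ClientMethod=client_method,            # 'get_object' for downloads
        Params={'Bucket': bucket, 'Key': key},
        ExpiresIn=expiration,                  # seconds; 3600 = 1 hour
    )
```

Next, the function collects the admin email addresses: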
```diff
+        # get admin email IDs
+        headers = {
+            "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
+            "Content-Type": "application/json"
+        }
+
+        hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
+        response = requests.get(hget_url, headers=headers)
+        course_metadata = response.json()
+        course_metadata = json.loads(course_metadata['result'])
+        admin_emails = course_metadata['course_admins']
+        bcc_emails = []
+
+        # check for Kastan's email and move to bcc
+        if '[email protected]' in admin_emails:
+            admin_emails.remove('[email protected]')
+            bcc_emails.append('[email protected]')
+
+        # add course owner email to admin_emails
+        admin_emails.append(course_metadata['course_owner'])
+        admin_emails = list(set(admin_emails))
+        print("admin_emails: ", admin_emails)
+        print("bcc_emails: ", bcc_emails)
+
+        # add a check for emails, don't send email if no admin emails
+        if len(admin_emails) == 0:
+            return "No admin emails found. Email not sent."
```
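The double decode above (`response.json()` followed by `json.loads(course_metadata['result'])`) suggests an hget-style REST endpoint that returns the stored value as a JSON-encoded string under a `result` key, as the Vercel KV / Upstash API does; this is an inference, since the endpoint itself is not shown. A toy illustration with made-up addresses:

```python
import json

# What the endpoint presumably returns: the metadata is itself a JSON-encoded string.
raw = {"result": '{"course_admins": ["[email protected]"], "course_owner": "[email protected]"}'}

course_metadata = json.loads(raw['result'])
print(course_metadata['course_admins'])  # ['[email protected]']
print(course_metadata['course_owner'])   # '[email protected]'
```

Finally, the notification email goes out: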
```diff
+        # send email to admins
+        subject = "UIUC.chat Data Export Complete for " + course_name
+        body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours."
+        email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
+        print("email_status: ", email_status)
+
+        return "File uploaded to S3. Email sent to admins."
+
+    except Exception as e:
+        print(e)
+        return "Error: " + str(e)
```