
Commit

modified canvas update
root authored and root committed Aug 30, 2023
1 parent bf3726b commit 93646ac
Showing 4 changed files with 49 additions and 33 deletions.
3 changes: 2 additions & 1 deletion ai_ta_backend/canvas.py
@@ -28,7 +28,7 @@ def add_users(self, canvas_course_id: str, course_name: str):
      email_id = net_id + "@illinois.edu"
      user_emails.append(email_id)

-   print(user_emails)
+   print("Collected emails: ", user_emails)

    if len(user_emails) > 0:
      return "Success"
@@ -100,6 +100,7 @@ def ingest_course_content(self, canvas_course_id: int, course_name: str)-> str:
    3. Call bulk_ingest() to ingest all files into QDRANT
    4. Delete extracted files from local directory
    """
+   print("-------------")
    print("In ingest_course_content")
    try:
      # Download files into course_content folder
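For orientation, the four docstring steps above might look like the following minimal sketch. download_course_export is a hypothetical helper named here only for illustration; upload_data_files_to_s3 and Ingest.bulk_ingest are names that appear elsewhere in this commit, with their exact signatures assumed.

  import shutil

  def ingest_course_content_sketch(canvas_course_id: int, course_name: str) -> str:
    dest_folder = "course_content"
    try:
      # 1-2. Download the Canvas export and extract it locally (hypothetical helper).
      download_course_export(canvas_course_id, dest_folder)
      # 3. Upload the extracted files to S3, then bulk-ingest them into Qdrant.
      s3_paths = upload_data_files_to_s3(course_name, dest_folder)
      Ingest().bulk_ingest(s3_paths, course_name=course_name)
      # 4. Delete the extracted files from the local directory.
      shutil.rmtree(dest_folder)
      return "Success"
    except Exception as e:
      return f"Failure: {e}"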
69 changes: 41 additions & 28 deletions ai_ta_backend/update_materials.py
@@ -28,51 +28,64 @@ def update_files(source_path: str, course_name: str):
"""
print("In update_files")



ingester = Ingest()
# Get S3 paths of files for given course_name
s3_files = ingester.getAll(course_name)
s3_files = ingester.getAll(course_name)


# Access checksum of s3 files
s3_client = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),)

# Compute checksum of every file in source_path folder
filenames = []
total_files = 0
files_removed = 0
for root, subdirs, files in os.walk(source_path):
for file in files:
total_files += 1
print("file: ", file)
filepath = os.path.join(root, file)
file_checksum = generate_checksum(filepath)

# compare file checksum with checksum of all s3 files
for s3_file in s3_files:
s3_path = s3_file['s3_path']
s3_object = s3_client.get_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path)
s3_checksum = s3_object['ETag']

# remove file from the folder if checksums match
if str(file_checksum) == s3_checksum[1:-1]:
print("checksums match: ", file)
os.remove(filepath)
continue
# compare file checksum with checksum of all s3 files
for s3_file in s3_files:
s3_path = s3_file['s3_path']
#print("existing s3 file: ", s3_path)
s3_object = s3_client.get_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path)
s3_checksum = s3_object['ETag']

# remove file from the folder if checksums match
if str(file_checksum) == s3_checksum[1:-1]:
print("checksums match: ", file)
os.remove(filepath)
files_removed += 1
continue

print("total files: ", total_files)
print("files removed: ", files_removed)
if total_files > 0:
new_s3_paths = upload_data_files_to_s3(course_name, source_path)
subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)

# Upload remaining files to S3 - canvas export contains subdirectories
subdirectories = [subdir for subdir in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, subdir))]
print("subdirs: ", subdirectories)
# # Upload remaining files to S3 - canvas export contains subdirectories
# subdirectories = [subdir for subdir in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, subdir))]
# print("subdirs: ", subdirectories)

if len(subdirectories) == 0:
# pass the source path
new_s3_paths = upload_data_files_to_s3(course_name, source_path)
else:
# pass the subdirectory paths
for subdir in subdirectories:
subdir_path = os.path.join(source_path, subdir)
if len(os.listdir(subdir_path)) == 0:
continue
print("subdir_path: ", subdir_path)
new_s3_paths = upload_data_files_to_s3(course_name, subdir_path)
subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)
# if len(subdirectories) == 0:
# # pass the source path
# new_s3_paths = upload_data_files_to_s3(course_name, source_path)
# else:
# # pass the subdirectory paths
# for subdir in subdirectories:
# subdir_path = os.path.join(source_path, subdir)
# if len(os.listdir(subdir_path)) == 0:
# continue
# new_s3_paths = upload_data_files_to_s3(course_name, subdir_path)
# print("----------------------------------")
# print("new s3 paths: ", new_s3_paths)
# subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)

# Delete files from local directory
shutil.rmtree(source_path)
Expand Down
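The dedup above hinges on generate_checksum() producing the same value S3 reports as the object's ETag. A plausible implementation (assumed; the commit does not show it) is a streamed MD5 hex digest. S3's ETag equals the MD5 of the object body only for single-part, unencrypted uploads, and it arrives wrapped in double quotes, which is what the s3_checksum[1:-1] slice strips.

  import hashlib

  def generate_checksum(filepath: str) -> str:
    # Stream in blocks so large course exports are not loaded fully into memory.
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
      for block in iter(lambda: f.read(8192), b''):
        md5.update(block)
    return md5.hexdigest()  # e.g. '9b2cf535f27731c974343645a3985328', unquoted

Note that multipart uploads get a composite ETag rather than a plain MD5, so files uploaded that way would never match here and would be re-ingested.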
10 changes: 6 additions & 4 deletions ai_ta_backend/vector_database.py
@@ -374,6 +374,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
      title = str(object=time.localtime()[1])+ "/" + str(time.localtime()[2]) + "/" + str(time.localtime()[0])[2:] + ' ' + str(title)

      text = [soup.get_text()]
+
      metadata: List[Dict[str, Any]] = [{
        'course_name': course_name,
        's3_path': s3_path,
@@ -382,7 +383,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
        'base_url': base_url,
        'pagenumber_or_timestamp': ''
      }]

      success_or_failure = self.split_and_upload(text, metadata)
      print(f"_ingest_html: {success_or_failure}")
      return success_or_failure
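For reference, the date-prefixed title built in the first hunk of this file reduces to the following; time.localtime() fields 0, 1, and 2 are year, month, and day.

  import time

  t = time.localtime()
  # Prepend "M/D/YY " to the existing page title, e.g. "8/30/23 Course Syllabus".
  title = f"{t.tm_mon}/{t.tm_mday}/{str(t.tm_year)[2:]} {title}"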
@@ -776,18 +777,19 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
          separators=". ",  # try to split on sentences...
      )
      documents: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)

      def remove_small_contexts(documents: List[Document]) -> List[Document]:
        # Remove TextSplit contexts with fewer than 50 chars.
        return [doc for doc in documents if len(doc.page_content) > 50]

      documents = remove_small_contexts(documents=documents)

      # upload to Qdrant
      self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents])
      data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents]
+     print("split_and_upload data: ", data)
      count = self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).insert(data).execute()  # type: ignore

+     print("split_and_upload count: ", count)
      return "Success"
    except Exception as e:
      err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
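split_and_upload() is essentially a split, filter, and fan-out pipeline. Below is a dependency-free sketch of its shape, with the LangChain text splitter, the Qdrant vectorstore, and the Supabase table replaced by assumed stand-ins.

  from typing import Any, Dict, List

  def vectorstore_add_texts(texts: List[str], metadatas: List[Dict[str, Any]]) -> None:
    print(f"would embed and upsert {len(texts)} chunks into Qdrant")  # stand-in

  def supabase_insert(rows: List[Dict[str, Any]]) -> None:
    print(f"would insert {len(rows)} rows into the materials table")  # stand-in

  def split_and_upload_sketch(texts: List[str], metadatas: List[Dict[str, Any]]) -> str:
    docs: List[Dict[str, Any]] = []
    # Naive sentence split standing in for text_splitter.create_documents().
    for text, meta in zip(texts, metadatas):
      for chunk in text.split(". "):
        docs.append({"content": chunk, "metadata": meta})
    # Drop contexts with fewer than 50 chars, as remove_small_contexts() does.
    docs = [d for d in docs if len(d["content"]) > 50]
    vectorstore_add_texts([d["content"] for d in docs], [d["metadata"] for d in docs])
    supabase_insert(docs)
    return "Success"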
Binary file added media/tmp4o6y9wmb.webm
