
Commit

modified canvas update
root authored and root committed Aug 30, 2023
1 parent bf3726b commit 93646ac
Showing 4 changed files with 49 additions and 33 deletions.
3 changes: 2 additions & 1 deletion ai_ta_backend/canvas.py
@@ -28,7 +28,7 @@ def add_users(self, canvas_course_id: str, course_name: str):
      email_id = net_id + "@illinois.edu"
      user_emails.append(email_id)

-   print(user_emails)
+   print("Collected emails: ", user_emails)

    if len(user_emails) > 0:
      return "Success"
@@ -100,6 +100,7 @@ def ingest_course_content(self, canvas_course_id: int, course_name: str)-> str:
    3. Call bulk_ingest() to ingest all files into QDRANT
    4. Delete extracted files from local directory
    """
+   print("-------------")
    print("In ingest_course_content")
    try:
      # Download files into course_content folder
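For orientation, the four docstring steps above might look like the following minimal sketch. download_course_export is a hypothetical helper named here only for illustration; upload_data_files_to_s3 and Ingest.bulk_ingest are names that appear elsewhere in this commit, with their exact signatures assumed.

  import shutil

  def ingest_course_content_sketch(canvas_course_id: int, course_name: str) -> str:
    dest_folder = "course_content"
    try:
      # 1-2. Download the Canvas export and extract it locally (hypothetical helper).
      download_course_export(canvas_course_id, dest_folder)
      # 3. Upload the extracted files to S3, then bulk-ingest them into Qdrant.
      s3_paths = upload_data_files_to_s3(course_name, dest_folder)
      Ingest().bulk_ingest(s3_paths, course_name=course_name)
      # 4. Delete the extracted files from the local directory.
      shutil.rmtree(dest_folder)
      return "Success"
    except Exception as e:
      return f"Failure: {e}"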
69 changes: 41 additions & 28 deletions ai_ta_backend/update_materials.py
@@ -28,51 +28,64 @@ def update_files(source_path: str, course_name: str):
"""
print("In update_files")



ingester = Ingest()
# Get S3 paths of files for given course_name
s3_files = ingester.getAll(course_name)
s3_files = ingester.getAll(course_name)


# Access checksum of s3 files
s3_client = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),)

# Compute checksum of every file in source_path folder
filenames = []
total_files = 0
files_removed = 0
for root, subdirs, files in os.walk(source_path):
for file in files:
total_files += 1
print("file: ", file)
filepath = os.path.join(root, file)
file_checksum = generate_checksum(filepath)

# compare file checksum with checksum of all s3 files
for s3_file in s3_files:
s3_path = s3_file['s3_path']
s3_object = s3_client.get_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path)
s3_checksum = s3_object['ETag']

# remove file from the folder if checksums match
if str(file_checksum) == s3_checksum[1:-1]:
print("checksums match: ", file)
os.remove(filepath)
continue
# compare file checksum with checksum of all s3 files
for s3_file in s3_files:
s3_path = s3_file['s3_path']
#print("existing s3 file: ", s3_path)
s3_object = s3_client.get_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path)
s3_checksum = s3_object['ETag']

# remove file from the folder if checksums match
if str(file_checksum) == s3_checksum[1:-1]:
print("checksums match: ", file)
os.remove(filepath)
files_removed += 1
continue

print("total files: ", total_files)
print("files removed: ", files_removed)
if total_files > 0:
new_s3_paths = upload_data_files_to_s3(course_name, source_path)
subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)

# Upload remaining files to S3 - canvas export contains subdirectories
subdirectories = [subdir for subdir in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, subdir))]
print("subdirs: ", subdirectories)
# # Upload remaining files to S3 - canvas export contains subdirectories
# subdirectories = [subdir for subdir in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, subdir))]
# print("subdirs: ", subdirectories)

if len(subdirectories) == 0:
# pass the source path
new_s3_paths = upload_data_files_to_s3(course_name, source_path)
else:
# pass the subdirectory paths
for subdir in subdirectories:
subdir_path = os.path.join(source_path, subdir)
if len(os.listdir(subdir_path)) == 0:
continue
print("subdir_path: ", subdir_path)
new_s3_paths = upload_data_files_to_s3(course_name, subdir_path)
subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)
# if len(subdirectories) == 0:
# # pass the source path
# new_s3_paths = upload_data_files_to_s3(course_name, source_path)
# else:
# # pass the subdirectory paths
# for subdir in subdirectories:
# subdir_path = os.path.join(source_path, subdir)
# if len(os.listdir(subdir_path)) == 0:
# continue
# new_s3_paths = upload_data_files_to_s3(course_name, subdir_path)
# print("----------------------------------")
# print("new s3 paths: ", new_s3_paths)
# subdir_ingest = ingester.bulk_ingest(new_s3_paths, course_name=course_name)

# Delete files from local directory
shutil.rmtree(source_path)
Expand Down
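The dedup above hinges on generate_checksum() producing the same value S3 reports as the object's ETag. A plausible implementation (assumed; the commit does not show it) is a streamed MD5 hex digest. S3's ETag equals the MD5 of the object body only for single-part, unencrypted uploads, and it arrives wrapped in double quotes, which is what the s3_checksum[1:-1] slice strips.

  import hashlib

  def generate_checksum(filepath: str) -> str:
    # Stream in blocks so large course exports are not loaded fully into memory.
    md5 = hashlib.md5()
    with open(filepath, 'rb') as f:
      for block in iter(lambda: f.read(8192), b''):
        md5.update(block)
    return md5.hexdigest()  # e.g. '9b2cf535f27731c974343645a3985328', unquoted

Note that multipart uploads get a composite ETag rather than a plain MD5, so files uploaded that way would never match here and would be re-ingested.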
10 changes: 6 additions & 4 deletions ai_ta_backend/vector_database.py
@@ -374,6 +374,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
      title = str(object=time.localtime()[1])+ "/" + str(time.localtime()[2]) + "/" + str(time.localtime()[0])[2:] + ' ' + str(title)

      text = [soup.get_text()]
+
      metadata: List[Dict[str, Any]] = [{
        'course_name': course_name,
        's3_path': s3_path,
@@ -382,7 +383,7 @@ def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str:
        'base_url': base_url,
        'pagenumber_or_timestamp': ''
      }]

      success_or_failure = self.split_and_upload(text, metadata)
      print(f"_ingest_html: {success_or_failure}")
      return success_or_failure
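For reference, the date-prefixed title built in the first hunk of this file reduces to the following; time.localtime() fields 0, 1, and 2 are year, month, and day.

  import time

  t = time.localtime()
  # Prepend "M/D/YY " to the existing page title, e.g. "8/30/23 Course Syllabus".
  title = f"{t.tm_mon}/{t.tm_mday}/{str(t.tm_year)[2:]} {title}"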
@@ -776,18 +777,19 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
          separators=". ",  # try to split on sentences...
      )
      documents: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas)

      def remove_small_contexts(documents: List[Document]) -> List[Document]:
        # Remove TextSplit contexts with fewer than 50 chars.
        return [doc for doc in documents if len(doc.page_content) > 50]

      documents = remove_small_contexts(documents=documents)

      # upload to Qdrant
      self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents])
      data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents]
+     print("split_and_upload data: ", data)
      count = self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).insert(data).execute()  # type: ignore

+     print("split_and_upload count: ", count)
      return "Success"
    except Exception as e:
      err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore
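split_and_upload() is essentially a split, filter, and fan-out pipeline. Below is a dependency-free sketch of its shape, with the LangChain text splitter, the Qdrant vectorstore, and the Supabase table replaced by assumed stand-ins.

  from typing import Any, Dict, List

  def vectorstore_add_texts(texts: List[str], metadatas: List[Dict[str, Any]]) -> None:
    print(f"would embed and upsert {len(texts)} chunks into Qdrant")  # stand-in

  def supabase_insert(rows: List[Dict[str, Any]]) -> None:
    print(f"would insert {len(rows)} rows into the materials table")  # stand-in

  def split_and_upload_sketch(texts: List[str], metadatas: List[Dict[str, Any]]) -> str:
    docs: List[Dict[str, Any]] = []
    # Naive sentence split standing in for text_splitter.create_documents().
    for text, meta in zip(texts, metadatas):
      for chunk in text.split(". "):
        docs.append({"content": chunk, "metadata": meta})
    # Drop contexts with fewer than 50 chars, as remove_small_contexts() does.
    docs = [d for d in docs if len(d["content"]) > 50]
    vectorstore_add_texts([d["content"] for d in docs], [d["metadata"] for d in docs])
    supabase_insert(docs)
    return "Success"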
Binary file added media/tmp4o6y9wmb.webm
