From 3532679f09446aaafb4e20bce60bb157cac5babf Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 22 Jul 2024 11:32:46 -0500 Subject: [PATCH 1/5] modified download_modules() to download pages --- ai_ta_backend/beam/canvas_ingest.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ai_ta_backend/beam/canvas_ingest.py b/ai_ta_backend/beam/canvas_ingest.py index 1fed6acd..a1da4f77 100644 --- a/ai_ta_backend/beam/canvas_ingest.py +++ b/ai_ta_backend/beam/canvas_ingest.py @@ -364,14 +364,16 @@ def download_modules(self, dest_folder: str, api_path: str) -> str: Rest of the things are covered in other functions. """ print("In download_modules") + # need to parse pages through modules try: module_request = requests.get(api_path + "/modules?include=items", headers=self.headers) + modules = module_request.json() for module in modules: module_items = module['items'] for item in module_items: - if item['type'] == 'ExternalUrl': + if item['type'] == 'ExternalUrl': # if any external link is present in the module external_url = item['external_url'] url_title = item['title'] @@ -381,6 +383,19 @@ def download_modules(self, dest_folder: str, api_path: str) -> str: html_file_name = url_title + ".html" with open(dest_folder + "/" + html_file_name, 'w') as html_file: html_file.write(response.text) + else: + print("item type: ", item['type']) + item_url = item['url'] + item_request = requests.get(item_url, headers=self.headers) + if item_request.status_code == 200: + item_data = item_request.json() + item_body = item_data['body'] + html_file_name = item_data['url'] + ".html" + with open(dest_folder + "/" + html_file_name, 'w') as html_file: + html_file.write(item_body) + else: + print("Item request failed with status code: ", item_request.status_code) + return "Success" except Exception as e: sentry_sdk.capture_exception(e) From 8a9145c9a3e3b8d1735a797ee61e6604253aa337 Mon Sep 17 00:00:00 2001 From: star-nox Date: Mon, 22 Jul 2024 15:13:44 -0500 Subject: [PATCH 2/5] improved error handling in download_modules() --- ai_ta_backend/beam/canvas_ingest.py | 48 ++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/ai_ta_backend/beam/canvas_ingest.py b/ai_ta_backend/beam/canvas_ingest.py index a1da4f77..95ac470b 100644 --- a/ai_ta_backend/beam/canvas_ingest.py +++ b/ai_ta_backend/beam/canvas_ingest.py @@ -302,7 +302,7 @@ def download_files(self, dest_folder: str, api_path: str) -> str: course = self.canvas_client.get_course(api_path.split('/')[-1]) files = course.get_files() - + for file in files: # file_name = file['filename'] file_name = file.filename @@ -326,7 +326,7 @@ def download_pages(self, dest_folder: str, api_path: str) -> str: try: pages_request = requests.get(api_path + "/pages", headers=self.headers) pages = pages_request.json() - + print("Pages: ", pages) for page in pages: if page['html_url'] != '': page_name = page['url'] + ".html" @@ -373,24 +373,56 @@ def download_modules(self, dest_folder: str, api_path: str) -> str: for module in modules: module_items = module['items'] for item in module_items: - if item['type'] == 'ExternalUrl': # if any external link is present in the module + if item['type'] == 'ExternalUrl': # EXTERNAL LINK external_url = item['external_url'] url_title = item['title'] # Download external url as HTML response = requests.get(external_url) if response.status_code == 200: - html_file_name = url_title + ".html" + html_file_name = "external_link_" + url_title.replace(" ", "_") + ".html" with open(dest_folder + "/" 
+ html_file_name, 'w') as html_file:
                html_file.write(response.text)
-          else:
-            print("item type: ", item['type'])
+
+          elif item['type'] == 'Discussion': # DISCUSSION
+            discussion_url = item['url']
+            discussion_req = requests.get(discussion_url, headers=self.headers)
+
+            if discussion_req.status_code == 200:
+              discussion_data = discussion_req.json()
+              discussion_message = discussion_data['message']
+              discussion_filename = "Discussion_" + discussion_data['title'].replace(" ", "_") + ".html"
+
+              # write the message to a file
+              with open(dest_folder + "/" + discussion_filename, 'w') as html_file:
+                html_file.write(discussion_message)
+
+          elif item['type'] == 'Assignment': # ASSIGNMENT
+            print("Assignments are handled via download_assignments()")
+            continue
+
+          elif item['type'] == 'Quiz':
+            print("Quizzes are not handled at the moment.")
+            continue
+
+          else: # OTHER ITEMS - PAGES
+            if 'url' not in item:
+              print("No URL in item: ", item['type'])
+              continue
+
             item_url = item['url']
             item_request = requests.get(item_url, headers=self.headers)
+
             if item_request.status_code == 200:
               item_data = item_request.json()
+              if 'body' not in item_data:
+                print("No body in item: ", item_data)
+                continue
+
               item_body = item_data['body']
-              html_file_name = item_data['url'] + ".html"
+              html_file_name = item['type'] + "_" + item_data['url'] + ".html"
+
+              # write page body to a file
               with open(dest_folder + "/" + html_file_name, 'w') as html_file:
                 html_file.write(item_body)
             else:
@@ -430,7 +462,7 @@ def download_discussions(self, dest_folder: str, api_path: str) -> str:
     try:
       discussion_request = requests.get(api_path + "/discussion_topics", headers=self.headers)
       discussions = discussion_request.json()
-      
+      #print("Discussions: ", discussions)
       for discussion in discussions:
         discussion_content = discussion['message']
         discussion_name = discussion['title'] + ".html"

From c25b7042d69c4b70f373f1fe5912a982ca35ac09 Mon Sep 17 00:00:00 2001
From: star-nox
Date: Mon, 22 Jul 2024 18:12:31 -0500
Subject: [PATCH 3/5] added function to extract URLs from HTML

---
 ai_ta_backend/beam/canvas_ingest.py | 56 ++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/ai_ta_backend/beam/canvas_ingest.py b/ai_ta_backend/beam/canvas_ingest.py
index 95ac470b..22c1c742 100644
--- a/ai_ta_backend/beam/canvas_ingest.py
+++ b/ai_ta_backend/beam/canvas_ingest.py
@@ -186,7 +186,15 @@ def download_course_content(self, canvas_course_id: int, dest_folder: str, conte
       elif key == 'discussions':
         self.download_discussions(dest_folder, api_path)
 
-      # at this point, we have all extracted files in the dest_folder.
+      # at this point, we have all canvas files in the dest_folder.
+      # parse all HTML files in dest_folder and extract URLs
+      extract_urls_from_html = self.extract_urls_from_html(dest_folder)
+      print("extract_urls_from_html=", extract_urls_from_html)
+
+      # links - canvas files, external urls, embedded videos
+
+
+
       return "Success"
 
     except Exception as e:
@@ -473,3 +481,49 @@ def download_discussions(self, dest_folder: str, api_path: str) -> str:
     except Exception as e:
       sentry_sdk.capture_exception(e)
       return "Failed! Error: " + str(e)
+
+  def extract_urls_from_html(self, dir_path: str) -> Dict[str, List[str]]:
+    """
+    Extracts URLs from all HTML files in a directory. 
+ """ + print("In extract_urls_from_html") + try: + file_links = [] + video_links = [] + external_links = [] + for file_name in os.listdir(dir_path): + if file_name.endswith(".html"): + file_path = os.path.join(dir_path, file_name) + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + content = file.read() + except UnicodeDecodeError: + with open(file_path, 'r', encoding='latin-1') as file: + content = file.read() + + soup = BeautifulSoup(content, 'html.parser') + + # Extracting links from href attributes + href_links = soup.find_all('a', href=True) + for link in href_links: + href = link['href'] + if re.match(r'https://canvas\.illinois\.edu/courses/\d+/files/.*', href): + file_links.append(href) + else: + external_links.append(href) + + # Extracting video links from src attributes + src_links = soup.find_all('iframe', src=True) + for link in src_links: + src = link['src'] + if re.match(r'https://ensemble\.illinois\.edu/hapi/v1/contents/.*', src): + video_links.append(src) + + return { + 'file_links': file_links, + 'video_links': video_links, + 'external_links': external_links} + + except Exception as e: + sentry_sdk.capture_exception(e) + return {} From 714f60c64e49517eddacf69f66d6157d5acf5c6a Mon Sep 17 00:00:00 2001 From: star-nox Date: Wed, 24 Jul 2024 21:39:58 -0500 Subject: [PATCH 4/5] added file and video download functions --- ai_ta_backend/beam/canvas_ingest.py | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/ai_ta_backend/beam/canvas_ingest.py b/ai_ta_backend/beam/canvas_ingest.py index 22c1c742..c09f9598 100644 --- a/ai_ta_backend/beam/canvas_ingest.py +++ b/ai_ta_backend/beam/canvas_ingest.py @@ -17,12 +17,16 @@ from beam import App, QueueDepthAutoscaler, Runtime # RequestLatencyAutoscaler, from canvasapi import Canvas from posthog import Posthog +from bs4 import BeautifulSoup +import yt_dlp requirements = [ "boto3==1.28.79", "posthog==3.1.0", "canvasapi==3.2.0", "sentry-sdk==1.39.1", + "bs4==0.0.2", + "yt-dlp==2024.7.16", ] app = App( @@ -527,3 +531,67 @@ def extract_urls_from_html(self, dir_path: str) -> Dict[str, List[str]]: except Exception as e: sentry_sdk.capture_exception(e) return {} + + def download_files_from_urls(self, urls: List[str], course_id: int, dir_path: str): + """ + This function downloads files from a given Canvas course using the URLs provided. + input: urls - list of URLs scraped from Canvas HTML pages. + """ + try: + for url in urls: + with requests.get(url, stream=True) as r: + content_type = r.headers.get('Content-Type') + print("Content type: ", content_type) + content_disposition = r.headers.get('Content-Disposition') + if 'filename=' in content_disposition: + filename = content_disposition.split('filename=')[1].strip('"') + print("local filename: ", filename) + else: + print("No filename in content disposition") + continue + + # write to PDF + file_path = os.path.join(dir_path, filename) + with open(file_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + print("Downloaded file: ", filename) + + return "Success" + except Exception as e: + sentry_sdk.capture_exception(e) + print("Error downloading files from URLs: ", e) + return "Failed! Error: " + str(e) + + def download_videos_from_urls(self, urls: List[str], course_id: int, dir_path: str): + """ + This function downloads videos from a given Canvas course using the URLs provided. 
+ """ + try: + for url in urls: + with requests.get(url, stream=True) as r: + content_type = r.headers.get('Content-Type') + print("Content type: ", content_type) + content_disposition = r.headers.get('Content-Disposition') + if 'filename=' in content_disposition: + filename = content_disposition.split('filename=')[1].strip('"') + print("local filename: ", filename) + else: + print("No filename in content disposition") + continue + + # download video + file_path = os.path.join(dir_path, filename) + ydl_opts = { + 'outtmpl': file_path + } + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + + print(f"Video downloaded successfully: {filename}") + + return "Success" + except Exception as e: + sentry_sdk.capture_exception(e) + print("Error downloading videos from URLs: ", e) + return "Failed! Error: " + str(e) From b717fc9a052f950f739a43e02ea8e680dbb3cc27 Mon Sep 17 00:00:00 2001 From: star-nox Date: Thu, 25 Jul 2024 14:12:24 -0500 Subject: [PATCH 5/5] fixed video ingest --- ai_ta_backend/beam/canvas_ingest.py | 68 ++++++++++++++++++----------- ai_ta_backend/beam/ingest.py | 28 +++++++++++- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/ai_ta_backend/beam/canvas_ingest.py b/ai_ta_backend/beam/canvas_ingest.py index c09f9598..9476b80c 100644 --- a/ai_ta_backend/beam/canvas_ingest.py +++ b/ai_ta_backend/beam/canvas_ingest.py @@ -192,13 +192,20 @@ def download_course_content(self, canvas_course_id: int, dest_folder: str, conte # at this point, we have all canvas files in the dest_folder. # parse all HTML files in dest_folder and extract URLs - extract_urls_from_html = self.extract_urls_from_html(dest_folder) - print("extract_urls_from_html=", extract_urls_from_html) + extracted_urls_from_html = self.extract_urls_from_html(dest_folder) + #print("extract_urls_from_html=", extract_urls_from_html) # links - canvas files, external urls, embedded videos - - + file_links = extracted_urls_from_html.get('file_links', []) + video_links = extracted_urls_from_html.get('video_links', []) + #external_links = extract_urls_from_html.get('external_links', []) + + # download files from URLs + file_download_status = self.download_files_from_urls(file_links, canvas_course_id, dest_folder) + video_download_status = self.download_videos_from_urls(video_links, canvas_course_id, dest_folder) + print("file_download_status=", file_download_status) + print("video_download_status=", video_download_status) return "Success" except Exception as e: @@ -266,11 +273,13 @@ def ingest_course_content(self, uid = str(uuid.uuid4()) + '-' unique_filename = uid + name_without_extension + extension + s3_path = "courses/" + course_name + "/" + unique_filename readable_filename = name_without_extension + extension - all_s3_paths.append(unique_filename) + all_s3_paths.append(s3_path) all_readable_filenames.append(readable_filename) print("Uploading file: ", readable_filename) - self.upload_file(file_path, os.getenv('S3_BUCKET_NAME'), unique_filename) + print("Filepath: ", file_path) + self.upload_file(file_path, os.getenv('S3_BUCKET_NAME'), s3_path) # Delete files from local directory shutil.rmtree(folder_path) @@ -338,7 +347,7 @@ def download_pages(self, dest_folder: str, api_path: str) -> str: try: pages_request = requests.get(api_path + "/pages", headers=self.headers) pages = pages_request.json() - print("Pages: ", pages) + #print("Pages: ", pages) for page in pages: if page['html_url'] != '': page_name = page['url'] + ".html" @@ -414,12 +423,12 @@ def download_modules(self, dest_folder: str, api_path: str) 
-> str: continue elif item['type'] == 'Quiz': - print("Quizzes are not handled at the moment.") + #print("Quizzes are not handled at the moment.") continue else: # OTHER ITEMS - PAGES if 'url' not in item: - print("No URL in item: ", item['type']) + #print("No URL in item: ", item['type']) continue item_url = item['url'] @@ -428,7 +437,7 @@ def download_modules(self, dest_folder: str, api_path: str) -> str: if item_request.status_code == 200: item_data = item_request.json() if 'body' not in item_data: - print("No body in item: ", item_data) + #print("No body in item: ", item_data) continue item_body = item_data['body'] @@ -525,8 +534,7 @@ def extract_urls_from_html(self, dir_path: str) -> Dict[str, List[str]]: return { 'file_links': file_links, - 'video_links': video_links, - 'external_links': external_links} + 'video_links': video_links,} except Exception as e: sentry_sdk.capture_exception(e) @@ -537,17 +545,25 @@ def download_files_from_urls(self, urls: List[str], course_id: int, dir_path: st This function downloads files from a given Canvas course using the URLs provided. input: urls - list of URLs scraped from Canvas HTML pages. """ + print("In download_files_from_urls") + #print("Number of URLs: ", len(urls)) try: for url in urls: + #print("Downloading file from URL: ", url) with requests.get(url, stream=True) as r: content_type = r.headers.get('Content-Type') - print("Content type: ", content_type) + #print("Content type: ", content_type) content_disposition = r.headers.get('Content-Disposition') + #print("Content disposition: ", content_disposition) + if content_disposition is None: + #print("No content disposition") + continue + if 'filename=' in content_disposition: filename = content_disposition.split('filename=')[1].strip('"') - print("local filename: ", filename) + #print("local filename: ", filename) else: - print("No filename in content disposition") + #print("No filename in content disposition") continue # write to PDF @@ -567,26 +583,26 @@ def download_videos_from_urls(self, urls: List[str], course_id: int, dir_path: s """ This function downloads videos from a given Canvas course using the URLs provided. 
""" + print("In download_videos_from_urls") + #print("Video URLs: ", len(urls)) try: + count = 0 for url in urls: + count += 1 with requests.get(url, stream=True) as r: - content_type = r.headers.get('Content-Type') - print("Content type: ", content_type) - content_disposition = r.headers.get('Content-Disposition') - if 'filename=' in content_disposition: - filename = content_disposition.split('filename=')[1].strip('"') - print("local filename: ", filename) - else: - print("No filename in content disposition") - continue + filename = f"{course_id}_video_{count}.mp4" # download video file_path = os.path.join(dir_path, filename) ydl_opts = { - 'outtmpl': file_path + 'outtmpl': f'{dir_path}/{course_id}_video_{count}.%(ext)s', # Dynamic extension + 'format': 'best', # Best quality format } with yt_dlp.YoutubeDL(ydl_opts) as ydl: - ydl.download([url]) + info_dict = ydl.extract_info(url, download=True) + ext = info_dict.get('ext', 'mp4') # Get extension from info, default to mp4 + filename = f"{course_id}_video_{count}.{ext}" + print(f"Video downloaded successfully: {filename}") diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py index 37bc90c7..99053cfd 100644 --- a/ai_ta_backend/beam/ingest.py +++ b/ai_ta_backend/beam/ingest.py @@ -51,6 +51,9 @@ from qdrant_client.models import PointStruct from supabase.client import ClientOptions +import subprocess + + # from langchain.schema.output_parser import StrOutputParser # from langchain.chat_models import AzureChatOpenAI @@ -79,6 +82,7 @@ "sentry-sdk==1.39.1", "nomic==2.0.14", "pdfplumber==0.11.0", # PDF OCR, better performance than Fitz/PyMuPDF in my Gies PDF testing. + ] # TODO: consider adding workers. They share CPU and memory https://docs.beam.cloud/deployment/autoscaling#worker-use-cases @@ -537,8 +541,28 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str: with NamedTemporaryFile(suffix=file_ext) as video_tmpfile: # download from S3 into an video tmpfile self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile) - # extract audio from video tmpfile - mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) + + # try with original file first + try: + mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) + except Exception as e: + print("Applying moov atom fix and retrying...") + # Fix the moov atom issue using FFmpeg + fixed_video_tmpfile = NamedTemporaryFile(suffix=file_ext, delete=False) + try: + result = subprocess.run([ + 'ffmpeg', '-y', '-i', video_tmpfile.name, '-c', 'copy', '-movflags', 'faststart', fixed_video_tmpfile.name + ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + #print(result.stdout.decode()) + #print(result.stderr.decode()) + except subprocess.CalledProcessError as e: + #print(e.stdout.decode()) + #print(e.stderr.decode()) + print("Error in FFmpeg command: ", e) + raise e + + # extract audio from video tmpfile + mp4_version = AudioSegment.from_file(fixed_video_tmpfile.name, file_ext[1:]) # save the extracted audio as a temporary webm file with NamedTemporaryFile(suffix=".webm", dir=media_dir, delete=False) as webm_tmpfile: