
Commit

fixed video ingest
star-nox committed Jul 25, 2024
1 parent 714f60c commit b717fc9
Showing 2 changed files with 68 additions and 28 deletions.
68 changes: 42 additions & 26 deletions ai_ta_backend/beam/canvas_ingest.py
@@ -192,13 +192,20 @@ def download_course_content(self, canvas_course_id: int, dest_folder: str, conte

# at this point, we have all canvas files in the dest_folder.
# parse all HTML files in dest_folder and extract URLs
extract_urls_from_html = self.extract_urls_from_html(dest_folder)
print("extract_urls_from_html=", extract_urls_from_html)
extracted_urls_from_html = self.extract_urls_from_html(dest_folder)
#print("extract_urls_from_html=", extract_urls_from_html)

# links - canvas files, external urls, embedded videos


file_links = extracted_urls_from_html.get('file_links', [])

video_links = extracted_urls_from_html.get('video_links', [])
#external_links = extract_urls_from_html.get('external_links', [])

# download files from URLs
file_download_status = self.download_files_from_urls(file_links, canvas_course_id, dest_folder)
video_download_status = self.download_videos_from_urls(video_links, canvas_course_id, dest_folder)
print("file_download_status=", file_download_status)
print("video_download_status=", video_download_status)

return "Success"
except Exception as e:
@@ -266,11 +273,13 @@ def ingest_course_content(self,
uid = str(uuid.uuid4()) + '-'

unique_filename = uid + name_without_extension + extension
s3_path = "courses/" + course_name + "/" + unique_filename
readable_filename = name_without_extension + extension
all_s3_paths.append(unique_filename)
all_s3_paths.append(s3_path)
all_readable_filenames.append(readable_filename)
print("Uploading file: ", readable_filename)
self.upload_file(file_path, os.getenv('S3_BUCKET_NAME'), unique_filename)
print("Filepath: ", file_path)
self.upload_file(file_path, os.getenv('S3_BUCKET_NAME'), s3_path)

# Delete files from local directory
shutil.rmtree(folder_path)
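
For context, the change in this hunk is that uploads now land under a courses/<course_name>/ prefix and that full key is what gets recorded in all_s3_paths. A minimal sketch of that key layout, assuming a plain boto3 client rather than the repository's upload_file wrapper:

import os
import uuid

import boto3

def upload_course_file(file_path: str, course_name: str) -> str:
    """Upload a local file under the courses/<course_name>/ prefix and return its S3 key."""
    unique_filename = str(uuid.uuid4()) + '-' + os.path.basename(file_path)
    s3_path = "courses/" + course_name + "/" + unique_filename  # key layout introduced in this commit
    s3 = boto3.client('s3')
    s3.upload_file(file_path, os.environ['S3_BUCKET_NAME'], s3_path)
    return s3_path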
@@ -338,7 +347,7 @@ def download_pages(self, dest_folder: str, api_path: str) -> str:
try:
pages_request = requests.get(api_path + "/pages", headers=self.headers)
pages = pages_request.json()
print("Pages: ", pages)
#print("Pages: ", pages)
for page in pages:
if page['html_url'] != '':
page_name = page['url'] + ".html"
@@ -414,12 +423,12 @@ def download_modules(self, dest_folder: str, api_path: str) -> str:
continue

elif item['type'] == 'Quiz':
print("Quizzes are not handled at the moment.")
#print("Quizzes are not handled at the moment.")
continue

else: # OTHER ITEMS - PAGES
if 'url' not in item:
print("No URL in item: ", item['type'])
#print("No URL in item: ", item['type'])
continue

item_url = item['url']
@@ -428,7 +437,7 @@ def download_modules(self, dest_folder: str, api_path: str) -> str:
if item_request.status_code == 200:
item_data = item_request.json()
if 'body' not in item_data:
print("No body in item: ", item_data)
#print("No body in item: ", item_data)
continue

item_body = item_data['body']
@@ -525,8 +534,7 @@ def extract_urls_from_html(self, dir_path: str) -> Dict[str, List[str]]:

return {
'file_links': file_links,
'video_links': video_links,
'external_links': external_links}
'video_links': video_links,}

except Exception as e:
sentry_sdk.capture_exception(e)
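
The hunk above only shows the return value losing external_links; the parsing itself is not part of the diff. A hedged sketch of how file and video links might be pulled from the downloaded HTML pages — the selectors and the '/files/' and 'media' substrings are illustrative assumptions, not the repository's actual rules:

import os
from typing import Dict, List

from bs4 import BeautifulSoup

def extract_links(dir_path: str) -> Dict[str, List[str]]:
    """Scan every HTML page in dir_path and bucket anchors and iframes into file vs. video links."""
    file_links, video_links = [], []
    for name in os.listdir(dir_path):
        if not name.endswith('.html'):
            continue
        with open(os.path.join(dir_path, name), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        for a in soup.find_all('a', href=True):
            if '/files/' in a['href']:  # Canvas file download links
                file_links.append(a['href'])
        for iframe in soup.find_all('iframe', src=True):
            if 'media' in iframe['src']:  # embedded media players
                video_links.append(iframe['src'])
    # external_links intentionally dropped, matching the return value in this commit
    return {'file_links': file_links, 'video_links': video_links}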
@@ -537,17 +545,25 @@ def download_files_from_urls(self, urls: List[str], course_id: int, dir_path: st
This function downloads files from a given Canvas course using the URLs provided.
input: urls - list of URLs scraped from Canvas HTML pages.
"""
print("In download_files_from_urls")
#print("Number of URLs: ", len(urls))
try:
for url in urls:
#print("Downloading file from URL: ", url)
with requests.get(url, stream=True) as r:
content_type = r.headers.get('Content-Type')
print("Content type: ", content_type)
#print("Content type: ", content_type)
content_disposition = r.headers.get('Content-Disposition')
#print("Content disposition: ", content_disposition)
if content_disposition is None:
#print("No content disposition")
continue

if 'filename=' in content_disposition:
filename = content_disposition.split('filename=')[1].strip('"')
print("local filename: ", filename)
#print("local filename: ", filename)
else:
print("No filename in content disposition")
#print("No filename in content disposition")
continue

# write to PDF
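
The hunk cuts off right at the "# write to PDF" comment, so the actual write is not shown. A small sketch of what the streamed download plausibly looks like end to end; the header checks mirror the diff above, while the chunk size and write loop are assumptions:

import os
from typing import Optional

import requests

def download_one(url: str, dir_path: str) -> Optional[str]:
    """Stream one Canvas file URL to disk; skip responses without a usable Content-Disposition."""
    with requests.get(url, stream=True) as r:
        content_disposition = r.headers.get('Content-Disposition')
        if content_disposition is None or 'filename=' not in content_disposition:
            return None  # nothing to name the file after, mirroring the `continue` above
        filename = content_disposition.split('filename=')[1].strip('"')
        file_path = os.path.join(dir_path, filename)
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return file_path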
@@ -567,26 +583,26 @@ def download_videos_from_urls(self, urls: List[str], course_id: int, dir_path: s
"""
This function downloads videos from a given Canvas course using the URLs provided.
"""
print("In download_videos_from_urls")
#print("Video URLs: ", len(urls))
try:
count = 0
for url in urls:
count += 1
with requests.get(url, stream=True) as r:
content_type = r.headers.get('Content-Type')
print("Content type: ", content_type)
content_disposition = r.headers.get('Content-Disposition')
if 'filename=' in content_disposition:
filename = content_disposition.split('filename=')[1].strip('"')
print("local filename: ", filename)
else:
print("No filename in content disposition")
continue
filename = f"{course_id}_video_{count}.mp4"

# download video
file_path = os.path.join(dir_path, filename)
ydl_opts = {
'outtmpl': file_path
'outtmpl': f'{dir_path}/{course_id}_video_{count}.%(ext)s', # Dynamic extension
'format': 'best', # Best quality format
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
info_dict = ydl.extract_info(url, download=True)
ext = info_dict.get('ext', 'mp4') # Get extension from info, default to mp4
filename = f"{course_id}_video_{count}.{ext}"


print(f"Video downloaded successfully: {filename}")

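The core of this commit's video fix is switching download_videos_from_urls from a raw requests download to yt-dlp with a dynamic-extension output template. A standalone sketch of that pattern, with the course_id/count naming taken from the diff and everything else assumed:

import yt_dlp

def download_video(url: str, dir_path: str, course_id: int, count: int) -> str:
    """Download one video with yt-dlp and return the filename it actually produced."""
    ydl_opts = {
        'outtmpl': f'{dir_path}/{course_id}_video_{count}.%(ext)s',  # let yt-dlp pick the extension
        'format': 'best',  # best available single-file format
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
    ext = info_dict.get('ext', 'mp4')  # read the real extension back; default to mp4
    return f"{course_id}_video_{count}.{ext}"
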
28 changes: 26 additions & 2 deletions ai_ta_backend/beam/ingest.py
@@ -51,6 +51,9 @@
from qdrant_client.models import PointStruct
from supabase.client import ClientOptions

import subprocess


# from langchain.schema.output_parser import StrOutputParser
# from langchain.chat_models import AzureChatOpenAI

@@ -79,6 +82,7 @@
"sentry-sdk==1.39.1",
"nomic==2.0.14",
"pdfplumber==0.11.0", # PDF OCR, better performance than Fitz/PyMuPDF in my Gies PDF testing.

]

# TODO: consider adding workers. They share CPU and memory https://docs.beam.cloud/deployment/autoscaling#worker-use-cases
@@ -537,8 +541,28 @@ def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str:
with NamedTemporaryFile(suffix=file_ext) as video_tmpfile:
# download from S3 into an video tmpfile
self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile)
# extract audio from video tmpfile
mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:])

# try with original file first
try:
mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:])
except Exception as e:
print("Applying moov atom fix and retrying...")
# Fix the moov atom issue using FFmpeg
fixed_video_tmpfile = NamedTemporaryFile(suffix=file_ext, delete=False)
try:
result = subprocess.run([
'ffmpeg', '-y', '-i', video_tmpfile.name, '-c', 'copy', '-movflags', 'faststart', fixed_video_tmpfile.name
], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#print(result.stdout.decode())
#print(result.stderr.decode())
except subprocess.CalledProcessError as e:
#print(e.stdout.decode())
#print(e.stderr.decode())
print("Error in FFmpeg command: ", e)
raise e

# extract audio from video tmpfile
mp4_version = AudioSegment.from_file(fixed_video_tmpfile.name, file_ext[1:])

# save the extracted audio as a temporary webm file
with NamedTemporaryFile(suffix=".webm", dir=media_dir, delete=False) as webm_tmpfile:
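The second half of the fix, in _ingest_single_video, retries audio extraction after remuxing the file with FFmpeg's faststart flag, which moves a misplaced moov atom to the front of the MP4. A compact sketch of that fallback under the same assumptions as the diff (pydub for decoding, ffmpeg on PATH); file names are illustrative:

import subprocess
from tempfile import NamedTemporaryFile

from pydub import AudioSegment

def load_audio_with_moov_fix(video_path: str, ext: str) -> AudioSegment:
    """Decode audio from a video; if the container is unreadable, remux with faststart and retry."""
    try:
        return AudioSegment.from_file(video_path, ext)
    except Exception:
        fixed = NamedTemporaryFile(suffix=f".{ext}", delete=False)
        subprocess.run(
            ['ffmpeg', '-y', '-i', video_path, '-c', 'copy', '-movflags', 'faststart', fixed.name],
            check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
        )  # stream-copy remux: rewrites the container without re-encoding
        return AudioSegment.from_file(fixed.name, ext)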
