From eafb1a51250b5f086a47e6fc31021068514f9bbd Mon Sep 17 00:00:00 2001 From: Kerry Vance Date: Sun, 31 Jan 2021 15:10:48 -0800 Subject: [PATCH] Extra functionality --- start_kit/preprocess.py | 182 ++++++++++++----------- start_kit/scripts/swf2mp4.sh | 2 +- start_kit/video_downloader.py | 262 ++++++++++++++++------------------ 3 files changed, 224 insertions(+), 222 deletions(-) diff --git a/start_kit/preprocess.py b/start_kit/preprocess.py index 83b98f83c7..84820e2bd3 100644 --- a/start_kit/preprocess.py +++ b/start_kit/preprocess.py @@ -3,15 +3,29 @@ # 2. Extract YouTube frames and create video instances. import os +import sys +import glob import json import cv2 - import shutil +import re + +import logging +logging.basicConfig( + filename="preProc.log", + filemode='w', + level=logging.DEBUG +) +logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) + -def convert_everything_to_mp4(): - cmd = 'bash scripts/swf2mp4.sh' +def convert_frames_to_video(frame_array, path_out, size, fps=25): + out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size) - os.system(cmd) + for i in range(len(frame_array)): + # writing to a image array + out.write(frame_array[i]) + out.release() def video_to_frames(video_path, size=None): @@ -20,13 +34,14 @@ def video_to_frames(video_path, size=None): size -> (int, int), width, height. """ + print(f"video_path: {video_path} size: {size}") cap = cv2.VideoCapture(video_path) frames = [] - + while True: ret, frame = cap.read() - + if ret: if size: frame = cv2.resize(frame, size) @@ -39,91 +54,88 @@ def video_to_frames(video_path, size=None): return frames -def convert_frames_to_video(frame_array, path_out, size, fps=25): - out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size) - - for i in range(len(frame_array)): - # writing to a image array - out.write(frame_array[i]) - out.release() - - def extract_frame_as_video(src_video_path, start_frame, end_frame): frames = video_to_frames(src_video_path) return frames[start_frame: end_frame+1] -def extract_all_yt_instances(content): - cnt = 1 - - if not os.path.exists('videos'): - os.mkdir('videos') - - for entry in content: - instances = entry['instances'] - - for inst in instances: - url = inst['url'] - video_id = inst['video_id'] - - if 'youtube' in url or 'youtu.be' in url: - cnt += 1 - - yt_identifier = url[-11:] - - src_video_path = os.path.join('raw_videos_mp4', yt_identifier + '.mp4') - dst_video_path = os.path.join('videos', video_id + '.mp4') - - if not os.path.exists(src_video_path): - continue - - if os.path.exists(dst_video_path): - print('{} exists.'.format(dst_video_path)) - continue - - # because the JSON file indexes from 1. - start_frame = inst['frame_start'] - 1 - end_frame = inst['frame_end'] - 1 - - if end_frame <= 0: - shutil.copyfile(src_video_path, dst_video_path) - continue - - selected_frames = extract_frame_as_video(src_video_path, start_frame, end_frame) - - # when OpenCV reads an image, it returns size in (h, w, c) - # when OpenCV creates a writer, it requres size in (w, h). - size = selected_frames[0].shape[:2][::-1] - - convert_frames_to_video(selected_frames, dst_video_path, size) - - print(cnt, dst_video_path) - else: - cnt += 1 - - src_video_path = os.path.join('raw_videos_mp4', video_id + '.mp4') - dst_video_path = os.path.join('videos', video_id + '.mp4') - - if os.path.exists(dst_video_path): - print('{} exists.'.format(dst_video_path)) - continue - - if not os.path.exists(src_video_path): - continue - - print(cnt, dst_video_path) - shutil.copyfile(src_video_path, dst_video_path) - - -def main(): - # 1. Convert .swf, .mkv file to mp4. - convert_everything_to_mp4() - - content = json.load(open('WLASL_v0.3.json')) - extract_all_yt_instances(content) +class Preproc: + def __init__(self, + idxf="WLASL_v0.3.json", + videoDir="data"): + self.indexFile = idxf + self.vd = videoDir + + def convertTomp4(self): + for f in os.scandir(self.vd): + if ( + not f.path.endswith(".mp4") and + not glob.glob( + os.path.join( + self.vd, + os.path.splitext(f.name)[0]) + '.mp4' + ) + ): + dest = os.path.join(self.vd, + os.path.splitext(f.name)[0] + '.mp4' + ) + if ( + os.system( + f"ffmpeg -loglevel panic -i {f.path} -vf " + f"pad=\"width=ceil(iw/2)*2\" {dest}" + ) == 0 + ): + logging.info(f"Conversion Successful\t-\t{f.name}") + else: + logging.error(f"Conversion Failed\t\t-\t{f.name}") + elif f.path.endswith(".swf"): + logging.info(f"{f.name} already converted - Skipping") + + def extractVideo(self): + idx = json.load(open(self.indexFile)) + + for i in idx: + for j in i["instances"]: + if re.search(r"youtu\.?be", j["url"]): + src = os.path.join( + self.vd, j["video_id"] + '.yt.mp4' + ) + dst = os.path.join( + self.vd, j["video_id"] + '.mp4' + ) + if not os.path.exists(src): + continue + if os.path.exists(dst): + logging.info(f"{src} already extracted - Skipping ") + continue + + if j["frame_end"] - 1 <= 0: + shutil.copyfile(src, dst) + continue + + print(f"src: {src}") + selected_frames = extract_frame_as_video( + src, + j["frame_start"] - 1, + j["frame_end"] - 1 + ) + + size = selected_frames[0].shape[:2][::-1] + convert_frames_to_video(selected_frames, dst, size) + + def main(self): + # logging.info(">>>Converting files to mp4") + # self.convertTomp4() + # logging.info(">>>Extracting youtube videos") + # self.extractVideo() + for r, d, f in os.walk(self.vd): + print(r) + print(d) + print(f) + print("==============") if __name__ == "__main__": - main() - + preproc = Preproc() + preproc.main() diff --git a/start_kit/scripts/swf2mp4.sh b/start_kit/scripts/swf2mp4.sh index 1bcc816e92..0d76cf808d 100644 --- a/start_kit/scripts/swf2mp4.sh +++ b/start_kit/scripts/swf2mp4.sh @@ -24,7 +24,7 @@ do fi echo "${i} / ${total}, ${filename}" - + if [ ${extension} != "mp4" ]; then ffmpeg -loglevel panic -i ${src_file} -vf pad="width=ceil(iw/2)*2:height=ceil(ih/2)*2" ${dst_file} diff --git a/start_kit/video_downloader.py b/start_kit/video_downloader.py index ae363b94a8..f074fa9c9d 100644 --- a/start_kit/video_downloader.py +++ b/start_kit/video_downloader.py @@ -1,151 +1,141 @@ import os +import subprocess +import glob import json import time import sys import urllib.request +import re from multiprocessing.dummy import Pool import random import logging -logging.basicConfig(filename='download_{}.log'.format(int(time.time())), filemode='w', level=logging.DEBUG) -logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - - -def request_video(url, referer=''): - user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' - - headers = {'User-Agent': user_agent, - } - - if referer: - headers['Referer'] = referer - - request = urllib.request.Request(url, None, headers) # The assembled request - - logging.info('Requesting {}'.format(url)) - response = urllib.request.urlopen(request) - data = response.read() # The data you need - - return data - - -def save_video(data, saveto): - with open(saveto, 'wb+') as f: - f.write(data) - - # please be nice to the host - take pauses and avoid spamming - time.sleep(random.uniform(0.5, 1.5)) - - -def download_youtube(url, dirname, video_id): - raise NotImplementedError("Urllib cannot deal with YouTube links.") - - -def download_aslpro(url, dirname, video_id): - saveto = os.path.join(dirname, '{}.swf'.format(video_id)) - if os.path.exists(saveto): - logging.info('{} exists at {}'.format(video_id, saveto)) - return - - data = request_video(url, referer='http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi') - save_video(data, saveto) - - -def download_others(url, dirname, video_id): - saveto = os.path.join(dirname, '{}.mp4'.format(video_id)) - if os.path.exists(saveto): - logging.info('{} exists at {}'.format(video_id, saveto)) - return - - data = request_video(url) - save_video(data, saveto) - - -def select_download_method(url): - if 'aslpro' in url: - return download_aslpro - elif 'youtube' in url or 'youtu.be' in url: - return download_youtube - else: - return download_others - -def download_nonyt_videos(indexfile, saveto='raw_videos'): - content = json.load(open(indexfile)) - - if not os.path.exists(saveto): - os.mkdir(saveto) - - for entry in content: - gloss = entry['gloss'] - instances = entry['instances'] - - for inst in instances: - video_url = inst['url'] - video_id = inst['video_id'] - - logging.info('gloss: {}, video: {}.'.format(gloss, video_id)) - - download_method = select_download_method(video_url) - - if download_method == download_youtube: - logging.warning('Skipping YouTube video {}'.format(video_id)) - continue - - try: - download_method(video_url, saveto, video_id) - except Exception as e: - logging.error('Unsuccessful downloading - video {}'.format(video_id)) - - -def check_youtube_dl_version(): - ver = os.popen('youtube-dl --version').read() - - assert ver, "youtube-dl cannot be found in PATH. Please verify your installation." - assert ver >= '2020.03.08', "Please update youtube-dl to newest version." - - -def download_yt_videos(indexfile, saveto='raw_videos'): - content = json.load(open(indexfile)) - - if not os.path.exists(saveto): - os.mkdir(saveto) - - for entry in content: - gloss = entry['gloss'] - instances = entry['instances'] - - for inst in instances: - video_url = inst['url'] - video_id = inst['video_id'] +logging.basicConfig( + filename="videoDownloader.log", filemode="w", level=logging.DEBUG +) +logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) - if 'youtube' not in video_url and 'youtu.be' not in video_url: - continue - if os.path.exists(os.path.join(saveto, video_url[-11:] + '.mp4')) or os.path.exists(os.path.join(saveto, video_url[-11:] + '.mkv')): - logging.info('YouTube videos {} already exists.'.format(video_url)) - continue +class videoDownloader: + wordCounts = {} + + def __init__(self, idxf="WLASL_v0.3.json", vd="data", n=1, m=2000000): + self.wordCounts = {} + self.indexFile = idxf + self.max = m + self.videoDir = vd + if not os.path.exists(self.videoDir): + os.mkdir(self.videoDir) + self.size = self.updateSize() + + def updateSize(self): + self.size = int( + subprocess.check_output(["du", "-ks", self.videoDir]) + .split()[0] + .decode("utf-8") + ) + return self.size + + def request_video(self, url, referer=""): + user_agent = ( + "Mozilla/5.0" + "(Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7)" + "Gecko/2009021910 Firefox/3.0.7" + ) + + headers = { + "User-Agent": user_agent, + } + + if referer: + headers["Referer"] = referer + + # The assembled request + request = urllib.request.Request(url, None, headers) + + logging.info("Requesting {}".format(url)) + response = urllib.request.urlopen(request) + data = response.read() # The data you need + urllib.request.urlopen + + return data + + def dlPass(self, video_id): + logging.info(f"Download Successful\t-\t{video_id}") + + def dlFail(self, video_id): + logging.error(f"Download Failed\t\t-\t{video_id}") + + def download(self, inst, gloss): + rv = False + saveto = os.path.join(self.videoDir, gloss, inst["video_id"]) + if glob.glob(f"{saveto}.*"): + logging.info(f"{inst['video_id']} exists at {saveto} - Skipping") + rv = True + else: + if re.search(r"youtu\.?be", inst["url"]): + status = os.system( + f"youtube-dl \"{inst['url']}\" -o \"{saveto}.yt.%(ext)s\"" + ) + if status == 0: + self.dlPass(inst["video_id"]) + rv = True + else: + rv = False + self.dlFail(inst["video_id"]) else: - cmd = "youtube-dl \"{}\" -o \"{}%(id)s.%(ext)s\"" - cmd = cmd.format(video_url, saveto + os.path.sep) - - rv = os.system(cmd) - - if not rv: - logging.info('Finish downloading youtube video url {}'.format(video_url)) + if "aslpro" in inst["url"]: + saveto = f"{saveto}.swf" + ref = "http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi" else: - logging.error('Unsuccessful downloading - youtube video url {}'.format(video_url)) - - # please be nice to the host - take pauses and avoid spamming - time.sleep(random.uniform(1.0, 1.5)) - - -if __name__ == '__main__': - logging.info('Start downloading non-youtube videos.') - download_nonyt_videos('WLASL_v0.3.json') - - check_youtube_dl_version() - logging.info('Start downloading youtube videos.') - download_yt_videos('WLASL_v0.3.json') - + saveto = f"{saveto}.mp4" + ref = "" + dat = self.request_video(inst["url"], referer=ref) + if dat: + with open(saveto, "wb+") as f: + f.write(dat) + self.dlPass(inst["video_id"]) + rv = True + else: + self.dlFail(inst["video_id"]) + # please be nice to the host - take pauses and avoid spamming + time.sleep(random.uniform(0.3, 0.7)) + return rv + + def main(self): + idx = json.load(open(self.indexFile)) + idx = sorted(idx, key=lambda x: (len(x["instances"])), reverse=True) + + if not os.path.exists(self.videoDir): + os.mkdir(self.videoDir) + + for i in idx: + if not os.path.exists(os.path.join(self.videoDir, i["gloss"])): + os.mkdir(os.path.join(self.videoDir, i["gloss"])) + if i["gloss"] not in self.wordCounts: + self.wordCounts[i["gloss"]] = 0 + if self.updateSize() >= self.max: + logging.info("Max size reached") + break + for j in i["instances"]: + if self.updateSize() >= self.max: + break + logging.info( + f">>>GLOSS: {i['gloss']}" + f"\tvideo: {j['video_id']}" + f"\tcount: {self.wordCounts[i['gloss']]}" + ) + try: + if self.download(j, i["gloss"]): + self.wordCounts[i["gloss"]] = ( + self.wordCounts[i["gloss"]] + 1 + ) + except Exception as e: + logging.error(f"ERROR - {j['video_id']}: {e}") + + +if __name__ == "__main__": + vd = videoDownloader() + vd.main()