From eafb1a51250b5f086a47e6fc31021068514f9bbd Mon Sep 17 00:00:00 2001
From: Kerry Vance <vancek@pelican04.eecs.oregonstate.edu>
Date: Sun, 31 Jan 2021 15:10:48 -0800
Subject: [PATCH] Rewrite preprocess and downloader as classes; add size cap

---
 start_kit/preprocess.py       | 182 ++++++++++++-----------
 start_kit/scripts/swf2mp4.sh  |   2 +-
 start_kit/video_downloader.py | 262 ++++++++++++++++------------------
 3 files changed, 224 insertions(+), 222 deletions(-)

diff --git a/start_kit/preprocess.py b/start_kit/preprocess.py
index 83b98f83c7..84820e2bd3 100644
--- a/start_kit/preprocess.py
+++ b/start_kit/preprocess.py
@@ -3,15 +3,29 @@
 # 2. Extract YouTube frames and create video instances.
 
 import os
+import sys
+import glob
 import json
 import cv2
-
 import shutil
+import re
+
+import logging
+logging.basicConfig(
+    filename="preProc.log",  # relative path: created in the working dir
+    filemode='w',  # 'w' truncates any previous log when this runs
+    level=logging.DEBUG
+)
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+
 
-def convert_everything_to_mp4():
-    cmd = 'bash scripts/swf2mp4.sh'
+def convert_frames_to_video(frame_array, path_out, size, fps=25):
+    """Encode frame_array as an mp4v video at path_out; size is (w, h)."""
+    out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
 
-    os.system(cmd)
+    for frame in frame_array:
+        out.write(frame)
+    out.release()
 
 
 def video_to_frames(video_path, size=None):
@@ -20,13 +34,14 @@ def video_to_frames(video_path, size=None):
     size -> (int, int), width, height.
     """
 
+    print(f"video_path: {video_path} size: {size}")
     cap = cv2.VideoCapture(video_path)
 
     frames = []
-    
+
     while True:
         ret, frame = cap.read()
-    
+
         if ret:
             if size:
                 frame = cv2.resize(frame, size)
@@ -39,91 +54,88 @@ def video_to_frames(video_path, size=None):
     return frames
 
 
-def convert_frames_to_video(frame_array, path_out, size, fps=25):
-    out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
-
-    for i in range(len(frame_array)):
-        # writing to a image array
-        out.write(frame_array[i])
-    out.release()
-
-
 def extract_frame_as_video(src_video_path, start_frame, end_frame):
     frames = video_to_frames(src_video_path)
 
     return frames[start_frame: end_frame+1]
 
 
-def extract_all_yt_instances(content):
-    cnt = 1
-
-    if not os.path.exists('videos'):
-        os.mkdir('videos')
-
-    for entry in content:
-        instances = entry['instances']
-
-        for inst in instances:
-            url = inst['url']
-            video_id = inst['video_id']
-
-            if 'youtube' in url or 'youtu.be' in url:
-                cnt += 1
-                
-                yt_identifier = url[-11:]
-
-                src_video_path = os.path.join('raw_videos_mp4', yt_identifier + '.mp4')
-                dst_video_path = os.path.join('videos', video_id + '.mp4')
-
-                if not os.path.exists(src_video_path):
-                    continue
-
-                if os.path.exists(dst_video_path):
-                    print('{} exists.'.format(dst_video_path))
-                    continue
-
-                # because the JSON file indexes from 1.
-                start_frame = inst['frame_start'] - 1
-                end_frame = inst['frame_end'] - 1
-
-                if end_frame <= 0:
-                    shutil.copyfile(src_video_path, dst_video_path)
-                    continue
-
-                selected_frames = extract_frame_as_video(src_video_path, start_frame, end_frame)
-                
-                # when OpenCV reads an image, it returns size in (h, w, c)
-                # when OpenCV creates a writer, it requres size in (w, h).
-                size = selected_frames[0].shape[:2][::-1]
-                
-                convert_frames_to_video(selected_frames, dst_video_path, size)
-
-                print(cnt, dst_video_path)
-            else:
-                cnt += 1
-
-                src_video_path = os.path.join('raw_videos_mp4', video_id + '.mp4')
-                dst_video_path = os.path.join('videos', video_id + '.mp4')
-
-                if os.path.exists(dst_video_path):
-                    print('{} exists.'.format(dst_video_path))
-                    continue
-
-                if not os.path.exists(src_video_path):
-                    continue
-
-                print(cnt, dst_video_path)
-                shutil.copyfile(src_video_path, dst_video_path)
-
-        
-def main():
-    # 1. Convert .swf, .mkv file to mp4.
-    convert_everything_to_mp4()
-
-    content = json.load(open('WLASL_v0.3.json'))
-    extract_all_yt_instances(content)
+class Preproc:
+    """Convert downloaded clips to mp4 and cut out annotated YouTube spans.
+
+    idxf is the WLASL JSON index; videoDir is the download root, walked
+    recursively because video_downloader.py saves into per-gloss folders.
+    """
+
+    def __init__(self, idxf="WLASL_v0.3.json", videoDir="data"):
+        self.indexFile = idxf
+        self.vd = videoDir
+
+    def convertTomp4(self):
+        """Transcode each non-mp4 file under the video dir to a sibling mp4."""
+        for root, _, names in os.walk(self.vd):
+            for name in names:
+                src = os.path.join(root, name)
+                dest = os.path.splitext(src)[0] + '.mp4'
+                if name.endswith(".mp4"):
+                    continue
+                if os.path.exists(dest):
+                    logging.info(f"{name} already converted - Skipping")
+                    continue
+                # pad width and height to even sizes (as swf2mp4.sh does)
+                status = os.system(
+                    f"ffmpeg -loglevel panic -i \"{src}\" -vf "
+                    f"pad=\"width=ceil(iw/2)*2:height=ceil(ih/2)*2\" "
+                    f"\"{dest}\""
+                )
+                if status == 0:
+                    logging.info(f"Conversion Successful\t-\t{name}")
+                else:
+                    logging.error(f"Conversion Failed\t\t-\t{name}")
+
+    def extractVideo(self):
+        """Cut each YouTube instance's frame range out of its .yt.mp4 file."""
+        with open(self.indexFile) as fh:
+            idx = json.load(fh)
+
+        for i in idx:
+            # downloads live in <videoDir>/<gloss>/; fall back to a flat dir
+            folder = os.path.join(self.vd, i["gloss"])
+            if not os.path.isdir(folder):
+                folder = self.vd
+            for j in i["instances"]:
+                if not re.search(r"youtu\.?be", j["url"]):
+                    continue
+                src = os.path.join(folder, j["video_id"] + '.yt.mp4')
+                dst = os.path.join(folder, j["video_id"] + '.mp4')
+                if not os.path.exists(src):
+                    continue
+                if os.path.exists(dst):
+                    logging.info(f"{src} already extracted - Skipping ")
+                    continue
+                # index frames are 1-based; frame_end <= 1 copies the file
+                if j["frame_end"] - 1 <= 0:
+                    shutil.copyfile(src, dst)
+                    continue
+                selected_frames = extract_frame_as_video(
+                    src, j["frame_start"] - 1, j["frame_end"] - 1
+                )
+                if not selected_frames:
+                    logging.error(f"No frames read from {src} - Skipping")
+                    continue
+
+                # frames are shaped (h, w, c); the writer wants (w, h)
+                size = selected_frames[0].shape[:2][::-1]
+                convert_frames_to_video(selected_frames, dst, size)
+
+    def main(self):
+        """Run the pipeline: convert sources to mp4, then extract clips."""
+        logging.info(">>>Converting files to mp4")
+        self.convertTomp4()
+        logging.info(">>>Extracting youtube videos")
+        self.extractVideo()
 
 
 if __name__ == "__main__":
-    main()
-
+    preproc = Preproc()  # default index file and video directory
+    preproc.main()
diff --git a/start_kit/scripts/swf2mp4.sh b/start_kit/scripts/swf2mp4.sh
index 1bcc816e92..0d76cf808d 100644
--- a/start_kit/scripts/swf2mp4.sh
+++ b/start_kit/scripts/swf2mp4.sh
@@ -24,7 +24,7 @@ do
     fi
 
     echo "${i} / ${total}, ${filename}"
-    
+
     if [ ${extension} != "mp4" ];
     then
 	    ffmpeg -loglevel panic -i ${src_file} -vf pad="width=ceil(iw/2)*2:height=ceil(ih/2)*2" ${dst_file}
diff --git a/start_kit/video_downloader.py b/start_kit/video_downloader.py
index ae363b94a8..f074fa9c9d 100644
--- a/start_kit/video_downloader.py
+++ b/start_kit/video_downloader.py
@@ -1,151 +1,141 @@
 import os
+import subprocess
+import glob
 import json
 import time
 import sys
 import urllib.request
+import re
 from multiprocessing.dummy import Pool
 
 import random
 
 import logging
-logging.basicConfig(filename='download_{}.log'.format(int(time.time())), filemode='w', level=logging.DEBUG)
-logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
-
-
-def request_video(url, referer=''):
-    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
-
-    headers = {'User-Agent': user_agent,
-               }
-    
-    if referer:
-        headers['Referer'] = referer
-
-    request = urllib.request.Request(url, None, headers)  # The assembled request
-
-    logging.info('Requesting {}'.format(url))
-    response = urllib.request.urlopen(request)
-    data = response.read()  # The data you need
-
-    return data
-
-
-def save_video(data, saveto):
-    with open(saveto, 'wb+') as f:
-        f.write(data)
-
-    # please be nice to the host - take pauses and avoid spamming
-    time.sleep(random.uniform(0.5, 1.5))
-
-
-def download_youtube(url, dirname, video_id):
-    raise NotImplementedError("Urllib cannot deal with YouTube links.")
-
-
-def download_aslpro(url, dirname, video_id):
-    saveto = os.path.join(dirname, '{}.swf'.format(video_id))
-    if os.path.exists(saveto):
-        logging.info('{} exists at {}'.format(video_id, saveto))
-        return 
-
-    data = request_video(url, referer='http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi')
-    save_video(data, saveto)
-
-
-def download_others(url, dirname, video_id):
-    saveto = os.path.join(dirname, '{}.mp4'.format(video_id))
-    if os.path.exists(saveto):
-        logging.info('{} exists at {}'.format(video_id, saveto))
-        return 
-    
-    data = request_video(url)
-    save_video(data, saveto)
-
-
-def select_download_method(url):
-    if 'aslpro' in url:
-        return download_aslpro
-    elif 'youtube' in url or 'youtu.be' in url:
-        return download_youtube
-    else:
-        return download_others
-
 
-def download_nonyt_videos(indexfile, saveto='raw_videos'):
-    content = json.load(open(indexfile))
-
-    if not os.path.exists(saveto):
-        os.mkdir(saveto)
-
-    for entry in content:
-        gloss = entry['gloss']
-        instances = entry['instances']
-
-        for inst in instances:
-            video_url = inst['url']
-            video_id = inst['video_id']
-            
-            logging.info('gloss: {}, video: {}.'.format(gloss, video_id))
-
-            download_method = select_download_method(video_url)    
-            
-            if download_method == download_youtube:
-                logging.warning('Skipping YouTube video {}'.format(video_id))
-                continue
-
-            try:
-                download_method(video_url, saveto, video_id)
-            except Exception as e:
-                logging.error('Unsuccessful downloading - video {}'.format(video_id))
-
-
-def check_youtube_dl_version():
-    ver = os.popen('youtube-dl --version').read()
-
-    assert ver, "youtube-dl cannot be found in PATH. Please verify your installation."
-    assert ver >= '2020.03.08', "Please update youtube-dl to newest version."
-
-
-def download_yt_videos(indexfile, saveto='raw_videos'):
-    content = json.load(open(indexfile))
-    
-    if not os.path.exists(saveto):
-        os.mkdir(saveto)
-    
-    for entry in content:
-        gloss = entry['gloss']
-        instances = entry['instances']
-
-        for inst in instances:
-            video_url = inst['url']
-            video_id = inst['video_id']
+logging.basicConfig(
+    filename="videoDownloader.log", filemode="w", level=logging.DEBUG
+)  # fixed file name + filemode "w": each run overwrites the previous log
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
 
-            if 'youtube' not in video_url and 'youtu.be' not in video_url:
-                continue
 
-            if os.path.exists(os.path.join(saveto, video_url[-11:] + '.mp4')) or os.path.exists(os.path.join(saveto, video_url[-11:] + '.mkv')):
-                logging.info('YouTube videos {} already exists.'.format(video_url))
-                continue
+class videoDownloader:
+    wordCounts = {}
+
+    def __init__(self, idxf="WLASL_v0.3.json", vd="data", n=1, m=2000000):
+        """idxf: JSON index; vd: download dir; m: size cap in KiB; n: unused."""
+        self.wordCounts = {}  # successful downloads per gloss
+        self.indexFile = idxf
+        self.max = m  # compared against the `du -ks` total of videoDir
+        self.videoDir = vd
+        os.makedirs(self.videoDir, exist_ok=True)  # creates parents as well
+        self.size = self.updateSize()
+
+    def updateSize(self):
+        """Refresh and return self.size from `du -ks` run on videoDir."""
+        du_out = subprocess.check_output(["du", "-ks", self.videoDir])
+        # the first whitespace-separated field of du's output is the total
+        size_field = du_out.split()[0]
+        self.size = int(size_field.decode("utf-8"))
+        return self.size
+
+    def request_video(self, url, referer=""):
+        """Fetch url with a browser User-Agent and return the body as bytes.
+
+        referer, when non-empty, is sent as the Referer header.
+        """
+        # adjacent literals are concatenated, so each piece carries the
+        # space that separates it from the next product token
+        user_agent = (
+            "Mozilla/5.0 "
+            "(Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) "
+            "Gecko/2009021910 Firefox/3.0.7"
+        )
+
+        headers = {"User-Agent": user_agent}
+        if referer:
+            headers["Referer"] = referer
+
+        request = urllib.request.Request(url, None, headers)
+
+        logging.info("Requesting {}".format(url))
+        # close the connection as soon as the body has been read
+        with urllib.request.urlopen(request) as response:
+            return response.read()
+
+    def dlPass(self, video_id):  # log a successful download at INFO level
+        logging.info(f"Download Successful\t-\t{video_id}")
+
+    def dlFail(self, video_id):  # log a failed download at ERROR level
+        logging.error(f"Download Failed\t\t-\t{video_id}")
+
+    def download(self, inst, gloss):
+        """Download inst into its gloss folder; True if fetched or present."""
+        rv = False
+        saveto = os.path.join(self.videoDir, gloss, inst["video_id"])
+        if glob.glob(f"{saveto}.*"):
+            logging.info(f"{inst['video_id']} exists at {saveto} - Skipping")
+            rv = True
+        else:
+            if re.search(r"youtu\.?be", inst["url"]):
+                status = subprocess.call(  # argv list: URL never hits a shell
+                    ["youtube-dl", inst["url"], "-o", f"{saveto}.yt.%(ext)s"]
+                )
+                if status == 0:
+                    self.dlPass(inst["video_id"])
+                    rv = True
+                else:
+                    self.dlFail(inst["video_id"])
             else:
-                cmd = "youtube-dl \"{}\" -o \"{}%(id)s.%(ext)s\""
-                cmd = cmd.format(video_url, saveto + os.path.sep)
-
-                rv = os.system(cmd)
-                
-                if not rv:
-                    logging.info('Finish downloading youtube video url {}'.format(video_url))
+                if "aslpro" in inst["url"]:
+                    saveto = f"{saveto}.swf"
+                    ref = "http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi"
                 else:
-                    logging.error('Unsuccessful downloading - youtube video url {}'.format(video_url))
-
-                # please be nice to the host - take pauses and avoid spamming
-                time.sleep(random.uniform(1.0, 1.5))
-    
-
-if __name__ == '__main__':
-    logging.info('Start downloading non-youtube videos.')
-    download_nonyt_videos('WLASL_v0.3.json')
-
-    check_youtube_dl_version()
-    logging.info('Start downloading youtube videos.')
-    download_yt_videos('WLASL_v0.3.json')
-
+                    saveto = f"{saveto}.mp4"
+                    ref = ""
+                dat = self.request_video(inst["url"], referer=ref)
+                if dat:
+                    with open(saveto, "wb+") as f:
+                        f.write(dat)
+                    self.dlPass(inst["video_id"])
+                    rv = True
+                else:
+                    self.dlFail(inst["video_id"])
+            # please be nice to the host - take pauses and avoid spamming
+            time.sleep(random.uniform(0.3, 0.7))
+        return rv
+
+    def main(self):
+        """Download glosses, largest first, until videoDir reaches self.max.
+
+        Failures are logged and skipped; wordCounts tallies successes.
+        """
+        with open(self.indexFile) as fh:
+            idx = json.load(fh)
+        idx = sorted(idx, key=lambda x: len(x["instances"]), reverse=True)
+
+        for i in idx:
+            gloss = i["gloss"]
+            if self.updateSize() >= self.max:
+                logging.info("Max size reached")
+                break
+            os.makedirs(os.path.join(self.videoDir, gloss), exist_ok=True)
+            self.wordCounts.setdefault(gloss, 0)
+            for j in i["instances"]:
+                # re-measure before every download so the cap is honoured
+                if self.updateSize() >= self.max:
+                    break
+                logging.info(
+                    f">>>GLOSS: {gloss}"
+                    f"\tvideo: {j['video_id']}"
+                    f"\tcount: {self.wordCounts[gloss]}"
+                )
+                try:
+                    if self.download(j, gloss):
+                        self.wordCounts[gloss] += 1
+                except Exception as e:
+                    logging.error(f"ERROR - {j['video_id']}: {e}")
+
+
+if __name__ == "__main__":  # script entry point: run with the defaults
+    vd = videoDownloader()
+    vd.main()