dxli94 · Vancelot11 · Jan 31, 2021
diff --git a/start_kit/preprocess.py b/start_kit/preprocess.py
@@ -3,15 +3,29 @@
 # 2. Extract YouTube frames and create video instances.
 
 import os
+import sys
+import glob
 import json
 import cv2
-
 import shutil
+import re
+
+import logging
+logging.basicConfig(  # root logger: file output; a stdout handler is added after
+    filename="preProc.log",  # relative to the current working directory
+    filemode='w',  # truncate the log file on every run
+    level=logging.DEBUG
+)
+logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
+
 
-def convert_everything_to_mp4():
-    cmd = 'bash scripts/swf2mp4.sh'
+def convert_frames_to_video(frame_array, path_out, size, fps=25):
+    """Write frame_array to path_out as an mp4v video; size is (w, h)."""
 
-    os.system(cmd)
+    out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
+    for frame in frame_array:
+        out.write(frame)
+    out.release()
 
 
 def video_to_frames(video_path, size=None):
@@ -20,13 +34,14 @@ def video_to_frames(video_path, size=None):
     size -> (int, int), width, height.
     """
 
+    print(f"video_path: {video_path} size: {size}")
     cap = cv2.VideoCapture(video_path)
 
     frames = []
-    
+
     while True:
         ret, frame = cap.read()
-    
+
         if ret:
             if size:
                 frame = cv2.resize(frame, size)
@@ -39,91 +54,88 @@ def video_to_frames(video_path, size=None):
     return frames
 
 
-def convert_frames_to_video(frame_array, path_out, size, fps=25):
-    out = cv2.VideoWriter(path_out, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
-
-    for i in range(len(frame_array)):
-        # writing to a image array
-        out.write(frame_array[i])
-    out.release()
-
-
 def extract_frame_as_video(src_video_path, start_frame, end_frame):
     frames = video_to_frames(src_video_path)
 
     return frames[start_frame: end_frame+1]
 
 
-def extract_all_yt_instances(content):
-    cnt = 1
-
-    if not os.path.exists('videos'):
-        os.mkdir('videos')
-
-    for entry in content:
-        instances = entry['instances']
-
-        for inst in instances:
-            url = inst['url']
-            video_id = inst['video_id']
-
-            if 'youtube' in url or 'youtu.be' in url:
-                cnt += 1
-
-                yt_identifier = url[-11:]
-
-                src_video_path = os.path.join('raw_videos_mp4', yt_identifier + '.mp4')
-                dst_video_path = os.path.join('videos', video_id + '.mp4')
-
-                if not os.path.exists(src_video_path):
-                    continue
-
-                if os.path.exists(dst_video_path):
-                    print('{} exists.'.format(dst_video_path))
-                    continue
-
-                # because the JSON file indexes from 1.
-                start_frame = inst['frame_start'] - 1
-                end_frame = inst['frame_end'] - 1
-
-                if end_frame <= 0:
-                    shutil.copyfile(src_video_path, dst_video_path)
-                    continue
-
-                selected_frames = extract_frame_as_video(src_video_path, start_frame, end_frame)
-
-                # when OpenCV reads an image, it returns size in (h, w, c)
-                # when OpenCV creates a writer, it requres size in (w, h).
-                size = selected_frames[0].shape[:2][::-1]
-
-                convert_frames_to_video(selected_frames, dst_video_path, size)
-
-                print(cnt, dst_video_path)
-            else:
-                cnt += 1
-
-                src_video_path = os.path.join('raw_videos_mp4', video_id + '.mp4')
-                dst_video_path = os.path.join('videos', video_id + '.mp4')
-
-                if os.path.exists(dst_video_path):
-                    print('{} exists.'.format(dst_video_path))
-                    continue
-
-                if not os.path.exists(src_video_path):
-                    continue
-
-                print(cnt, dst_video_path)
-                shutil.copyfile(src_video_path, dst_video_path)
-
-
-def main():
-    # 1. Convert .swf, .mkv file to mp4.
-    convert_everything_to_mp4()
-
-    content = json.load(open('WLASL_v0.3.json'))
-    extract_all_yt_instances(content)
+class Preproc:
+    """Convert downloaded videos to mp4 and cut out YouTube instances."""
+
+    def __init__(self, idxf="WLASL_v0.3.json", videoDir="data"):
+        self.indexFile = idxf  # path of the index JSON read by extractVideo
+        self.vd = videoDir  # directory that is both read and written
+
+    def convertTomp4(self):
+        """Convert non-mp4 entries of the video directory with ffmpeg."""
+        import subprocess  # local: the file's import block stays untouched
+
+        for f in os.scandir(self.vd):
+            dest = os.path.join(self.vd,
+                                os.path.splitext(f.name)[0] + '.mp4')
+            if not f.path.endswith(".mp4") and not os.path.exists(dest):
+                # Argument list, not a shell string: names with spaces or
+                # metacharacters reach ffmpeg intact.  Width and height
+                # are both padded to even, as scripts/swf2mp4.sh does.
+                cmd = ["ffmpeg", "-loglevel", "panic", "-i", f.path, "-vf",
+                       "pad=width=ceil(iw/2)*2:height=ceil(ih/2)*2", dest]
+                try:
+                    ok = subprocess.run(cmd).returncode == 0
+                except OSError:  # e.g. ffmpeg is not installed
+                    ok = False
+                if ok:
+                    logging.info(f"Conversion Successful\t-\t{f.name}")
+                else:
+                    logging.error(f"Conversion Failed\t\t-\t{f.name}")
+            elif f.path.endswith(".swf"):
+                logging.info(f"{f.name} already converted - Skipping")
+
+    def extractVideo(self):
+        """Write each YouTube instance's frame range to <video_id>.mp4."""
+        with open(self.indexFile) as fh:  # closed even if parsing fails
+            idx = json.load(fh)
+
+        for i in idx:
+            for j in i["instances"]:
+                if not re.search(r"youtu\.?be", j["url"]):
+                    continue
+                src = os.path.join(self.vd, j["video_id"] + '.yt.mp4')
+                dst = os.path.join(self.vd, j["video_id"] + '.mp4')
+                if not os.path.exists(src):
+                    continue
+                if os.path.exists(dst):
+                    logging.info(f"{src} already extracted - Skipping ")
+                    continue
+                # A shifted end frame <= 0 copies the source unchanged.
+                if j["frame_end"] - 1 <= 0:
+                    shutil.copyfile(src, dst)
+                    continue
+                logging.debug(f"src: {src}")
+                selected_frames = extract_frame_as_video(
+                    src, j["frame_start"] - 1, j["frame_end"] - 1)
+                if not selected_frames:
+                    # No frames read; [0] below would raise IndexError.
+                    logging.error(f"No frames extracted\t-\t{src}")
+                    continue
+                # Frame shape is (h, w, c); the writer takes (w, h).
+                size = selected_frames[0].shape[:2][::-1]
+                convert_frames_to_video(selected_frames, dst, size)
+
+    def main(self):
+        """Print the directory tree below the video directory."""
+        # NOTE(review): the pipeline steps below are still disabled.
+        # logging.info(">>>Converting files to mp4")
+        # self.convertTomp4()
+        # logging.info(">>>Extracting youtube videos")
+        # self.extractVideo()
+        for r, d, f in os.walk(self.vd):
+            print(r)
+            print(d)
+            print(f)
+            print("==============")
 
 
 if __name__ == "__main__":
-    main()
-
+    preproc = Preproc()  # defaults: "WLASL_v0.3.json" index, "data" directory
+    preproc.main()
diff --git a/start_kit/scripts/swf2mp4.sh b/start_kit/scripts/swf2mp4.sh
@@ -24,7 +24,7 @@ do
     fi
 
     echo "${i} / ${total}, ${filename}"
-    
+
     if [ ${extension} != "mp4" ];
     then
 	    ffmpeg -loglevel panic -i ${src_file} -vf pad="width=ceil(iw/2)*2:height=ceil(ih/2)*2" ${dst_file}