Commit

v0.3
Alexey Borsky committed Apr 8, 2023
1 parent 68bd68d commit 9c8d72f
Showing 3 changed files with 178 additions and 73 deletions.
1 change: 1 addition & 0 deletions RAFT
Submodule RAFT added at aac9dd
10 changes: 9 additions & 1 deletion readme.md
@@ -11,6 +11,8 @@ To install all necessary dependencies run this command
pip install opencv-python opencv-contrib-python numpy tqdm
```

Running the algorithm alongside Stable Diffusion with ControlNet at 640x640 resolution requires about 8 GB of VRAM, as [RAFT](https://github.com/princeton-vl/RAFT) (the current optical flow estimation method) takes about 3.7 GB of memory.
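
For reference, here is a minimal sketch for checking how much VRAM is free before starting (it assumes PyTorch with CUDA support is already installed, which the RAFT submodule requires anyway):

```python
import torch

# Print free / total VRAM on the default CUDA device.
if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    print(f"VRAM: {free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB")
```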

## Running the script
This script works on top of the [Automatic1111/web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) interface via its API, so you have to set that up first. You should also have the [sd-webui-controlnet](https://github.com/Mikubill/sd-webui-controlnet) extension and the control_hed-fp16 model installed. If you have web-ui with ControlNet working correctly, do the following:
1. Go to the web-ui settings -> ControlNet tab -> set the "Allow other script to control this extension" checkbox to active and set "Multi ControlNet: Max models amount (requires restart)" to more than 2 -> press "Apply settings"
@@ -19,8 +21,14 @@ This script works on top of [Automatic1111/web-ui](https://github.com/AUTOMATIC1
3. Go to the script.py file and change the main parameters (INPUT_VIDEO, OUTPUT_VIDEO, PROMPT, N_PROMPT, W, H) to the ones you need for your project. The script is pretty simple, so you may change other parameters as well, although I would recommend leaving them as is the first time.
4. Run the script with ```python3 script.py``` (a quick check that the API is reachable is sketched below)
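
Before the first run it may be worth verifying that the web-ui API is actually reachable. A minimal sketch, assuming web-ui was started with the `--api` flag on the default port 7860:

```python
import requests

# List the checkpoints the web-ui currently knows about.
resp = requests.get("http://localhost:7860/sdapi/v1/sd-models")
resp.raise_for_status()
print([m["model_name"] for m in resp.json()])
```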

## Version 0.3 changes
* The flow estimation algorithm is updated to the [RAFT](https://github.com/princeton-vl/RAFT) method.
* The difference map is now computed as the per-pixel maximum of the difference between the warped first frame and the second frame of the original video and an occlusion map derived from forward and backward flow estimation (see the sketch below).
* Added keyframe detection that eliminates ghosting artifacts between scenes.
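
Roughly, the difference map is computed like this (a simplified sketch of the diff-map part of `RAFT_estimate_flow_diff` in `script.py`; the blur and clipping steps are omitted):

```python
import numpy as np

def difference_map(warped_frame, frame2, next_flow, prev_flow):
    # Occlusion estimate: forward + backward flow should cancel out where the flow is reliable.
    occlusion = np.linalg.norm(next_flow + prev_flow, axis=2)[..., None]
    # Per-pixel colour difference between the warped previous frame and the current frame, in [0, 1].
    colour_diff = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32)) / 255
    colour_diff = colour_diff.max(axis=-1, keepdims=True)
    # Take the per-pixel maximum of the two cues and broadcast back to 3 channels.
    return np.maximum(occlusion * 0.2, colour_diff * 4).repeat(3, axis=-1)
```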

## Potential improvements
There are several ways the overall quality of the animation may be improved:
* You may process each camera position separately to get a more consistent character style and less ghosting.
* Because the quality of the video depends on how well the optical flow was estimated, it might be beneficial to use a high-frame-rate video as the source, so the flow is easier to estimate properly.
* The quality of flow estimation might be greatly improved with a more capable flow estimation model like this one: https://github.com/autonomousvision/unimatch
* It is possible to lower VRAM requirements by precomputing the flow maps beforehand (a sketch follows this list).
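
For the last point, something along these lines could work (a hypothetical sketch, not part of this commit; `compute_flow` stands for a user-supplied wrapper around the RAFT call, and the flows are saved as a compressed `.npz` file):

```python
import cv2
import numpy as np

def precompute_flows(video_path, out_path, compute_flow):
    # Estimate flow for every consecutive frame pair and save it to disk,
    # so the RAFT model does not have to stay in VRAM during stylization.
    cap = cv2.VideoCapture(video_path)
    flows = []
    ok, prev = cap.read()
    while ok:
        ok, cur = cap.read()
        if not ok:
            break
        flows.append(compute_flow(prev, cur))
        prev = cur
    cap.release()
    np.savez_compressed(out_path, *flows)
```
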
240 changes: 168 additions & 72 deletions script.py
@@ -5,92 +5,177 @@
from tqdm import tqdm
import os

INPUT_VIDEO = "video_input.mp4"
#RAFT dependencies
import sys
sys.path.append('RAFT/core')

from collections import namedtuple
import torch
import argparse
from raft import RAFT
from utils.utils import InputPadder


INPUT_VIDEO = "/media/alex/ded3efe6-5825-429d-ac89-7ded676a2b6d/media/Fallout_noir_2/benny.mp4"
OUTPUT_VIDEO = "result.mp4"

PROMPT = "The Matrix as stop motion animation with plastic figures. Clay plasticine texture, cinematic light. 4k textures, hd, hyperdetailed. Claymation, plasticine animation, clay stop motion animation."
N_PROMPT = "green face, green skin, black and white, slanted eyes, red eyes, blood eyes, deformed, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, malformed hands, blurry, ((((mutated hands and fingers)))), watermark, watermarked, oversaturated, censored, distorted hands, amputation, missing hands, obese, doubled face, double hands"
PROMPT = "RAW photo, Matthew Perry wearing suit and pants in the wasteland, cinematic light, dramatic light, (high detailed skin:1.2), 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"
N_PROMPT = "blur, blurred, unfocus, obscure, dim, fade, obscure, muddy, black and white image, old, naked, black person, green face, green skin, black and white, slanted eyes, red eyes, blood eyes, deformed, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, disgusting, poorly drawn hands, missing limb, floating limbs, disconnected limbs, malformed hands, blurry, ((((mutated hands and fingers)))), watermark, watermarked, oversaturated, censored, distorted hands, amputation, missing hands, obese, doubled face, double hands"
SEED = -1
w,h = 896, 512 # Width and height of the processed image. Note that actual image processed would be a W x 2H resolution. You should have enough VRAM to process it.
w,h = 512, 704 # Width and height of the processed image. Note that actual image processed would be a W x 2H resolution. You should have enough VRAM to process it.

START_FROM_IND = 0 # index of a frame to start a processing from. Might be helpful with long animations where you need to restart the script multiple times
SAVE_FRAMES = True # saves individual frames into 'out' folder if set True. Again might be helpful with long animations

BLUR_SIZE = (15, 15)
BLUR_SIGMA = 12

def to_b64(img):
_, buffer = cv2.imencode('.png', img)
b64img = base64.b64encode(buffer).decode("utf-8")
return b64img

class controlnetRequest():
def __init__(self, b64_cur_img, b64_hed_img, ds = 0.35, w=w, h=h, mask = None):
self.url = "http://localhost:7860/sdapi/v1/img2img"
self.body = {
"init_images": [b64_cur_img],
"mask": mask,
"mask_blur": 0,
"inpainting_fill": 1,
"inpainting_mask_invert": 0,
"prompt": PROMPT,
"negative_prompt": N_PROMPT,
"seed": SEED,
"subseed": -1,
"subseed_strength": 0,
"batch_size": 1,
"n_iter": 1,
"steps": 15,
"cfg_scale": 7,
"denoising_strength": ds,
"width": w,
"height": h,
"restore_faces": False,
"eta": 0,
"sampler_index": "DPM++ 2S a",
"control_net_enabled": True,
"alwayson_scripts": {
"ControlNet":{
"args": [
{
"input_image": b64_hed_img,
"module": "hed",
"model": "control_hed-fp16 [13fee50b]",
"weight": 1,
"resize_mode": "Just Resize",
"lowvram": False,
"processor_res": 512,
"guidance": 1,
"guessmode": False
}
]
}
},
}

def sendRequest(self):
r = requests.post(self.url, json=self.body)
return r.json()

DEVICE = 'cuda'
RAFT_model = None
def RAFT_estimate_flow_diff(frame1, frame2, frame1_styled):
global RAFT_model
if RAFT_model is None:
args = argparse.Namespace(**{
'model': 'RAFT/models/raft-things.pth',
'mixed_precision': True,
'small': False,
'alternate_corr': False,
'path': ""
})

RAFT_model = torch.nn.DataParallel(RAFT(args))
RAFT_model.load_state_dict(torch.load(args.model))

RAFT_model = RAFT_model.module
RAFT_model.to(DEVICE)
RAFT_model.eval()

with torch.no_grad():
frame1_torch = torch.from_numpy(frame1).permute(2, 0, 1).float()[None].to(DEVICE)
frame2_torch = torch.from_numpy(frame2).permute(2, 0, 1).float()[None].to(DEVICE)

padder = InputPadder(frame1_torch.shape)
image1, image2 = padder.pad(frame1_torch, frame2_torch)

# estimate and apply optical flow
_, next_flow = RAFT_model(image1, image2, iters=20, test_mode=True)
_, prev_flow = RAFT_model(image2, image1, iters=20, test_mode=True)

next_flow = next_flow[0].permute(1,2,0).cpu().numpy()
prev_flow = prev_flow[0].permute(1,2,0).cpu().numpy()

flow_map = prev_flow.copy()
h, w = flow_map.shape[:2]
flow_map[:,:,0] += np.arange(w)
flow_map[:,:,1] += np.arange(h)[:,np.newaxis]

warped_frame = cv2.remap(frame1, flow_map, None, cv2.INTER_LINEAR)
warped_frame_styled = cv2.remap(frame1_styled, flow_map, None, cv2.INTER_LINEAR)

# compute occlusion mask
fb_flow = next_flow + prev_flow
fb_norm = np.linalg.norm(fb_flow, axis=2)

occlusion_mask = fb_norm[..., None]

diff_mask = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32)) / 255
diff_mask = diff_mask.max(axis = -1, keepdims=True)

diff = np.maximum(occlusion_mask * 0.2, diff_mask * 4).repeat(3, axis = -1)
#diff = diff * 1.5

diff_blured = cv2.GaussianBlur(diff, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
diff_frame = np.clip((diff + diff_blured) * 255, 0, 255).astype(np.uint8)

return warped_frame, diff_frame, warped_frame_styled

''' # old flow estimation algorithm, might be useful later
def estimate_flow_diff(frame1, frame2, frame1_styled):
prvs = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
next = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
flow_data = cv2.calcOpticalFlowFarneback(prvs, next, None, 0.5, 3, 15, 3, 5, 1.2, 0)
h, w = flow_data.shape[:2]
flow_data = -flow_data
# estimate and apply optical flow
flow = cv2.calcOpticalFlowFarneback(prvs, next, None, 0.5, 3, 15, 3, 5, 1.2, 0)
h, w = flow.shape[:2]
flow_data = -flow
flow_data[:,:,0] += np.arange(w)
flow_data[:,:,1] += np.arange(h)[:,np.newaxis]
#map_x, map_y = cv2.convertMaps(flow_data, 0, -1, True)
warped_frame = cv2.remap(frame1, flow_data, None, cv2.INTER_LINEAR)
warped_frame_styled = cv2.remap(frame1_styled, flow_data, None, cv2.INTER_LINEAR)
diff = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32))
diff = diff.max(axis = -1, keepdims=True).repeat(3, axis = -1) * 30
diff = cv2.GaussianBlur(diff,(15,15),10,cv2.BORDER_DEFAULT)
diff_frame = np.clip(diff, 0, 255).astype(np.uint8)
# compute occlusion mask
flow_back = cv2.calcOpticalFlowFarneback(next, prvs, None, 0.5, 3, 15, 3, 5, 1.2, 0)
fb_flow = flow + flow_back
fb_norm = np.linalg.norm(fb_flow, axis=2)
occlusion_mask = fb_norm[..., None]
diff_mask = np.abs(warped_frame.astype(np.float32) - frame2.astype(np.float32)) / 255
diff_mask = diff_mask.max(axis = -1, keepdims=True)
diff = np.maximum(occlusion_mask, diff_mask).repeat(3, axis = -1)
#diff = diff * 1.5
#diff = cv2.GaussianBlur(diff, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
diff_frame = np.clip(diff * 255, 0, 255).astype(np.uint8)
return warped_frame, diff_frame, warped_frame_styled
'''

cv2.namedWindow('Out img')

# Open the input video file
input_video = cv2.VideoCapture(INPUT_VIDEO)
@@ -102,22 +187,32 @@ def estimate_flow_diff(frame1, frame2, frame1_styled):
# Create an output video file with the same fps, width, and height as the input video
output_video = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'MP4V'), fps, (w, h))

prev_frame = None

for ind in tqdm(range(total_frames)):
# Read the next frame from the input video
if not input_video.isOpened(): break
ret, init_frame = input_video.read()
ret, cur_frame = input_video.read()
if not ret: break

if ind+1 < START_FROM_IND: continue

is_keyframe = True
if prev_frame is not None:
# Compute absolute difference between current and previous frame
frames_diff = cv2.absdiff(cur_frame, prev_frame)
# Compute mean of absolute difference
mean_diff = cv2.mean(frames_diff)[0]
# Check if mean difference is above threshold
is_keyframe = mean_diff > 30

# Generate a coarse version of the current frame with the previous stylized frame as a reference image
if ind == 0:
if is_keyframe:
# Resize the frame to proper resolution
frame = cv2.resize(init_frame, (w, h))
frame = cv2.resize(cur_frame, (w, h))

# Sending request to the web-ui
data_js = controlnetRequest(to_b64(frame), to_b64(frame), 0.85, w, h, mask = None).sendRequest()
data_js = controlnetRequest(to_b64(frame), to_b64(frame), 0.65, w, h, mask = None).sendRequest()

# Convert the byte array to a NumPy array
image_bytes = base64.b64decode(data_js["images"][0])
@@ -126,13 +221,14 @@ def estimate_flow_diff(frame1, frame2, frame1_styled):
# Convert the NumPy array to a cv2 image
out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
diff_mask = out_image.copy()
diff_mask_blured = out_image.copy()
else:
# Resize the frame to proper resolution
frame = cv2.resize(init_frame, (w, h))
frame = cv2.resize(cur_frame, (w, h))
prev_frame = cv2.resize(prev_frame, (w, h))

# Sending request to the web-ui
data_js = controlnetRequest(to_b64(frame), to_b64(frame), 0.55, w, h, mask = None).sendRequest()
data_js = controlnetRequest(to_b64(frame), to_b64(frame), 0.35, w, h, mask = None).sendRequest()

# Convert the byte array to a NumPy array
image_bytes = base64.b64decode(data_js["images"][0])
@@ -142,16 +238,16 @@ def estimate_flow_diff(frame1, frame2, frame1_styled):
out_image = cv2.imdecode(np_array, cv2.IMREAD_COLOR)


_, diff_mask, warped_styled = estimate_flow_diff(prev_frame, frame, prev_frame_styled)
_, diff_mask, warped_styled = RAFT_estimate_flow_diff(prev_frame, frame, prev_frame_styled)

alpha = diff_mask.astype(np.float32) / 255.0
pr_image = out_image * alpha * 0.5 + warped_styled * (1 - alpha * 0.5)
pr_image = out_image * alpha + warped_styled * (1 - alpha)

diff = cv2.GaussianBlur(alpha * 255 * 3,(11,11),4,cv2.BORDER_DEFAULT)
diff_mask = np.clip(diff, 0, 255).astype(np.uint8)
diff_mask_blured = cv2.GaussianBlur(alpha * 255, BLUR_SIZE, BLUR_SIGMA, cv2.BORDER_DEFAULT)
diff_mask_blured = np.clip(diff_mask_blured + diff_mask, 5, 255).astype(np.uint8)

# Sending request to the web-ui
data_js = controlnetRequest(to_b64(pr_image), to_b64(frame), 0.35, w, h, mask = to_b64(diff_mask)).sendRequest()
data_js = controlnetRequest(to_b64(pr_image), to_b64(frame), 0.65, w, h, mask = to_b64(diff_mask_blured)).sendRequest()

# Convert the byte array to a NumPy array
image_bytes = base64.b64decode(data_js["images"][0])
@@ -165,14 +261,14 @@ def estimate_flow_diff(frame1, frame2, frame1_styled):
output_video.write(frame_out)

# show the last written frame - useful to catch any issue with the process
img_show = cv2.vconcat([out_image, diff_mask])
img_show = cv2.hconcat([out_image, diff_mask, diff_mask_blured])
cv2.imshow('Out img', img_show)
if cv2.waitKey(1) & 0xFF == ord('q'): exit() # press Q to close the script while processing


# Write the frame to the output video
output_video.write(frame_out)
prev_frame = init_frame.copy()
prev_frame = cur_frame.copy()
prev_frame_styled = frame_out.copy()

