Commit: v0.7

volotat committed May 7, 2023
1 parent a553de3 commit 4adcaf0
Showing 9 changed files with 1,001 additions and 566 deletions.
143 changes: 143 additions & 0 deletions FloweR/model.py
@@ -0,0 +1,143 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the model
class FloweR(nn.Module):
def __init__(self, input_size = (384, 384), window_size = 4):
super(FloweR, self).__init__()

self.input_size = input_size
self.window_size = window_size

        # INPUT: 384 x 384 x (window_size * 3)

### DOWNSCALE ###
self.conv_block_1 = nn.Sequential(
nn.Conv2d(3 * self.window_size, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 384 x 384 x 128

self.conv_block_2 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 192 x 192 x 128

self.conv_block_3 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 96 x 96 x 128

self.conv_block_4 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 48 x 48 x 128

self.conv_block_5 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 24 x 24 x 128

self.conv_block_6 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 12 x 12 x 128

self.conv_block_7 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 6 x 6 x 128

self.conv_block_8 = nn.Sequential(
nn.AvgPool2d(2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 3 x 3 x 128

### UPSCALE ###
self.conv_block_9 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 6 x 6 x 128

self.conv_block_10 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 12 x 12 x 128

self.conv_block_11 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 24 x 24 x 128

self.conv_block_12 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 48 x 48 x 128

self.conv_block_13 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 96 x 96 x 128

self.conv_block_14 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 192 x 192 x 128

self.conv_block_15 = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same'),
nn.ReLU(),
) # 384 x 384 x 128

self.conv_block_16 = nn.Conv2d(128, 3, kernel_size=3, stride=1, padding='same')

def forward(self, x):
if x.size(1) != self.window_size:
            raise Exception(f'Input shape is not compatible: there should be exactly {self.window_size} frames in the input video window.')

# batch, frames, height, width, colors
in_x = x.permute((0, 1, 4, 2, 3))
# batch, frames, colors, height, width

in_x = in_x.reshape(-1, self.window_size * 3, self.input_size[0], self.input_size[1])

### DOWNSCALE ###
block_1_out = self.conv_block_1(in_x) # 384 x 384 x 128
block_2_out = self.conv_block_2(block_1_out) # 192 x 192 x 128
block_3_out = self.conv_block_3(block_2_out) # 96 x 96 x 128
block_4_out = self.conv_block_4(block_3_out) # 48 x 48 x 128
block_5_out = self.conv_block_5(block_4_out) # 24 x 24 x 128
block_6_out = self.conv_block_6(block_5_out) # 12 x 12 x 128
block_7_out = self.conv_block_7(block_6_out) # 6 x 6 x 128
block_8_out = self.conv_block_8(block_7_out) # 3 x 3 x 128

### UPSCALE ###
block_9_out = block_7_out + self.conv_block_9(block_8_out) # 6 x 6 x 128
block_10_out = block_6_out + self.conv_block_10(block_9_out) # 12 x 12 x 128
block_11_out = block_5_out + self.conv_block_11(block_10_out) # 24 x 24 x 128
block_12_out = block_4_out + self.conv_block_12(block_11_out) # 48 x 48 x 128
block_13_out = block_3_out + self.conv_block_13(block_12_out) # 96 x 96 x 128
block_14_out = block_2_out + self.conv_block_14(block_13_out) # 192 x 192 x 128
block_15_out = block_1_out + self.conv_block_15(block_14_out) # 384 x 384 x 128

block_16_out = self.conv_block_16(block_15_out) # 384 x 384 x (2 + 1)
out = block_16_out.reshape(-1, 3, self.input_size[0], self.input_size[1])

# batch, colors, height, width
out = out.permute((0, 2, 3, 1))
# batch, height, width, colors
return out
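
For reference, a minimal sketch of how the model above might be exercised. This is a hedged example: random tensors stand in for real video frames, which are assumed to be resized to input_size and normalized elsewhere, and the import assumes the repository root is on the Python path.

import torch
from FloweR.model import FloweR  # assumes FloweR/ is importable as a package

model = FloweR(input_size=(384, 384), window_size=4)
model.eval()

# batch, frames, height, width, colors
frames = torch.randn(1, 4, 384, 384, 3)
with torch.no_grad():
    pred = model(frames)

print(pred.shape)  # torch.Size([1, 384, 384, 3]) -- batch, height, width, colors

The output keeps the channels-last layout of the input; the three output channels presumably correspond to the flow and occlusion information suggested by the '(2 + 1)' comment in conv_block_16's output shape.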
Binary file added examples/ui_preview.png
17 changes: 10 additions & 7 deletions readme.md
@@ -1,6 +1,10 @@
# SD-CN-Animation
This project allows you to automate video stylization tasks using StableDiffusion and ControlNet. It also lets you generate completely new videos from text, at any resolution and length and with any Stable Diffusion model as a backbone (including custom ones), in contrast to other current text2video methods. It uses the '[RAFT](https://github.com/princeton-vl/RAFT)' optical flow estimation algorithm to keep the animation stable and to create an occlusion mask that is used to generate the next frame. In text-to-video mode it relies on the 'FloweR' method (work in progress), which predicts optical flow from the previous frames.
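
As a rough illustration of this idea (a hedged sketch, not the extension's actual code): the previous stylized frame can be warped with the estimated flow, and a simple forward-backward consistency check can mark the occluded pixels that need to be re-generated. The function names and threshold below are made up for the example.

import cv2
import numpy as np

def warp_with_flow(image, flow):
    # Backward warping: sample `image` at (x + dx, y + dy) for every pixel,
    # where flow[..., 0] = dx and flow[..., 1] = dy (flow mapping the new
    # frame's positions back to the previous frame).
    h, w = flow.shape[:2]
    grid_x, grid_y = np.meshgrid(np.arange(w), np.arange(h))
    map_x = (grid_x + flow[..., 0]).astype(np.float32)
    map_y = (grid_y + flow[..., 1]).astype(np.float32)
    return cv2.remap(image, map_x, map_y, interpolation=cv2.INTER_LINEAR)

def occlusion_mask(flow_fwd, flow_bwd, threshold=1.0):
    # Forward-backward consistency check: pixels whose forward and backward
    # flows do not roughly cancel out are treated as occluded and have to be
    # re-generated by the diffusion model.
    flow_bwd_warped = warp_with_flow(flow_bwd, flow_fwd)
    err = np.linalg.norm(flow_fwd + flow_bwd_warped, axis=-1)
    return (err > threshold).astype(np.uint8) * 255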


![sd-cn-animation ui preview](examples/ui_preview.png)

### Video to Video Examples:
</table>
<table class="center">
@@ -46,11 +50,10 @@ Examples presented are generated at 1024x576 resolution using the 'realisticVisi
All examples shown here were originally generated at 512x512 resolution using the 'sd-v1-5-inpainting' model as a base. They were downsized and compressed for better loading speed. You can see them in their original quality in the 'examples' folder. The actual prompts used follow the format "RAW photo, {subject}, 8k uhd, dslr, soft lighting, high quality, film grain, Fujifilm XT3"; only the 'subject' part is described in the table above.

## Installing the extension
To install the extension, go to the 'Extensions' tab in [Automatic1111 web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui), then to the 'Install from URL' tab. In the 'URL for extension's git repository' field enter the path to this repository, i.e. 'https://github.com/volotat/SD-CN-Animation.git'. Leave the 'Local directory name' field empty, then press the 'Install' button. Download the RAFT 'raft-things.pth' model from here: [Google Drive](https://drive.google.com/drive/folders/1sWDsfuZ3Up38EUQt7-JDTT1HcGHuJgvT) and place it into the 'stable-diffusion-webui/models/RAFT/' folder. Restart the web-ui; a new 'SD-CN-Animation' tab should appear. All generated videos will be saved into the 'stable-diffusion-webui/outputs/sd-cn-animation' folder.

## Last version changes: v0.6
* Complete rewrite of the project to make it installable as an Automatic1111/web-ui extension.
* Added flow normalization before resizing, so the magnitude of the flow is computed correctly at different resolutions.
* Less ghosting and color drift in vid2vid mode.
* Added a "warped styled frame fix" in vid2vid mode that removes duplication artifacts from parts of the image that cannot be recovered from the optical flow.
To install the extension, go to the 'Extensions' tab in [Automatic1111 web-ui](https://github.com/AUTOMATIC1111/stable-diffusion-webui), then to the 'Install from URL' tab. In the 'URL for extension's git repository' field enter the path to this repository, i.e. 'https://github.com/volotat/SD-CN-Animation.git'. Leave the 'Local directory name' field empty, then press the 'Install' button. Restart the web-ui; a new 'SD-CN-Animation' tab should appear. All generated videos will be saved into the 'stable-diffusion-webui/outputs/sd-cn-animation' folder.

## Last version changes: v0.7
* Text-to-video mode added to the extension.
* The 'Generate' button is now automatically disabled while a video is being generated.
* Added an 'Interrupt' button that allows stopping the video generation process.
* All necessary models are now downloaded automatically; no manual preparation is needed.
150 changes: 67 additions & 83 deletions scripts/base_ui.py
@@ -13,6 +13,7 @@
sys.path.extend([scripts_path_fix])

import gradio as gr
import modules
from types import SimpleNamespace

from modules import script_callbacks, shared
@@ -26,8 +27,7 @@
from modules.sd_samplers import samplers_for_img2img
from modules.ui import setup_progressbar, create_sampler_and_steps_selection, ordered_ui_categories, create_output_panel


from vid2vid import *
from core import vid2vid, txt2vid, utils

def V2VArgs():
seed = -1
@@ -75,41 +75,59 @@ def setup_common_values(mode, d):
def inputs_ui():
v2v_args = SimpleNamespace(**V2VArgs())
t2v_args = SimpleNamespace(**T2VArgs())
with gr.Tab('vid2vid') as tab_vid2vid:
with gr.Row():
gr.HTML('Put your video here')
with gr.Row():
vid2vid_file = gr.File(label="Input video", interactive=True, file_count="single", file_types=["video"], elem_id="vid_to_vid_chosen_file")
#init_img = gr.Image(label="Image for img2img", elem_id="img2img_image", show_label=False, source="upload", interactive=True, type="pil", image_mode="RGBA")
#with gr.Row():
# gr.HTML('Alternative: enter the relative (to the webui) path to the file')
#with gr.Row():
# vid2vid_frames_path = gr.Textbox(label="Input video path", interactive=True, elem_id="vid_to_vid_chosen_path", placeholder='Enter your video path here, or upload in the box above ^')

width, height, prompt, n_prompt, cfg_scale, seed, processing_strength, fix_frame_strength = setup_common_values('vid2vid', v2v_args)

with FormRow(elem_id=f"sampler_selection_v2v"):
sampler_index = gr.Dropdown(label='Sampling method', elem_id=f"v2v_sampling", choices=[x.name for x in samplers_for_img2img], value=samplers_for_img2img[0].name, type="index")
steps = gr.Slider(minimum=1, maximum=150, step=1, elem_id=f"v2v_steps", label="Sampling steps", value=15)

with FormRow(elem_id="vid2vid_override_settings_row") as row:
override_settings = create_override_settings_dropdown("vid2vid", row)

with FormGroup(elem_id=f"script_container"):
custom_inputs = scripts.scripts_img2img.setup_ui()
#with gr.Row():
# strength = gr.Slider(label="denoising strength", value=d.strength, minimum=0, maximum=1, step=0.05, interactive=True)
# vid2vid_startFrame=gr.Number(label='vid2vid start frame',value=d.vid2vid_startFrame)
with gr.Tabs():
sdcn_process_mode = gr.State(value='vid2vid')

with gr.Tab('vid2vid') as tab_vid2vid:
with gr.Row():
gr.HTML('Put your video here')
with gr.Row():
v2v_file = gr.File(label="Input video", interactive=True, file_count="single", file_types=["video"], elem_id="vid_to_vid_chosen_file")
#init_img = gr.Image(label="Image for img2img", elem_id="img2img_image", show_label=False, source="upload", interactive=True, type="pil", image_mode="RGBA")
#with gr.Row():
# gr.HTML('Alternative: enter the relative (to the webui) path to the file')
#with gr.Row():
# vid2vid_frames_path = gr.Textbox(label="Input video path", interactive=True, elem_id="vid_to_vid_chosen_path", placeholder='Enter your video path here, or upload in the box above ^')

v2v_width, v2v_height, v2v_prompt, v2v_n_prompt, v2v_cfg_scale, v2v_seed, v2v_processing_strength, v2v_fix_frame_strength = setup_common_values('vid2vid', v2v_args)

with FormRow(elem_id=f"sampler_selection_v2v"):
v2v_sampler_index = gr.Dropdown(label='Sampling method', elem_id=f"v2v_sampling", choices=[x.name for x in samplers_for_img2img], value=samplers_for_img2img[0].name, type="index")
v2v_steps = gr.Slider(minimum=1, maximum=150, step=1, elem_id=f"v2v_steps", label="Sampling steps", value=15)

with FormRow(elem_id="vid2vid_override_settings_row") as row:
v2v_override_settings = create_override_settings_dropdown("vid2vid", row)

with FormGroup(elem_id=f"script_container"):
v2v_custom_inputs = scripts.scripts_img2img.setup_ui()
#with gr.Row():
# strength = gr.Slider(label="denoising strength", value=d.strength, minimum=0, maximum=1, step=0.05, interactive=True)
# vid2vid_startFrame=gr.Number(label='vid2vid start frame',value=d.vid2vid_startFrame)

with gr.Tab('txt2vid') as tab_txt2vid:
t2v_width, t2v_height, t2v_prompt, t2v_n_prompt, t2v_cfg_scale, t2v_seed, t2v_processing_strength, t2v_fix_frame_strength = setup_common_values('txt2vid', t2v_args)
with gr.Row():
t2v_length = gr.Slider(label='Length (in frames)', minimum=10, maximum=2048, step=10, value=40, interactive=True)
t2v_fps = gr.Slider(label='Video FPS', minimum=4, maximum=64, step=4, value=12, interactive=True)

with gr.Tab('txt2vid') as tab_txt2vid:
gr.Markdown('Work in progress...')
# width, height, prompt, n_prompt, steps, cfg_scale, seed, processing_strength, fix_frame_strength = setup_common_values('txt2vid', t2v_args)

#with gr.Tab('settings') as tab_setts:
# gr.Markdown('Work in progress...')

tab_vid2vid.select(fn=lambda: 'vid2vid', inputs=[], outputs=[sdcn_process_mode])
tab_txt2vid.select(fn=lambda: 'txt2vid', inputs=[], outputs=[sdcn_process_mode])

return locals()

def process(*args):
if args[0] == 'vid2vid':
yield from vid2vid.start_process(*args)
elif args[0] == 'txt2vid':
yield from txt2vid.start_process(*args)
else:
raise Exception(f"Unsupported processing mode: '{args[0]}'")

def stop_process(*args):
utils.shared.is_interrupted = True
return gr.Button.update(interactive=False)

def on_ui_tabs():
modules.scripts.scripts_current = modules.scripts.scripts_img2img
modules.scripts.scripts_img2img.initialize_scripts(is_img2img=True)
@@ -118,11 +136,8 @@ def on_ui_tabs():
components = {}

#dv = SimpleNamespace(**T2VOutputArgs())
with gr.Row(elem_id='v2v-core').style(equal_height=False, variant='compact'):
with gr.Row(elem_id='sdcn-core').style(equal_height=False, variant='compact'):
with gr.Column(scale=1, variant='panel'):
with gr.Row(variant='compact'):
run_button = gr.Button('Generate', elem_id=f"sdcn_anim_generate", variant='primary')

with gr.Tabs():
components = inputs_ui()

@@ -139,6 +154,10 @@ def on_ui_tabs():
# custom_inputs = scripts.scripts_img2img.setup_ui()

with gr.Column(scale=1, variant='compact'):
with gr.Row(variant='compact'):
run_button = gr.Button('Generate', elem_id=f"sdcn_anim_generate", variant='primary')
stop_button = gr.Button('Interrupt', elem_id=f"sdcn_anim_interrupt", variant='primary', interactive=False)

with gr.Column(variant="panel"):
sp_progress = gr.HTML(elem_id="sp_progress", value="")
sp_progress.update()
@@ -157,52 +176,9 @@

with gr.Row(variant='compact'):
dummy_component = gr.Label(visible=False)


# Define parameters for the action methods. Not all of them are included yet
method_inputs = [
dummy_component, # send None for task_id
dummy_component, # mode
components['prompt'], # prompt
components['n_prompt'], # negative_prompt
dummy_component, # prompt_styles
components['vid2vid_file'], # input_video
dummy_component, # sketch
dummy_component, # init_img_with_mask
dummy_component, # inpaint_color_sketch
dummy_component, # inpaint_color_sketch_orig
dummy_component, # init_img_inpaint
dummy_component, # init_mask_inpaint
components['steps'], # steps
components['sampler_index'], # sampler_index
dummy_component, # mask_blur
dummy_component, # mask_alpha
dummy_component, # inpainting_fill
dummy_component, # restore_faces
dummy_component, # tiling
dummy_component, # n_iter
dummy_component, # batch_size
components['cfg_scale'], # cfg_scale
dummy_component, # image_cfg_scale
components['processing_strength'], # denoising_strength
components['fix_frame_strength'], # fix_frame_strength
components['seed'], # seed
dummy_component, # subseed
dummy_component, # subseed_strength
dummy_component, # seed_resize_from_h
dummy_component, # seed_resize_from_w
dummy_component, # seed_enable_extras
components['height'], # height
components['width'], # width
dummy_component, # resize_mode
dummy_component, # inpaint_full_res
dummy_component, # inpaint_full_res_padding
dummy_component, # inpainting_mask_invert
dummy_component, # img2img_batch_input_dir
dummy_component, # img2img_batch_output_dir
dummy_component, # img2img_batch_inpaint_mask_dir
components['override_settings'], # override_settings_texts
] + components['custom_inputs']
# Define parameters for the action methods.
method_inputs = [components[name] for name in utils.get_component_names()] + components['v2v_custom_inputs']

method_outputs = [
sp_progress,
@@ -211,15 +187,23 @@
img_preview_prev_warp,
img_preview_processed,
html_log,
run_button,
stop_button
]

run_button.click(
fn=start_process, #wrap_gradio_gpu_call(start_process, extra_outputs=[None, '', '']),
fn=process, #wrap_gradio_gpu_call(start_process, extra_outputs=[None, '', '']),
inputs=method_inputs,
outputs=method_outputs,
show_progress=True,
)

stop_button.click(
fn=stop_process,
outputs=[stop_button],
show_progress=False
)

modules.scripts.scripts_current = None

# define queue - required for generators
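
The base_ui.py changes above follow a common Gradio pattern: a generator handler streams progress while yielding button-state updates, and a second button sets an interrupt flag that the loop checks between steps. A minimal, self-contained sketch of that pattern (hypothetical names and a dummy loop, not the extension's actual code):

import time
import gradio as gr

is_interrupted = False  # stand-in for utils.shared.is_interrupted

def generate(steps):
    global is_interrupted
    is_interrupted = False
    # Disable "Generate" and enable "Interrupt" while work is in progress.
    yield "starting...", gr.Button.update(interactive=False), gr.Button.update(interactive=True)
    for i in range(int(steps)):
        if is_interrupted:
            break
        time.sleep(0.1)  # stand-in for generating one frame
        yield f"frame {i + 1}/{int(steps)}", gr.Button.update(interactive=False), gr.Button.update(interactive=True)
    # Restore the buttons when finished or interrupted.
    yield "done", gr.Button.update(interactive=True), gr.Button.update(interactive=False)

def interrupt():
    global is_interrupted
    is_interrupted = True
    return gr.Button.update(interactive=False)

with gr.Blocks() as demo:
    steps = gr.Number(value=10, label="Frames")
    status = gr.Textbox(label="Progress")
    run_button = gr.Button("Generate", variant="primary")
    stop_button = gr.Button("Interrupt", interactive=False)
    run_button.click(fn=generate, inputs=[steps], outputs=[status, run_button, stop_button])
    stop_button.click(fn=interrupt, outputs=[stop_button], show_progress=False)

demo.queue(concurrency_count=2)  # queue is required for generators; >1 lets the interrupt click run during generation
demo.launch()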