From ee0b48edb981062b4435d68de8207be52485e9bc Mon Sep 17 00:00:00 2001 From: Ahmad Alobaid Date: Sat, 30 Mar 2024 14:18:28 +0300 Subject: [PATCH 1/3] add clip and opencv to the requirements --- .gitignore | 1 + setup.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2087dc1..f2422d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Custom .DS_Store +.idea/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/setup.py b/setup.py index e39b9aa..126ca0c 100644 --- a/setup.py +++ b/setup.py @@ -25,8 +25,9 @@ "einops", "sentencepiece", "diffusers", - "accelerate" - + "accelerate", + "clip @ git+https://github.com/openai/CLIP.git", + "opencv-python" ], author="", ) From 69759df490dc8fbaab0e4846428eef0721b53f9a Mon Sep 17 00:00:00 2001 From: Ahmad Alobaid Date: Fri, 5 Apr 2024 13:51:15 +0300 Subject: [PATCH 2/3] support MPS for mac --- kandinsky2/model/gaussian_diffusion.py | 9 ++++++++- train_2_1_unclip.py | 16 ++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/kandinsky2/model/gaussian_diffusion.py b/kandinsky2/model/gaussian_diffusion.py index b5449e1..1bec159 100644 --- a/kandinsky2/model/gaussian_diffusion.py +++ b/kandinsky2/model/gaussian_diffusion.py @@ -6,6 +6,7 @@ import enum import math +import traceback from copy import deepcopy import numpy as np import torch as th @@ -822,7 +823,13 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): dimension equal to the length of timesteps. :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + try: + # For CUDA + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + except Exception as e: + # For MPS + res = th.from_numpy(arr).to(device=timesteps.device, dtype=th.float32)[timesteps].float() + while len(res.shape) < len(broadcast_shape): res = res[..., None] return res.expand(broadcast_shape) diff --git a/train_2_1_unclip.py b/train_2_1_unclip.py index 41c890d..0d29d76 100644 --- a/train_2_1_unclip.py +++ b/train_2_1_unclip.py @@ -20,6 +20,7 @@ import clip import argparse + def drop_first_layer(path): d = {} state_dict = torch.load(path) @@ -28,6 +29,7 @@ def drop_first_layer(path): d[key] = state_dict[key] return d + def main(): parser = argparse.ArgumentParser() parser.add_argument('--config', type=str, help='config path') @@ -65,11 +67,13 @@ def main(): clip_model.token_embedding = None clip_model.text_projection = None clip_model = clip_model.eval().to(device) - train_unclip(unet=model, diffusion=diffusion, image_encoder=image_encoder, - clip_model=clip_model, text_encoder=text_encoder, optimizer=optimizer, - lr_scheduler=lr_scheduler, schedule_sampler=schedule_sampler, - train_loader=train_loader, val_loader=None, scale=config['image_enc_params']['scale'], - num_epochs=config['num_epochs'], save_every=config['save_every'], save_name=config['save_name'], - save_path=config['save_path'], inpainting=config['inpainting'], device=device) + train_unclip(unet=model, diffusion=diffusion, image_encoder=image_encoder, clip_model=clip_model, + text_encoder=text_encoder, optimizer=optimizer, lr_scheduler=lr_scheduler, + schedule_sampler=schedule_sampler, train_loader=train_loader, val_loader=None, + scale=config['image_enc_params']['scale'], num_epochs=config['num_epochs'], + save_every=config['save_every'], save_name=config['save_name'], save_path=config['save_path'], + inpainting=config['inpainting'], device=device) + + if __name__ == '__main__': main() From 6e8335f8196f563193160d1b82711b36dac5b95a Mon Sep 17 00:00:00 2001 From: Ahmad Alobaid Date: Wed, 1 May 2024 20:03:21 +0300 Subject: [PATCH 3/3] fix the code to support cpu --- kandinsky2/kandinsky2_2_model.py | 14 ++++++++++---- kandinsky2/model/gaussian_diffusion.py | 1 - 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/kandinsky2/kandinsky2_2_model.py b/kandinsky2/kandinsky2_2_model.py index 5dc1fa6..679957f 100644 --- a/kandinsky2/kandinsky2_2_model.py +++ b/kandinsky2/kandinsky2_2_model.py @@ -21,12 +21,18 @@ def __init__( ): self.device = device self.task_type = task_type - self.image_encoder = CLIPVisionModelWithProjection.from_pretrained('kandinsky-community/kandinsky-2-2-prior', subfolder='image_encoder').to(torch.float16).to(self.device) + + if device == "cpu": + torch_dtype = torch.float32 + else: + torch_dtype = torch.float16 + + self.image_encoder = CLIPVisionModelWithProjection.from_pretrained('kandinsky-community/kandinsky-2-2-prior', subfolder='image_encoder').to(torch_dtype).to(self.device) if task_type == "text2img": - self.unet = UNet2DConditionModel.from_pretrained('kandinsky-community/kandinsky-2-2-decoder', subfolder='unet').to(torch.float16).to(self.device) - self.prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior', image_encoder=self.image_encoder, torch_dtype=torch.float16) + self.unet = UNet2DConditionModel.from_pretrained('kandinsky-community/kandinsky-2-2-decoder', subfolder='unet').to(torch_dtype).to(self.device) + self.prior = KandinskyV22PriorPipeline.from_pretrained('kandinsky-community/kandinsky-2-2-prior', image_encoder=self.image_encoder, torch_dtype=torch_dtype) self.prior = self.prior.to(self.device) - self.decoder = KandinskyV22Pipeline.from_pretrained('kandinsky-community/kandinsky-2-2-decoder', unet=self.unet, torch_dtype=torch.float16) + self.decoder = KandinskyV22Pipeline.from_pretrained('kandinsky-community/kandinsky-2-2-decoder', unet=self.unet, torch_dtype=torch_dtype) self.decoder = self.decoder.to(self.device) elif task_type == "inpainting": self.unet = UNet2DConditionModel.from_pretrained('kandinsky-community/kandinsky-2-2-decoder-inpaint', subfolder='unet').to(torch.float16).to(self.device) diff --git a/kandinsky2/model/gaussian_diffusion.py b/kandinsky2/model/gaussian_diffusion.py index 1bec159..da32add 100644 --- a/kandinsky2/model/gaussian_diffusion.py +++ b/kandinsky2/model/gaussian_diffusion.py @@ -829,7 +829,6 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): except Exception as e: # For MPS res = th.from_numpy(arr).to(device=timesteps.device, dtype=th.float32)[timesteps].float() - while len(res.shape) < len(broadcast_shape): res = res[..., None] return res.expand(broadcast_shape)