feat: text-to-image-model

codedmachine111 · codedmachine111 · commit cb2d4b996a50 · 2024-01-05T16:09:03.000+05:30
diff --git a/main.py b/main.py
@@ -32,7 +32,16 @@ def train(n_epochs, batch_size, codings_size, d_steps, gp_w):
         d_loss_fn=discriminator_loss,
     )
 
-    visualization_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: visualize_generated_images(epoch, generator))
+    # Create a visualization callback
+    visualization_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: visualize_generated_images(epoch, generator, dataset))
+    # Create a ModelCheckpoint callback
+    model_checkpoint_path_weights = 'ckpts/CUB-WGAN-GP-weights-{epoch:02d}.keras'
+    model_checkpoint_callback_weights = ModelCheckpoint(
+        filepath=model_checkpoint_path_weights,
+        save_freq='epoch',  # Save every epoch
+        save_weights_only=True,  # Save only the weights
+    )
+
     history = gan.fit(dataset, epochs=n_epochs, verbose=1, callbacks=[visualization_callback, model_checkpoint_callback_weights])
 
     fig, ax = plt.subplots(figsize=(20, 6))
diff --git a/models.py b/models.py
@@ -1,7 +1,8 @@
 import tensorflow as tf
 from keras.layers import (Dense, Reshape, BatchNormalization, Conv2DTranspose, 
-                          Dropout, LayerNormalization, Embedding, Input, Conv2D, LeakyReLU, Flatten) 
-from keras.models import Sequential
+                          Dropout, LayerNormalization, Embedding, Input, Conv2D, LeakyReLU, Flatten,
+                          Concatenate, concatenate, Lambda, ReLU) 
+from utils import *
 
 def scaled_dot_product(q, k, v):
     dk = tf.cast(tf.shape(k)[-1], tf.float32)
@@ -88,11 +89,20 @@ def build_generator(noise_dim,
                     projection_dim,
                     num_heads,
                     mlp_dim):
+    # Input layer
+    embed_input = Input(shape=(1024,))
+    x = Dense(256)(embed_input)
+    mean_logsigma = LeakyReLU(alpha=0.2)(x)
+
+    c = Lambda(generate_c)(mean_logsigma)
 
     noise_input = Input(shape=(noise_dim,))
 
-    x = Dense(8 * 8 * projection_dim)(noise_input)
+    gen_input = Concatenate(axis=1)([c, noise_input])
+
+    x = Dense(8 * 8 * projection_dim)(gen_input)
     x = Reshape((8 *8, projection_dim))(x)
+    # x = layers.BatchNormalization()(x)
 
     positional_embeddings  = PositionalEmbedding(64, projection_dim)
     x = positional_embeddings(x)
@@ -109,29 +119,39 @@ def build_generator(noise_dim,
 
     outputs = Conv2DTranspose(3, kernel_size=3, strides=2, padding="SAME",activation="tanh")(x)
 
-    return tf.keras.Model(inputs=noise_input, outputs=outputs, name='generator')    
+    return tf.keras.Model(inputs=[embed_input,noise_input], outputs=outputs, name='generator')
 
 def build_discriminator():
-  return Sequential([
+      image_input = Input(shape=(64,64,3))  # image branch input: 64x64, 3 channels
+      # Four stride-2 SAME convolutions halve the spatial size each time: 64 -> 32 -> 16 -> 8 -> 4.
+      x = Conv2D(64, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2))(image_input)
+      x = LayerNormalization()(x)
+      x = Conv2D(128, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2))(x)
+      x = LayerNormalization()(x)
+      x = Conv2D(256, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2))(x)
+      x = LayerNormalization()(x)
+      x = Conv2D(512, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2))(x)
+      # Regularize the 4x4x512 image features before they are merged with the embedding features.
+      x = Dropout(0.4)(x)
 
-      Conv2D(64, kernel_size=4, strides=1, padding="SAME", activation=LeakyReLU(0.2), input_shape=[64,64,3]),
-      LayerNormalization(),
-      Conv2D(128, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2)),
-      LayerNormalization(),
-      Conv2D(256, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2)),
-      LayerNormalization(),
-      Conv2D(512, kernel_size=4, strides=2, padding="SAME", activation=LeakyReLU(0.2)),
+      embedding_input = Input(shape=(1024,))  # embedding branch input: one 1024-d vector per image
+      compressed_embedding = Dense(128)(embedding_input)  # compress 1024 -> 128 features
+      compressed_embedding = ReLU()(compressed_embedding)
 
-      Dropout(0.4),
+      compressed_embedding = tf.reshape(compressed_embedding, (-1, 1, 1, 128))  # -> (batch, 1, 1, 128); NOTE(review): raw tf op on a Keras tensor - verify the installed Keras supports it
+      compressed_embedding = tf.tile(compressed_embedding, (1, 4, 4, 1))  # replicate over the 4x4 grid so it lines up with the image feature map
 
-      Conv2D(64 * 8, kernel_size=1, strides=1, padding="SAME", activation=LeakyReLU(0.2)),
-      LayerNormalization(),
+      concat_input = concatenate([x, compressed_embedding])  # joined on the last (channel) axis, the default: 512 + 128 = 640 channels
 
-      Dropout(0.4),
-      Flatten(),
+      x = Conv2D(64 * 8, kernel_size=1, strides=1, padding="SAME", activation=LeakyReLU(0.2))(concat_input)  # 1x1 convolution mixes image and embedding features per spatial position
+      x = LayerNormalization()(x)
 
-      Dense(1),
-  ], name='discriminator')
+      x = Dropout(0.4)(x)
+      x = Flatten()(x)
+      # No activation on the final Dense: the model returns one unbounded score per (image, embedding) pair.
+      outputs = Dense(1)(x)
+      # Inputs are ordered [image, embedding]; callers must pass them in that order.
+      return tf.keras.Model(inputs=[image_input,embedding_input], outputs=outputs, name='discriminator')
 
 class WGAN(tf.keras.Model):
     def __init__(
@@ -156,7 +176,7 @@ def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn):
         self.d_loss_fn = d_loss_fn
         self.g_loss_fn = g_loss_fn
 
-    def gradient_penalty(self, batch_size, real_images, fake_images):
+    def gradient_penalty(self, batch_size, real_images, fake_images, text_embeddings):
         """ Calculates the gradient penalty.
 
         This loss is calculated on an interpolated image
@@ -170,7 +190,7 @@ def gradient_penalty(self, batch_size, real_images, fake_images):
         with tf.GradientTape() as gp_tape:
             gp_tape.watch(interpolated)
             # 1. Get the discriminator output for this interpolated image.
-            pred = self.discriminator(interpolated, training=True)
+            pred = self.discriminator([interpolated, text_embeddings], training=True)
 
         # 2. Calculate the gradients w.r.t to this interpolated image.
         grads = gp_tape.gradient(pred, [interpolated])[0]
@@ -179,9 +199,14 @@ def gradient_penalty(self, batch_size, real_images, fake_images):
         gp = tf.reduce_mean((norm - 1.0) ** 2)
         return gp
 
-    def train_step(self, real_images):
+    def train_step(self, dataset):
+
+        real_images, text_embeddings = dataset
+
         if isinstance(real_images, tuple):
             real_images = real_images[0]
+        if isinstance(text_embeddings, tuple):
+            text_embeddings = text_embeddings[0]
 
         batch_size = tf.shape(real_images)[0]
 
@@ -201,36 +226,24 @@ def train_step(self, real_images):
                 shape=(batch_size, self.latent_dim)
             )
             with tf.GradientTape() as tape:
-                fake_images = self.generator(random_latent_vectors, training=True)
-                fake_logits = self.discriminator(fake_images, training=True)
-                real_logits = self.discriminator(real_images, training=True)
-
+                fake_images = self.generator([text_embeddings,random_latent_vectors], training=True)
+                fake_logits = self.discriminator([fake_images, text_embeddings], training=True)
+                real_logits = self.discriminator([real_images, text_embeddings], training=True)
                 d_cost = self.d_loss_fn(real_img=real_logits, fake_img=fake_logits)
-                gp = self.gradient_penalty(batch_size, real_images, fake_images)
+                gp = self.gradient_penalty(batch_size, real_images, fake_images, text_embeddings)
                 d_loss = d_cost + gp * self.gp_weight
-
             d_gradient = tape.gradient(d_loss, self.discriminator.trainable_variables)
             self.d_optimizer.apply_gradients(
                 zip(d_gradient, self.discriminator.trainable_variables)
             )
-
         random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim))
         with tf.GradientTape() as tape:
-            generated_images = self.generator(random_latent_vectors, training=True)
-            gen_img_logits = self.discriminator(generated_images, training=True)
+            generated_images = self.generator([text_embeddings, random_latent_vectors], training=True)
+            gen_img_logits = self.discriminator([generated_images, text_embeddings], training=True)
             g_loss = self.g_loss_fn(gen_img_logits)
 
         gen_gradient = tape.gradient(g_loss, self.generator.trainable_variables)
         self.g_optimizer.apply_gradients(
             zip(gen_gradient, self.generator.trainable_variables)
         )
-        return {"d_loss": d_loss, "g_loss": g_loss}
-
-def discriminator_loss(real_img, fake_img):
-    real_loss = tf.reduce_mean(real_img)
-    fake_loss = tf.reduce_mean(fake_img)
-    return fake_loss - real_loss
-
-
-def generator_loss(fake_img):
-    return -tf.reduce_mean(fake_img)
+        return {"d_loss": d_loss, "g_loss": g_loss}
diff --git a/utils.py b/utils.py
@@ -1,25 +1,18 @@
 import numpy as np
 import tensorflow as tf
-import pickle
+from keras import backend as K
 from keras.callbacks import ModelCheckpoint
 import matplotlib.pyplot as plt
 
-def plot_results(images, n_cols=None, title=None):
+def generate_c(x):
+    """Sample c = mean + exp(log_sigma) * eps; the first 128 columns of x are the mean, the remaining columns the log-sigma."""
+    mean, log_sigma = x[:, :128], x[:, 128:]
 
-    n_cols = n_cols or len(images)
-    n_rows = (len(images) - 1) // n_cols + 1
+    stddev = K.exp(log_sigma)
+    # Draw eps with the same (batch, features) shape as mean so each sample gets its own noise; a (features,) draw would be shared by the whole batch.
+    epsilon = K.random_normal(shape=K.shape(mean))
 
-    if images.shape[-1] == 1:
-        images = np.squeeze(images, axis=-1)
-
-    fig = plt.figure(figsize=(n_cols, n_rows))
-
-    for index, image in enumerate(images):
-        plt.subplot(n_rows, n_cols, index + 1)
-        plt.imshow(image.astype(np.uint8), cmap="binary")
-        plt.axis("off")
-
-    plt.suptitle(title)
+    return stddev * epsilon + mean
 
 def show_dataset_images(images):
     fig, axes = plt.subplots(4,4, figsize=(8,8))
@@ -29,9 +22,11 @@ def show_dataset_images(images):
       axes[i].axis('off')
     plt.show()
 
-def visualize_generated_images(epoch, generator, latent_dim=100, num_samples=5):
+# Visualization callback
+def visualize_generated_images(epoch, generator, dataset, latent_dim=100, num_samples=5):
+    real_images, text_embeddings = next(iter(dataset.take(1)))
     random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
-    generated_images = generator.predict(random_latent_vectors)
+    generated_images = generator.predict([text_embeddings[:num_samples], random_latent_vectors])
     generated_images += 1
     generated_images *= 127.5
 
@@ -43,41 +38,34 @@ def visualize_generated_images(epoch, generator, latent_dim=100, num_samples=5):
     plt.suptitle(f'Generated Images - Epoch {epoch}')
     plt.show()
 
-# Define the file path for saving the model
-model_checkpoint_path_weights = 'ckpts/CUB-WGAN-GP-weights-{epoch:02d}.keras'
-
-# Create a ModelCheckpoint callback
-model_checkpoint_callback_weights = ModelCheckpoint(
-    filepath=model_checkpoint_path_weights,
-    save_freq='epoch',  # Save every epoch
-    save_weights_only=True,  # Save only the weights
-)
-
-def load_images(path):
-    #Loading images from pickle file
-    with open(path, 'rb') as f_in:
-        images = pickle.load(f_in)
-    return images
+def prepare_data(batch_size, data_path):  # Build the shuffled, batched (image, embedding) tf.data pipeline from the .npy files under data_path
+    x_train_path = data_path + "/X_train_CUB.npy"  # image array file
+    embed_train_path = data_path + "/embeddings_train_CUB.npy"  # embedding array file; presumably row-aligned with the images - verify
 
-def load_data(pickle_data_file):
-    #Load images and embeddings
-    x = np.array(load_images(pickle_data_file))
-    return x
-
-def prepare_data(batch_size):
-    pickle_path_64 = "data/64images.pickle"
-    x_train_64 = load_data(pickle_path_64)
+    x_train_64 = np.load(x_train_path)
+    embed_train_64 = np.load(embed_train_path)
 
     print(f'Dataset images shape: {x_train_64.shape}\n')
+    print(f'Text embeddings shape: {embed_train_64.shape}\n')
+    # Preview the raw images; this runs before the normalization below.
     print(f'Dataset images: \n')
     show_dataset_images(x_train_64)
 
     # Normalization
     x_train = x_train_64.astype(np.float32) / 127.5
     x_train = x_train - 1
 
-    dataset = tf.data.Dataset.from_tensor_slices(x_train)
+    dataset = tf.data.Dataset.from_tensor_slices((x_train, embed_train_64))  # each element is an (image, embedding) pair, so every batch is a 2-tuple
     dataset = dataset.shuffle(1024)
     dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(1)   
 
     return dataset 
+
+def discriminator_loss(real_img, fake_img):
+    """Return the mean discriminator score on fakes minus the mean score on reals."""
+    mean_real, mean_fake = tf.reduce_mean(real_img), tf.reduce_mean(fake_img)
+    return mean_fake - mean_real
+
+
+def generator_loss(fake_img):
+    return tf.negative(tf.reduce_mean(fake_img))  # negated mean discriminator score on generated images