-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for arbitrary image resolutions #24
base: main
Are you sure you want to change the base?
Changes from 2 commits
30f9393
d55b45a
af09c34
cd13378
ccdbe31
805c706
ff992fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
from tensorflow.keras import backend as K | ||
from tensorflow.keras import layers | ||
|
||
from ..layers import BlockImages, SwapAxes, UnblockImages | ||
from ..layers import SwapAxes, TFBlockImagesByGrid, TFUnblockImages | ||
|
||
|
||
def GridGatingUnit(use_bias: bool = True, name: str = "grid_gating_unit"): | ||
|
@@ -18,9 +18,7 @@ def GridGatingUnit(use_bias: bool = True, name: str = "grid_gating_unit"): | |
|
||
def apply(x): | ||
u, v = tf.split(x, 2, axis=-1) | ||
v = layers.LayerNormalization( | ||
epsilon=1e-06, name=f"{name}_intermediate_layernorm" | ||
)(v) | ||
v = layers.LayerNormalization(epsilon=1e-06, name=f"{name}_intermediate_layernorm")(v) | ||
n = K.int_shape(x)[-3] # get spatial dim | ||
v = SwapAxes()(v, -1, -3) | ||
v = layers.Dense(n, use_bias=use_bias, name=f"{name}_Dense_0")(v) | ||
|
@@ -47,9 +45,8 @@ def apply(x): | |
K.int_shape(x)[3], | ||
) | ||
gh, gw = grid_size | ||
fh, fw = h // gh, w // gw | ||
|
||
x = BlockImages()(x, patch_size=(fh, fw)) | ||
x, ph, pw = TFBlockImagesByGrid()(x, grid_size=(gh, gw)) | ||
Comment on lines
-52
to
+51
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How come these operations are the same? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From the original implementation, the authors implement BlockByGrid by computing the block size of a grid cell, and using BlockImages (which block images into patches of block-size). From the paper, the authors explain the difference between "grid" and "block" like that: They are equivalent because it does the split based on the grid_size as argument instead of the block_size (called as (fh, fw) in the code) as the authors did. A more formal test is performed here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for explaining.
How is the block size of [3, 2] interpreted in that case? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the original code, it is done as explained, note here: gh, gw = grid_size
fh, fw = h // gh, w // gw
u = BlockImages()(u, patch_size=(fh, fw)) Note that this code is very similar to the pseudo-code written here. We can use the block [3,2] to compute the green part of the image (grid blocking with grid_size=[3,2]) this way: In the example shown in the image, we have that image size is [6,4]. Thus to split it with a grid_size of [2,2], we can do: gh, gw = (2, 2)
h, w = (6,4) # image dimensions
fh, fw = h // gh, w // gw # Note that fh = 3, and fw = 2
block_image = BlockImages()(image_from_the_picture, patch_size=(fh,fw)) # patch_size=(3,2) The above code snippet implements the green part of the image, and is very similar to what we described first. In case with the gh, gw = (2,2)
block_image_using_tfblockByGrid = TFBlockByGrid()(image_from_the_picture, grid_size=(gh,gw)) and I am not sure if this answers what you asked, though. Let me know. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! So, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly! |
||
# gMLP1: Global (grid) mixing part, provides global grid communication. | ||
y = layers.LayerNormalization(epsilon=1e-06, name=f"{name}_LayerNorm")(x) | ||
y = layers.Dense( | ||
|
@@ -66,7 +63,7 @@ def apply(x): | |
)(y) | ||
y = layers.Dropout(dropout_rate)(y) | ||
x = x + y | ||
x = UnblockImages()(x, grid_size=(gh, gw), patch_size=(fh, fw)) | ||
x = TFUnblockImages()(x, grid_size=(gh, gw), patch_size=(ph, pw)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same. You're changing the semantics of the code. Could you please elaborate why? Reading this change and also previous |
||
return x | ||
|
||
return apply |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,18 +8,14 @@ | |
from tensorflow.keras import backend as K | ||
from tensorflow.keras import layers | ||
|
||
from ..layers import BlockImages, SwapAxes, UnblockImages | ||
from ..layers import SwapAxes, TFBlockImages, TFBlockImagesByGrid, TFUnblockImages | ||
from .block_gating import BlockGmlpLayer | ||
from .grid_gating import GridGmlpLayer | ||
|
||
Conv1x1 = functools.partial(layers.Conv2D, kernel_size=(1, 1), padding="same") | ||
Conv3x3 = functools.partial(layers.Conv2D, kernel_size=(3, 3), padding="same") | ||
ConvT_up = functools.partial( | ||
layers.Conv2DTranspose, kernel_size=(2, 2), strides=(2, 2), padding="same" | ||
) | ||
Conv_down = functools.partial( | ||
layers.Conv2D, kernel_size=(4, 4), strides=(2, 2), padding="same" | ||
) | ||
ConvT_up = functools.partial(layers.Conv2DTranspose, kernel_size=(2, 2), strides=(2, 2), padding="same") | ||
Conv_down = functools.partial(layers.Conv2D, kernel_size=(4, 4), strides=(2, 2), padding="same") | ||
|
||
|
||
def ResidualSplitHeadMultiAxisGmlpLayer( | ||
|
@@ -116,24 +112,22 @@ def apply(x): | |
u, v = tf.split(x, 2, axis=-1) | ||
|
||
# Get grid MLP weights | ||
gh, gw = grid_size | ||
fh, fw = h // gh, w // gw | ||
u = BlockImages()(u, patch_size=(fh, fw)) | ||
dim_u = K.int_shape(u)[-3] | ||
ghu, gwu = grid_size | ||
u, phu, pwu = TFBlockImagesByGrid()(u, grid_size=(ghu, gwu)) | ||
dim_u = ghu * gwu | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Explain the rationale in the comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, advisable not to change the original variable names here and elsewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Variable names will be recovered on next push. Did it only for readability (since they get rewritten a few lines below) If I understood correctly, you are asking why we can substitute K.int_shape(u)[-3] for (gh * gw): From BlockImages(), we have that the output's shape is "b (gh gw) (fh fw) c". Thus, since: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same reason why fh and fw are getting replaced by gh and gw here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Essentially, those transformations are the same: def same_operations(random_image, grid_size=(gh,gw)):
b, h, w, c = random_image.shape
image_blocked_by_grid = BlockByGrid(random_image, grid_size=(gh, gw))
image_blocked_by_block = BlockByPatch(random_image, patch_size=(h // gh, w // gw))
image_blocked_by_grid == image_blocked_by_block # this should be True. we have this pseudo-code as a test here. Note that |
||
u = SwapAxes()(u, -1, -3) | ||
u = layers.Dense(dim_u, use_bias=use_bias, name=f"{name}_Dense_0")(u) | ||
u = SwapAxes()(u, -1, -3) | ||
u = UnblockImages()(u, grid_size=(gh, gw), patch_size=(fh, fw)) | ||
u = TFUnblockImages()(u, grid_size=(ghu, gwu), patch_size=(phu, pwu)) | ||
|
||
# Get Block MLP weights | ||
fh, fw = block_size | ||
gh, gw = h // fh, w // fw | ||
v = BlockImages()(v, patch_size=(fh, fw)) | ||
dim_v = K.int_shape(v)[-2] | ||
fhv, fwv = block_size | ||
v, ghv, gwv = TFBlockImages()(v, patch_size=(fhv, fwv)) | ||
dim_v = fhv * fwv | ||
v = SwapAxes()(v, -1, -2) | ||
v = layers.Dense(dim_v, use_bias=use_bias, name=f"{name}_Dense_1")(v) | ||
v = SwapAxes()(v, -1, -2) | ||
v = UnblockImages()(v, grid_size=(gh, gw), patch_size=(fh, fw)) | ||
v = TFUnblockImages()(v, patch_size=(fhv, fwv), grid_size=(ghv, gwv)) | ||
|
||
x = tf.concat([u, v], axis=-1) | ||
x = layers.Dense(num_channels, use_bias=use_bias, name=f"{name}_out_project")(x) | ||
|
@@ -159,9 +153,7 @@ def CrossGatingBlock( | |
def apply(x, y): | ||
# Upscale Y signal, y is the gating signal. | ||
if upsample_y: | ||
y = ConvT_up( | ||
filters=features, use_bias=use_bias, name=f"{name}_ConvTranspose_0" | ||
)(y) | ||
y = ConvT_up(filters=features, use_bias=use_bias, name=f"{name}_ConvTranspose_0")(y) | ||
|
||
x = Conv1x1(filters=features, use_bias=use_bias, name=f"{name}_Conv_0")(x) | ||
n, h, w, num_channels = ( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,65 +1,89 @@ | ||
""" | ||
Layers based on https://github.com/google-research/maxim/blob/main/maxim/models/maxim.py | ||
and reworked to cope with variable image dimensions | ||
""" | ||
|
||
import einops | ||
import tensorflow as tf | ||
from tensorflow.experimental import numpy as tnp | ||
from tensorflow.keras import backend as K | ||
from tensorflow.keras import layers | ||
|
||
|
||
@tf.keras.utils.register_keras_serializable("maxim") | ||
class BlockImages(layers.Layer): | ||
class TFBlockImages(layers.Layer): | ||
def __init__(self, **kwargs): | ||
super().__init__(**kwargs) | ||
|
||
def call(self, x, patch_size): | ||
bs, h, w, num_channels = ( | ||
K.int_shape(x)[0], | ||
K.int_shape(x)[1], | ||
K.int_shape(x)[2], | ||
K.int_shape(x)[3], | ||
) | ||
def call(self, image, patch_size): | ||
bs, h, w, num_channels = (tf.shape(image)[0], tf.shape(image)[1], tf.shape(image)[2], tf.shape(image)[3]) | ||
ph, pw = patch_size | ||
gh = h // ph | ||
gw = w // pw | ||
pad = [[0, 0], [0, 0]] | ||
patches = tf.space_to_batch_nd(image, [ph, pw], pad) | ||
patches = tf.split(patches, ph * pw, axis=0) | ||
patches = tf.stack(patches, 3) # (bs, h/p, h/p, p*p, 3) | ||
patches_dim = tf.shape(patches) | ||
patches = tf.reshape(patches, [patches_dim[0], patches_dim[1], patches_dim[2], -1]) | ||
patches = tf.reshape(patches, (patches_dim[0], patches_dim[1] * patches_dim[2], ph * pw, num_channels)) | ||
return [patches, gh, gw] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm honestly not sure why we are getting rid of einops. This is significantly more lines of code and also more complex to read. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, using einops would be in hand. But please, consider this code snippet:
The problem with einops is that it expects int literals as argument to the symbols used in the pattern string. I could not make it work using tensors as shown by the example above. At some stages of the model (here, here, here), the split is computed in online fashion, thus relying on tensors (for the case where the img size is None). Thus it was necessary to rewrite using tensorflow. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But it wasn't a problem with the current version of the code. What changed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current version of the code informs the image dimension beforehand, thus when you do:
you have the integer literals we need for the einops operations. However, In case when we feed (None, None, 3) as input, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the line number you're using for Black formatting? The line-numbers seem long and should be formatted accordingly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where I work we use 122, I am reformatting with 88 (black's default, IIRC). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 80 is the default. You can bump it to 90 (which is what I used). |
||
|
||
grid_height, grid_width = h // patch_size[0], w // patch_size[1] | ||
def get_config(self): | ||
return super().get_config() | ||
|
||
x = einops.rearrange( | ||
x, | ||
"n (gh fh) (gw fw) c -> n (gh gw) (fh fw) c", | ||
gh=grid_height, | ||
gw=grid_width, | ||
fh=patch_size[0], | ||
fw=patch_size[1], | ||
) | ||
|
||
return x | ||
@tf.keras.utils.register_keras_serializable("maxim") | ||
class TFBlockImagesByGrid(layers.Layer): | ||
def __init__(self, **kwargs): | ||
super().__init__(**kwargs) | ||
|
||
def call(self, image, grid_size): | ||
bs, h, w, num_channels = (tf.shape(image)[0], tf.shape(image)[1], tf.shape(image)[2], tf.shape(image)[3]) | ||
gh, gw = grid_size | ||
ph = h // gh | ||
pw = w // gw | ||
pad = [[0, 0], [0, 0]] | ||
|
||
def block_single_image(img): | ||
pat = tf.expand_dims(img, 0) # batch = 1 | ||
pat = tf.space_to_batch_nd(pat, [ph, pw], pad) # p*p*bs, g, g, c | ||
pat = tf.expand_dims(pat, 3) # pxpxbs, g, g, 1, c | ||
pat = tf.transpose(pat, perm=[3, 1, 2, 0, 4]) # 1, g, g, pxp, c | ||
pat = tf.reshape(pat, [gh, gw, ph * pw, num_channels]) | ||
return pat | ||
|
||
patches = image | ||
patches = tf.map_fn(fn=lambda x: block_single_image(x), elems=patches) | ||
patches_dim = tf.shape(patches) | ||
patches = tf.reshape(patches, [patches_dim[0], patches_dim[1], patches_dim[2], -1]) | ||
patches = tf.reshape(patches, (patches_dim[0], patches_dim[1] * patches_dim[2], ph * pw, num_channels)) | ||
return [patches, ph, pw] | ||
|
||
def get_config(self): | ||
config = super().get_config().copy() | ||
return config | ||
return super().get_config() | ||
|
||
|
||
@tf.keras.utils.register_keras_serializable("maxim") | ||
class UnblockImages(layers.Layer): | ||
class TFUnblockImages(layers.Layer): | ||
def __init__(self, **kwargs): | ||
super().__init__(**kwargs) | ||
|
||
def call(self, x, grid_size, patch_size): | ||
x = einops.rearrange( | ||
x, | ||
"n (gh gw) (fh fw) c -> n (gh fh) (gw fw) c", | ||
gh=grid_size[0], | ||
gw=grid_size[1], | ||
fh=patch_size[0], | ||
fw=patch_size[1], | ||
) | ||
def call(self, x, patch_size, grid_size): | ||
bs, grid_sqrt, patch_sqrt, num_channels = (tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[2], tf.shape(x)[3]) | ||
ph, pw = patch_size | ||
gh, gw = grid_size | ||
|
||
return x | ||
pad = [[0, 0], [0, 0]] | ||
|
||
y = tf.reshape(x, (bs, gh, gw, -1, num_channels)) # (bs, gh, gw, ph*pw, 3) | ||
y = tf.expand_dims(y, 0) | ||
y = tf.transpose(y, perm=[4, 1, 2, 3, 0, 5]) | ||
y = tf.reshape(y, [bs * ph * pw, gh, gw, num_channels]) | ||
y = tf.batch_to_space(y, [ph, pw], pad) | ||
|
||
return y | ||
|
||
def get_config(self): | ||
config = super().get_config().copy() | ||
return config | ||
return super().get_config() | ||
|
||
|
||
@tf.keras.utils.register_keras_serializable("maxim") | ||
|
@@ -76,28 +100,60 @@ def get_config(self): | |
|
||
|
||
@tf.keras.utils.register_keras_serializable("maxim") | ||
class Resizing(layers.Layer): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the need to segregate this to Up and Down? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found it easier to read, but indeed it adds a chunk of code. Reformatting to use a single layer only. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's easier to read, I would consider adding an elaborate comment in the script so that readers are aware. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think now is better (with a single resizing layer). |
||
def __init__(self, height, width, antialias=True, method="bilinear", **kwargs): | ||
class ResizingDown(tf.keras.layers.Layer): | ||
def __init__(self, ratio: float, method="bilinear", antialias=True, **kwargs): | ||
super().__init__(**kwargs) | ||
self.height = height | ||
self.width = width | ||
self.antialias = antialias | ||
self.ratio = ratio | ||
self.method = method | ||
self.antialias = antialias | ||
|
||
def call(self, x): | ||
return tf.image.resize( | ||
x, | ||
size=(self.height, self.width), | ||
def __call__(self, img): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prefer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, fixing... |
||
n, h, w, c = (tf.shape(img)[0], tf.shape(img)[1], tf.shape(img)[2], tf.shape(img)[3]) | ||
x = tf.image.resize( | ||
img, | ||
(h // self.ratio, w // self.ratio), | ||
method=self.method, | ||
antialias=self.antialias, | ||
) | ||
return x | ||
|
||
def get_config(self): | ||
config = super().get_config().copy() | ||
config.update( | ||
{ | ||
"ratio": self.ratio, | ||
"antialias": self.antialias, | ||
"method": self.method, | ||
} | ||
) | ||
return config | ||
|
||
|
||
@tf.keras.utils.register_keras_serializable("maxim") | ||
class ResizingUp(tf.keras.layers.Layer): | ||
def __init__(self, ratio: float, method="bilinear", antialias=True, **kwargs): | ||
super().__init__(**kwargs) | ||
self.ratio = tf.constant(ratio, dtype=tf.float32) | ||
self.method = method | ||
self.antialias = antialias | ||
|
||
def __call__(self, img): | ||
shape = tf.shape(img) | ||
new_sh = self.ratio * tf.cast(shape[1:3], tf.float32) | ||
|
||
x = tf.image.resize( | ||
img, | ||
size=tf.cast(new_sh, tf.int32), | ||
method=self.method, | ||
antialias=self.antialias, | ||
) | ||
return x | ||
|
||
def get_config(self): | ||
config = super().get_config().copy() | ||
config.update( | ||
{ | ||
"height": self.height, | ||
"width": self.width, | ||
"ratio": self.ratio, | ||
"antialias": self.antialias, | ||
"method": self.method, | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why there's a separate layer for handling blocking by grids?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BlockByGrid can be implemented as follows, please see a more detailed explanation here:
But, while implementing TFBlockImages I used tf.split which expects an int literal as argument for num_or_size_splits.
However, in cases where we only have the grid_size and the block_size has to be computed on the fly (as here), it needs to be a tensor, and we can't use tf.split in this case. That's why I also wrote BlockByGrid.