Add support for arbitrary image resolutions #24
base: main
Changes from all commits
30f9393
d55b45a
af09c34
cd13378
ccdbe31
805c706
ff992fc
```diff
@@ -6,7 +6,7 @@
 from tensorflow.keras import backend as K
 from tensorflow.keras import layers

-from ..layers import BlockImages, SwapAxes, UnblockImages
+from ..layers import SwapAxes, TFBlockImagesByGrid, TFUnblockImages


 def GridGatingUnit(use_bias: bool = True, name: str = "grid_gating_unit"):
```
```diff
@@ -47,9 +47,8 @@ def apply(x):
             K.int_shape(x)[3],
         )
         gh, gw = grid_size
-        fh, fw = h // gh, w // gw
-
-        x = BlockImages()(x, patch_size=(fh, fw))
+        x, ph, pw = TFBlockImagesByGrid()(x, grid_size=(gh, gw))
```
Comment on lines -52 to +51

Reviewer: How come these operations are the same?

Author: In the original implementation, the authors implement blocking by grid by computing the block size of a grid cell and then using BlockImages (which blocks images into patches of that block size). The paper explains the difference between "grid" and "block" (excerpt not reproduced here). The two are equivalent because the new layer does the split based on the grid_size argument instead of the block_size (called (fh, fw) in the code) as the authors did. A more formal test is performed here.

Reviewer: Thanks for explaining. How is a block size of [3, 2] interpreted in that case?

Author: In the original code it is done as explained; note here:

```python
gh, gw = grid_size
fh, fw = h // gh, w // gw
u = BlockImages()(u, patch_size=(fh, fw))
```

Note that this code is very similar to the pseudo-code written here. We can use the block [3, 2] to compute the green part of the image (grid blocking with grid_size=[2, 2]) this way: in the example shown in the image, the image size is [6, 4], so to split it with a grid_size of [2, 2] we can do:

```python
gh, gw = (2, 2)
h, w = (6, 4)  # image dimensions
fh, fw = h // gh, w // gw  # note that fh = 3 and fw = 2
block_image = BlockImages()(image_from_the_picture, patch_size=(fh, fw))  # patch_size=(3, 2)
```

The above snippet implements the green part of the image and is very similar to what we described first. In the case of TFBlockImagesByGrid:

```python
gh, gw = (2, 2)
block_image_using_tfblockByGrid = TFBlockImagesByGrid()(image_from_the_picture, grid_size=(gh, gw))
```

I am not sure if this answers what you asked, though. Let me know.

Reviewer: Thanks! So, …

Author: Exactly!
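The equivalence claimed above can be checked numerically. Here is a minimal NumPy sketch (our own illustration, independent of the PR's TensorFlow code) showing that on a 6×4 image, blocking by patch_size=(3, 2) and blocking by grid_size=(2, 2) produce exactly the same partition:

```python
import numpy as np

h, w = 6, 4
img = np.arange(h * w).reshape(1, h, w, 1)

def block(x, fh, fw):
    # NumPy equivalent of the einops pattern
    # "n (gh fh) (gw fw) c -> n (gh gw) (fh fw) c"
    n, h, w, c = x.shape
    gh, gw = h // fh, w // fw
    x = x.reshape(n, gh, fh, gw, fw, c).transpose(0, 1, 3, 2, 4, 5)
    return x.reshape(n, gh * gw, fh * fw, c)

# Block by a fixed patch size of (3, 2)...
a = block(img, 3, 2)

# ...and block by a fixed grid of (2, 2), deriving the patch size on the fly.
gh, gw = 2, 2
b = block(img, h // gh, w // gw)

assert np.array_equal(a, b)       # same partition either way
assert a.shape == (1, 4, 6, 1)    # 4 grid cells of 6 pixels each
```

Once one of the two pairs (patch size or grid size) is fixed, the other is fully determined by the image dimensions, which is why the two calls agree.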
```diff
@@ -66,7 +65,7 @@ def apply(x):
         )(y)
         y = layers.Dropout(dropout_rate)(y)
         x = x + y
-        x = UnblockImages()(x, grid_size=(gh, gw), patch_size=(fh, fw))
+        x = TFUnblockImages()(x, grid_size=(gh, gw), patch_size=(ph, pw))

         return x

     return apply
```

Reviewer: Same here. You're changing the semantics of the code. Could you please elaborate why? Reading this change and also the previous …
```diff
@@ -1,65 +1,114 @@
 """
 Layers based on https://github.com/google-research/maxim/blob/main/maxim/models/maxim.py
+and reworked to cope with variable image dimensions
 """

 import einops
 import tensorflow as tf
 from tensorflow.experimental import numpy as tnp
 from tensorflow.keras import backend as K
 from tensorflow.keras import layers


 @tf.keras.utils.register_keras_serializable("maxim")
-class BlockImages(layers.Layer):
+class TFBlockImages(layers.Layer):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def call(self, x, patch_size):
+    def call(self, image, patch_size):
         bs, h, w, num_channels = (
-            K.int_shape(x)[0],
-            K.int_shape(x)[1],
-            K.int_shape(x)[2],
-            K.int_shape(x)[3],
+            tf.shape(image)[0],
+            tf.shape(image)[1],
+            tf.shape(image)[2],
+            tf.shape(image)[3],
         )
+        ph, pw = patch_size
+        gh = h // ph
+        gw = w // pw
+        pad = [[0, 0], [0, 0]]
+        patches = tf.space_to_batch_nd(image, [ph, pw], pad)
+        patches = tf.split(patches, ph * pw, axis=0)
+        patches = tf.stack(patches, 3)  # (bs, h/p, h/p, p*p, 3)
+        patches_dim = tf.shape(patches)
+        patches = tf.reshape(
+            patches, [patches_dim[0], patches_dim[1], patches_dim[2], -1]
+        )
+        patches = tf.reshape(
+            patches,
+            (patches_dim[0], patches_dim[1] * patches_dim[2], ph * pw, num_channels),
+        )
+        return [patches, gh, gw]

-        grid_height, grid_width = h // patch_size[0], w // patch_size[1]
-
-        x = einops.rearrange(
-            x,
-            "n (gh fh) (gw fw) c -> n (gh gw) (fh fw) c",
-            gh=grid_height,
-            gw=grid_width,
-            fh=patch_size[0],
-            fw=patch_size[1],
-        )
-
-        return x
+    def get_config(self):
+        return super().get_config()
+
+
+@tf.keras.utils.register_keras_serializable("maxim")
+class TFBlockImagesByGrid(layers.Layer):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def call(self, image, grid_size):
+        bs, h, w, num_channels = (
+            tf.shape(image)[0],
+            tf.shape(image)[1],
+            tf.shape(image)[2],
+            tf.shape(image)[3],
+        )
+        gh, gw = grid_size
+        ph = h // gh
+        pw = w // gw
+        pad = [[0, 0], [0, 0]]
+
+        def block_single_image(img):
+            pat = tf.expand_dims(img, 0)  # batch = 1
+            pat = tf.space_to_batch_nd(pat, [ph, pw], pad)  # p*p*bs, g, g, c
+            pat = tf.expand_dims(pat, 3)  # pxpxbs, g, g, 1, c
+            pat = tf.transpose(pat, perm=[3, 1, 2, 0, 4])  # 1, g, g, pxp, c
+            pat = tf.reshape(pat, [gh, gw, ph * pw, num_channels])
+            return pat
+
+        patches = image
+        patches = tf.map_fn(fn=lambda x: block_single_image(x), elems=patches)
+        patches_dim = tf.shape(patches)
+        patches = tf.reshape(
+            patches, [patches_dim[0], patches_dim[1], patches_dim[2], -1]
+        )
+        patches = tf.reshape(
+            patches,
+            (patches_dim[0], patches_dim[1] * patches_dim[2], ph * pw, num_channels),
+        )
+        return [patches, ph, pw]

     def get_config(self):
-        config = super().get_config().copy()
-        return config
+        return super().get_config()


 @tf.keras.utils.register_keras_serializable("maxim")
-class UnblockImages(layers.Layer):
+class TFUnblockImages(layers.Layer):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)

-    def call(self, x, grid_size, patch_size):
-        x = einops.rearrange(
-            x,
-            "n (gh gw) (fh fw) c -> n (gh fh) (gw fw) c",
-            gh=grid_size[0],
-            gw=grid_size[1],
-            fh=patch_size[0],
-            fw=patch_size[1],
+    def call(self, x, patch_size, grid_size):
+        bs, grid_sqrt, patch_sqrt, num_channels = (
+            tf.shape(x)[0],
+            tf.shape(x)[1],
+            tf.shape(x)[2],
+            tf.shape(x)[3],
         )
+        ph, pw = patch_size
+        gh, gw = grid_size

-        return x
+        pad = [[0, 0], [0, 0]]
+
+        y = tf.reshape(x, (bs, gh, gw, -1, num_channels))  # (bs, gh, gw, ph*pw, 3)
+        y = tf.expand_dims(y, 0)
+        y = tf.transpose(y, perm=[4, 1, 2, 3, 0, 5])
+        y = tf.reshape(y, [bs * ph * pw, gh, gw, num_channels])
+        y = tf.batch_to_space(y, [ph, pw], pad)
+
+        return y

     def get_config(self):
-        config = super().get_config().copy()
-        return config
+        return super().get_config()


 @tf.keras.utils.register_keras_serializable("maxim")
```
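As a sanity check on the block/unblock pair above, here is a NumPy sketch (our own, not part of the PR) of the intended round trip: blocking an image by a grid and then unblocking must reproduce the original input.

```python
import numpy as np

def block_by_grid(x, gh, gw):
    # (n, h, w, c) -> (n, gh*gw, ph*pw, c); mirrors TFBlockImagesByGrid's
    # contract, which also returns the derived patch size (ph, pw).
    n, h, w, c = x.shape
    ph, pw = h // gh, w // gw
    x = x.reshape(n, gh, ph, gw, pw, c).transpose(0, 1, 3, 2, 4, 5)
    return x.reshape(n, gh * gw, ph * pw, c), ph, pw

def unblock(x, gh, gw, ph, pw):
    # Inverse operation; mirrors TFUnblockImages' contract.
    n, _, _, c = x.shape
    x = x.reshape(n, gh, gw, ph, pw, c).transpose(0, 1, 3, 2, 4, 5)
    return x.reshape(n, gh * ph, gw * pw, c)

img = np.arange(2 * 6 * 4 * 3).reshape(2, 6, 4, 3)
patches, ph, pw = block_by_grid(img, 2, 2)
assert (ph, pw) == (3, 2)
assert patches.shape == (2, 4, 6, 3)          # 4 grid cells, 6 pixels each
assert np.array_equal(unblock(patches, 2, 2, ph, pw), img)  # round trip
```

The TF layers implement the same permutation with `space_to_batch_nd`/`batch_to_space` so that the shapes can stay dynamic; the NumPy version only illustrates the index bookkeeping.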
```diff
@@ -76,28 +125,31 @@ def get_config(self):


 @tf.keras.utils.register_keras_serializable("maxim")
-class Resizing(layers.Layer):
-    def __init__(self, height, width, antialias=True, method="bilinear", **kwargs):
+class Resizing(tf.keras.layers.Layer):
+    def __init__(self, ratio: float, method="bilinear", antialias=True, **kwargs):
         super().__init__(**kwargs)
-        self.height = height
-        self.width = width
-        self.antialias = antialias
+        self.ratio = ratio
         self.method = method
+        self.antialias = antialias

-    def call(self, x):
-        return tf.image.resize(
-            x,
-            size=(self.height, self.width),
-            antialias=self.antialias,
+    def call(self, img):
+        shape = tf.shape(img)
+
+        new_sh = tf.cast(shape[1:3], tf.float32) // self.ratio
+
+        x = tf.image.resize(
+            img,
+            size=tf.cast(new_sh, tf.int32),
             method=self.method,
+            antialias=self.antialias,
         )
+        return x

     def get_config(self):
         config = super().get_config().copy()
         config.update(
             {
-                "height": self.height,
-                "width": self.width,
+                "ratio": self.ratio,
                 "antialias": self.antialias,
                 "method": self.method,
             }
```

Reviewer: What is the need to segregate this into Up and Down?

Author: I found it easier to read, but indeed it adds a chunk of code. Reformatting to use a single layer only.

Reviewer: If it's easier to read, I would consider adding an elaborate comment in the script so that readers are aware.

Author: I think it is better now (with a single resizing layer).
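With the ratio-based Resizing above, the output spatial size is the input size floor-divided by `ratio`, so the layer no longer needs fixed `height`/`width` at construction time. A small sketch of that arithmetic (the helper name `resized_shape` is ours, not the PR's):

```python
import numpy as np

def resized_shape(h, w, ratio):
    # Mirrors: new_sh = tf.cast(shape[1:3], tf.float32) // self.ratio
    new_h = np.float32(h) // np.float32(ratio)
    new_w = np.float32(w) // np.float32(ratio)
    return int(new_h), int(new_w)

assert resized_shape(256, 256, 2.0) == (128, 128)  # ratio > 1 downsamples
assert resized_shape(256, 256, 0.5) == (512, 512)  # ratio < 1 upsamples
```

Note that sizes not divisible by the ratio are floored (e.g. a height of 255 with ratio 2.0 gives 127), which is the behavior of the `//` in the layer's `call`.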
Reviewer: Why is there a separate layer for handling blocking by grids?

Author: BlockByGrid can be implemented as follows (please see a more detailed explanation here). But while implementing TFBlockImages I used tf.split, which expects an int literal as the argument for num_or_size_splits. However, in cases where we only have the grid_size and the block_size has to be computed on the fly (as here), that count would be a tensor, and we can't use tf.split in that case. That's why I also wrote TFBlockImagesByGrid.
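The static-versus-dynamic distinction behind that answer can be sketched in plain Python (our own analogy, with NumPy scalars standing in for TF scalar tensors; the real restriction is that the number of split outputs must be known when the graph is built):

```python
import numpy as np

# Patch-based layer: the patch size is a constructor-style argument,
# so the split count is an ordinary Python int.
patch_size = (3, 2)
n_static = patch_size[0] * patch_size[1]   # 6
assert isinstance(n_static, int)           # acceptable as a split count

# Grid-based layer: the patch size is derived from the runtime image
# shape, so the same product is a scalar "tensor" (NumPy scalar here).
h, w = np.int64(6), np.int64(4)            # dynamic image dimensions
gh, gw = (2, 2)
n_dynamic = (h // gh) * (w // gw)          # value 6, but not a plain int
assert not isinstance(n_dynamic, int)      # hence a different code path
```

Both quantities equal 6 here, but only the first has a type that a static split count can use, which is why the grid-based layer takes the `map_fn`/`space_to_batch_nd` route instead.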