# model.py

'''
- `train-folder`: The folder of training images.
- `valid-folder`: The folder of validation images.
- `model-folder`: The folder where the trained model is saved.
- `num-classes`: The number of classes in your problem.
- `batch-size`: The batch size of the dataset.
- `c`: Patch projection dimension.
- `ds`: Token-mixing units. Mentioned in the paper on [page 3](https://arxiv.org/pdf/2105.01601.pdf).
- `dc`: Channel-mixing units. Mentioned in the paper on [page 3](https://arxiv.org/pdf/2105.01601.pdf).
- `num-of-mlp-blocks`: The number of MLP blocks.
- `learning-rate`: The learning rate of the Adam optimizer.
'''

import numpy as np
import tensorflow as tf
from keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Embedding, Input, LayerNormalization, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras.layers.experimental.preprocessing import Normalization, Resizing, RandomFlip, RandomRotation, \
    RandomZoom
from tensorflow.keras.models import Sequential


class Patches(layers.Layer):
    """Split a batch of images into flattened, non-overlapping square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch. It is used
            both as the window size and as the stride, so patches never
            overlap.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``)
            forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return the patches of ``images`` as a 3-D tensor.

        Args:
            images: 4-D image batch, as required by
                ``tf.image.extract_patches``.

        Returns:
            Tensor of shape ``(batch, num_patches, patch_dim)`` where
            ``patch_dim`` is the flattened size of a single patch.
        """
        # The batch size is read dynamically so the layer also works when the
        # batch dimension is unknown at graph-construction time.
        batch_size = tf.shape(images)[0]
        window = [1, self.patch_size, self.patch_size, 1]
        patches = tf.image.extract_patches(
            images=images,
            sizes=window,
            strides=window,
            rates=[1, 1, 1, 1],
            # 'VALID' drops border pixels that do not fill a whole patch.
            padding='VALID',
        )
        # Flattened length of one patch, taken from the static shape.
        dim = patches.shape[-1]

        # Collapse the 2-D grid of patches into a single sequence axis.
        return tf.reshape(patches, (batch_size, -1, dim))

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config


class MLPBlock(tf.keras.layers.Layer):
    """One Mixer block: a token-mixing MLP followed by a channel-mixing MLP.

    Each MLP is two bias-free matrix multiplications with a GELU in between,
    preceded by layer normalization and wrapped in a residual connection.

    Args:
        S: Number of patches (tokens) in the input sequence.
        C: Number of channels per patch.
        DS: Hidden width of the token-mixing MLP.
        DC: Hidden width of the channel-mixing MLP.
        **kwargs: Standard Keras layer arguments (e.g. ``name``) forwarded to
            the base ``Layer`` constructor.
    """

    def __init__(self, S, C, DS, DC, **kwargs):
        super().__init__(**kwargs)
        self.layerNorm1 = LayerNormalization()
        self.layerNorm2 = LayerNormalization()
        self.DS = DS
        self.DC = DC

        w_init = tf.random_normal_initializer()
        # Weights are registered through add_weight so that Keras tracks them
        # explicitly and they get meaningful names in checkpoints/summaries.
        # Token-mixing weights: S -> DS -> S.
        self.W1 = self.add_weight(
            name="W1", shape=(S, DS), initializer=w_init,
            dtype="float32", trainable=True,
        )
        self.W2 = self.add_weight(
            name="W2", shape=(DS, S), initializer=w_init,
            dtype="float32", trainable=True,
        )
        # Channel-mixing weights: C -> DC -> C.
        self.W3 = self.add_weight(
            name="W3", shape=(C, DC), initializer=w_init,
            dtype="float32", trainable=True,
        )
        self.W4 = self.add_weight(
            name="W4", shape=(DC, C), initializer=w_init,
            dtype="float32", trainable=True,
        )

    def call(self, X):
        """Apply token mixing and then channel mixing to ``X``.

        Args:
            X: Tensor of shape ``(batch, S, C)``.

        Returns:
            Tensor with the same shape as ``X``.
        """
        # --- Token mixing ---
        # Transpose so that the matmul mixes information across patches:
        # (batch, S, C) -> (batch, C, S).
        X_T = tf.transpose(self.layerNorm1(X), perm=(0, 2, 1))

        # (batch, C, S) . (S, DS) = (batch, C, DS)
        token_hidden = tf.nn.gelu(tf.matmul(X_T, self.W1))

        # (batch, C, DS) . (DS, S) = (batch, C, S); transpose back to
        # (batch, S, C) and add the residual.
        U = tf.transpose(tf.matmul(token_hidden, self.W2), perm=(0, 2, 1)) + X

        # --- Channel mixing ---
        # (batch, S, C) . (C, DC) = (batch, S, DC)
        channel_hidden = tf.nn.gelu(tf.matmul(self.layerNorm2(U), self.W3))

        # (batch, S, DC) . (DC, C) = (batch, S, C), plus the residual.
        return tf.matmul(channel_hidden, self.W4) + U


class MLPMixer(tf.keras.models.Model):
    """MLP-Mixer image classifier.

    Pipeline: data augmentation -> patch extraction -> per-patch linear
    projection -> a stack of ``MLPBlock`` layers -> global average pooling,
    dropout and a softmax classification layer.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        S: Number of patches per image.
        C: Patch projection dimension (channels per patch).
        DS: Hidden width of the token-mixing MLPs.
        DC: Hidden width of the channel-mixing MLPs.
        num_of_mlp_blocks: Number of ``MLPBlock`` layers to stack.
        image_size: Side length of the input images; stored but not otherwise
            used by this class.
        batch_size: Batch size; stored but not otherwise used by this class.
        num_classes: Number of output classes.
    """

    def __init__(self, patch_size, S, C, DS, DC, num_of_mlp_blocks, image_size, batch_size, num_classes):
        super().__init__()
        self.projection = Dense(C)
        self.mlpBlocks = [MLPBlock(S, C, DS, DC) for _ in range(num_of_mlp_blocks)]

        self.batch_size = batch_size
        self.patch_size = patch_size
        self.S = S
        self.C = C
        self.DS = DS
        self.DC = DC
        self.image_size = image_size
        self.num_classes = num_classes

        # NOTE(review): the Normalization layer is never adapted in this
        # class; confirm whether adapt() is called elsewhere or whether the
        # default (un-adapted) statistics are intended.
        self.data_augmentation = tf.keras.Sequential(
            [
                Normalization(),
                # Resizing(image_size, image_size),
                RandomFlip("horizontal"),
                RandomRotation(factor=0.02),
                RandomZoom(
                    height_factor=0.2, width_factor=0.2
                ),
            ],
            name="data_augmentation",
        )

        self.classificationLayer = Sequential([
            GlobalAveragePooling1D(),
            Dropout(0.2),
            Dense(num_classes, activation='softmax')
        ])

    def extract_patches(self, images, patch_size):
        """Split ``images`` into flattened, non-overlapping square patches.

        Args:
            images: 4-D image batch, as required by
                ``tf.image.extract_patches``.
            patch_size: Side length of each patch; also used as the stride.

        Returns:
            Tensor of shape ``(batch, num_patches, patch_dim)``.
        """
        batch_size = tf.shape(images)[0]

        window = [1, patch_size, patch_size, 1]
        patches = tf.image.extract_patches(
            images=images,
            sizes=window,
            strides=window,
            rates=[1, 1, 1, 1],
            padding="VALID",
        )

        # Use the real flattened patch length rather than assuming RGB input,
        # so images with any channel count reshape correctly.
        patch_dim = patches.shape[-1]
        if patch_dim is None:
            # Channel count is not statically known: keep the previous
            # behaviour of assuming 3-channel images.
            patch_dim = 3 * patch_size ** 2

        return tf.reshape(patches, [batch_size, -1, patch_dim])

    def call(self, images, training=None):
        """Compute class probabilities for a batch of images.

        Args:
            images: Image batch, e.g. shape
                ``(batch_size, image_size, image_size, channels)``.
            training: Whether the model runs in training mode. Forwarded to
                the augmentation and classification sub-models, which contain
                random-augmentation and dropout layers. ``None`` lets Keras
                decide from the calling context.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` with softmax
            probabilities.
        """
        augmented_images = self.data_augmentation(images, training=training)

        # patches shape: (batch_size, S, patch_dim)
        X = self.extract_patches(augmented_images, self.patch_size)

        # Per-patch fully-connected projection.
        # X shape: (batch_size, S, C)
        X = self.projection(X)

        # Each block preserves the (batch_size, S, C) shape.
        for block in self.mlpBlocks:
            X = block(X)

        # out shape: (batch_size, num_classes)
        return self.classificationLayer(X, training=training)