From ab06bfcb0c0d78219c7a6fc2b0b5b7035be41b53 Mon Sep 17 00:00:00 2001 From: Ian Coccimiglio Date: Sat, 20 Apr 2024 22:38:47 -0700 Subject: [PATCH] Fixed bug in re-registering model --- .../minimal_detection/tinyvit/tiny_vit.py | 479 ++++++++++++------ 1 file changed, 314 insertions(+), 165 deletions(-) diff --git a/src/napari_segment_everything/minimal_detection/tinyvit/tiny_vit.py b/src/napari_segment_everything/minimal_detection/tinyvit/tiny_vit.py index 235b866..f67547d 100644 --- a/src/napari_segment_everything/minimal_detection/tinyvit/tiny_vit.py +++ b/src/napari_segment_everything/minimal_detection/tinyvit/tiny_vit.py @@ -12,32 +12,57 @@ import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.models.layers import DropPath as TimmDropPath,\ - to_2tuple, trunc_normal_ -from timm.models.registry import register_model +from timm.models.layers import ( + DropPath as TimmDropPath, + to_2tuple, + trunc_normal_, +) +from timm.models.registry import register_model, list_models from typing import Tuple class Conv2d_BN(torch.nn.Sequential): - def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, - groups=1, bn_weight_init=1): + def __init__( + self, + a, + b, + ks=1, + stride=1, + pad=0, + dilation=1, + groups=1, + bn_weight_init=1, + ): super().__init__() - self.add_module('c', torch.nn.Conv2d( - a, b, ks, stride, pad, dilation, groups, bias=False)) + self.add_module( + "c", + torch.nn.Conv2d( + a, b, ks, stride, pad, dilation, groups, bias=False + ), + ) bn = torch.nn.BatchNorm2d(b) torch.nn.init.constant_(bn.weight, bn_weight_init) torch.nn.init.constant_(bn.bias, 0) - self.add_module('bn', bn) + self.add_module("bn", bn) @torch.no_grad() def fuse(self): c, bn = self._modules.values() - w = bn.weight / (bn.running_var + bn.eps)**0.5 + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 w = c.weight * w[:, None, None, None] - b = bn.bias - bn.running_mean * bn.weight / \ - (bn.running_var + bn.eps)**0.5 - m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size( - 0), w.shape[2:], stride=self.c.stride, padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) + b = ( + bn.bias + - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + ) + m = torch.nn.Conv2d( + w.size(1) * self.c.groups, + w.size(0), + w.shape[2:], + stride=self.c.stride, + padding=self.c.padding, + dilation=self.c.dilation, + groups=self.c.groups, + ) m.weight.data.copy_(w) m.bias.data.copy_(b) return m @@ -50,7 +75,7 @@ def __init__(self, drop_prob=None): def __repr__(self): msg = super().__repr__() - msg += f'(drop_prob={self.drop_prob})' + msg += f"(drop_prob={self.drop_prob})" return msg @@ -59,8 +84,9 @@ def __init__(self, in_chans, embed_dim, resolution, activation): super().__init__() img_size: Tuple[int, int] = to_2tuple(resolution) self.patches_resolution = (img_size[0] // 4, img_size[1] // 4) - self.num_patches = self.patches_resolution[0] * \ - self.patches_resolution[1] + self.num_patches = ( + self.patches_resolution[0] * self.patches_resolution[1] + ) self.in_chans = in_chans self.embed_dim = embed_dim n = embed_dim @@ -75,8 +101,9 @@ def forward(self, x): class MBConv(nn.Module): - def __init__(self, in_chans, out_chans, expand_ratio, - activation, drop_path): + def __init__( + self, in_chans, out_chans, expand_ratio, activation, drop_path + ): super().__init__() self.in_chans = in_chans self.hidden_chans = int(in_chans * expand_ratio) @@ -85,16 +112,24 @@ def __init__(self, in_chans, out_chans, expand_ratio, self.conv1 = 
Conv2d_BN(in_chans, self.hidden_chans, ks=1) self.act1 = activation() - self.conv2 = Conv2d_BN(self.hidden_chans, self.hidden_chans, - ks=3, stride=1, pad=1, groups=self.hidden_chans) + self.conv2 = Conv2d_BN( + self.hidden_chans, + self.hidden_chans, + ks=3, + stride=1, + pad=1, + groups=self.hidden_chans, + ) self.act2 = activation() self.conv3 = Conv2d_BN( - self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0) + self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0 + ) self.act3 = activation() - self.drop_path = DropPath( - drop_path) if drop_path > 0. else nn.Identity() + self.drop_path = ( + DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + ) def forward(self, x): shortcut = x @@ -124,10 +159,12 @@ def __init__(self, input_resolution, dim, out_dim, activation): self.out_dim = out_dim self.act = activation() self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0) - stride_c=2 - if(out_dim==320 or out_dim==448 or out_dim==576): - stride_c=1 - self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim) + stride_c = 2 + if out_dim == 320 or out_dim == 448 or out_dim == 576: + stride_c = 1 + self.conv2 = Conv2d_BN( + out_dim, out_dim, 3, stride_c, 1, groups=out_dim + ) self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0) def forward(self, x): @@ -148,12 +185,18 @@ def forward(self, x): class ConvLayer(nn.Module): - def __init__(self, dim, input_resolution, depth, - activation, - drop_path=0., downsample=None, use_checkpoint=False, - out_dim=None, - conv_expand_ratio=4., - ): + def __init__( + self, + dim, + input_resolution, + depth, + activation, + drop_path=0.0, + downsample=None, + use_checkpoint=False, + out_dim=None, + conv_expand_ratio=4.0, + ): super().__init__() self.dim = dim @@ -162,16 +205,27 @@ def __init__(self, dim, input_resolution, depth, self.use_checkpoint = use_checkpoint # build blocks - self.blocks = nn.ModuleList([ - MBConv(dim, dim, conv_expand_ratio, activation, - drop_path[i] if isinstance(drop_path, list) else drop_path, - ) - for i in range(depth)]) + self.blocks = nn.ModuleList( + [ + MBConv( + dim, + dim, + conv_expand_ratio, + activation, + drop_path[i] if isinstance(drop_path, list) else drop_path, + ) + for i in range(depth) + ] + ) # patch merging layer if downsample is not None: self.downsample = downsample( - input_resolution, dim=dim, out_dim=out_dim, activation=activation) + input_resolution, + dim=dim, + out_dim=out_dim, + activation=activation, + ) else: self.downsample = None @@ -187,8 +241,14 @@ def forward(self, x): class Mlp(nn.Module): - def __init__(self, in_features, hidden_features=None, - out_features=None, act_layer=nn.GELU, drop=0.): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -210,15 +270,19 @@ def forward(self, x): class Attention(torch.nn.Module): - def __init__(self, dim, key_dim, num_heads=8, - attn_ratio=4, - resolution=(14, 14), - ): + def __init__( + self, + dim, + key_dim, + num_heads=8, + attn_ratio=4, + resolution=(14, 14), + ): super().__init__() # (h, w) assert isinstance(resolution, tuple) and len(resolution) == 2 self.num_heads = num_heads - self.scale = key_dim ** -0.5 + self.scale = key_dim**-0.5 self.key_dim = key_dim self.nh_kd = nh_kd = key_dim * num_heads self.d = int(attn_ratio * key_dim) @@ -230,8 +294,9 @@ def __init__(self, dim, key_dim, num_heads=8, self.qkv = nn.Linear(dim, h) self.proj = nn.Linear(self.dh, dim) 
- points = list(itertools.product( - range(resolution[0]), range(resolution[1]))) + points = list( + itertools.product(range(resolution[0]), range(resolution[1])) + ) N = len(points) attention_offsets = {} idxs = [] @@ -242,21 +307,25 @@ def __init__(self, dim, key_dim, num_heads=8, attention_offsets[offset] = len(attention_offsets) idxs.append(attention_offsets[offset]) self.attention_biases = torch.nn.Parameter( - torch.zeros(num_heads, len(attention_offsets))) - self.register_buffer('attention_bias_idxs', - torch.LongTensor(idxs).view(N, N), - persistent=False) + torch.zeros(num_heads, len(attention_offsets)) + ) + self.register_buffer( + "attention_bias_idxs", + torch.LongTensor(idxs).view(N, N), + persistent=False, + ) @torch.no_grad() def train(self, mode=True): super().train(mode) - if mode and hasattr(self, 'ab'): + if mode and hasattr(self, "ab"): del self.ab else: self.ab = self.attention_biases[:, self.attention_bias_idxs] # self.register_buffer('ab', # self.attention_biases[:, self.attention_bias_idxs], # persistent=False) + def forward(self, x): # x (B,N,C) B, N, _ = x.shape @@ -265,17 +334,18 @@ def forward(self, x): # x (B,N,C) qkv = self.qkv(x) # (B, N, num_heads, d) - q, k, v = qkv.view(B, N, self.num_heads, - - 1).split([self.key_dim, self.key_dim, self.d], dim=3) + q, k, v = qkv.view(B, N, self.num_heads, -1).split( + [self.key_dim, self.key_dim, self.d], dim=3 + ) # (B, num_heads, N, d) q = q.permute(0, 2, 1, 3) k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) - attn = ( - (q @ k.transpose(-2, -1)) * self.scale - + - (self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab) + attn = (q @ k.transpose(-2, -1)) * self.scale + ( + self.attention_biases[:, self.attention_bias_idxs] + if self.training + else self.ab ) attn = attn.softmax(dim=-1) x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) @@ -284,7 +354,7 @@ def forward(self, x): # x (B,N,C) class TinyViTBlock(nn.Module): - r""" TinyViT Block. + r"""TinyViT Block. Args: dim (int): Number of input channels. @@ -299,37 +369,55 @@ class TinyViTBlock(nn.Module): activation: the activation function. Default: nn.GELU """ - def __init__(self, dim, input_resolution, num_heads, window_size=7, - mlp_ratio=4., drop=0., drop_path=0., - local_conv_size=3, - activation=nn.GELU, - ): + def __init__( + self, + dim, + input_resolution, + num_heads, + window_size=7, + mlp_ratio=4.0, + drop=0.0, + drop_path=0.0, + local_conv_size=3, + activation=nn.GELU, + ): super().__init__() self.dim = dim self.input_resolution = input_resolution self.num_heads = num_heads - assert window_size > 0, 'window_size must be greater than 0' + assert window_size > 0, "window_size must be greater than 0" self.window_size = window_size self.mlp_ratio = mlp_ratio - self.drop_path = DropPath( - drop_path) if drop_path > 0. 
else nn.Identity() + self.drop_path = ( + DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + ) - assert dim % num_heads == 0, 'dim must be divisible by num_heads' + assert dim % num_heads == 0, "dim must be divisible by num_heads" head_dim = dim // num_heads window_resolution = (window_size, window_size) - self.attn = Attention(dim, head_dim, num_heads, - attn_ratio=1, resolution=window_resolution) + self.attn = Attention( + dim, + head_dim, + num_heads, + attn_ratio=1, + resolution=window_resolution, + ) mlp_hidden_dim = int(dim * mlp_ratio) mlp_activation = activation - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, - act_layer=mlp_activation, drop=drop) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=mlp_activation, + drop=drop, + ) pad = local_conv_size // 2 self.local_conv = Conv2d_BN( - dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim) + dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim + ) def forward(self, x): H, W = self.input_resolution @@ -340,10 +428,12 @@ def forward(self, x): x = self.attn(x) else: x = x.view(B, H, W, C) - pad_b = (self.window_size - H % - self.window_size) % self.window_size - pad_r = (self.window_size - W % - self.window_size) % self.window_size + pad_b = ( + self.window_size - H % self.window_size + ) % self.window_size + pad_r = ( + self.window_size - W % self.window_size + ) % self.window_size padding = pad_b > 0 or pad_r > 0 if padding: @@ -353,12 +443,18 @@ def forward(self, x): nH = pH // self.window_size nW = pW // self.window_size # window partition - x = x.view(B, nH, self.window_size, nW, self.window_size, C).transpose(2, 3).reshape( - B * nH * nW, self.window_size * self.window_size, C) + x = ( + x.view(B, nH, self.window_size, nW, self.window_size, C) + .transpose(2, 3) + .reshape(B * nH * nW, self.window_size * self.window_size, C) + ) x = self.attn(x) # window reverse - x = x.view(B, nH, nW, self.window_size, self.window_size, - C).transpose(2, 3).reshape(B, pH, pW, C) + x = ( + x.view(B, nH, nW, self.window_size, self.window_size, C) + .transpose(2, 3) + .reshape(B, pH, pW, C) + ) if padding: x = x[:, :H, :W].contiguous() @@ -375,12 +471,14 @@ def forward(self, x): return x def extra_repr(self) -> str: - return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ - f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}" + return ( + f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " + f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}" + ) class BasicLayer(nn.Module): - """ A basic TinyViT layer for one stage. + """A basic TinyViT layer for one stage. Args: dim (int): Number of input channels. @@ -398,13 +496,22 @@ class BasicLayer(nn.Module): out_dim: the output dimension of the layer. 
Default: dim """ - def __init__(self, dim, input_resolution, depth, num_heads, window_size, - mlp_ratio=4., drop=0., - drop_path=0., downsample=None, use_checkpoint=False, - local_conv_size=3, - activation=nn.GELU, - out_dim=None, - ): + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4.0, + drop=0.0, + drop_path=0.0, + downsample=None, + use_checkpoint=False, + local_conv_size=3, + activation=nn.GELU, + out_dim=None, + ): super().__init__() self.dim = dim @@ -413,22 +520,35 @@ def __init__(self, dim, input_resolution, depth, num_heads, window_size, self.use_checkpoint = use_checkpoint # build blocks - self.blocks = nn.ModuleList([ - TinyViTBlock(dim=dim, input_resolution=input_resolution, - num_heads=num_heads, window_size=window_size, - mlp_ratio=mlp_ratio, - drop=drop, - drop_path=drop_path[i] if isinstance( - drop_path, list) else drop_path, - local_conv_size=local_conv_size, - activation=activation, - ) - for i in range(depth)]) + self.blocks = nn.ModuleList( + [ + TinyViTBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + drop=drop, + drop_path=( + drop_path[i] + if isinstance(drop_path, list) + else drop_path + ), + local_conv_size=local_conv_size, + activation=activation, + ) + for i in range(depth) + ] + ) # patch merging layer if downsample is not None: self.downsample = downsample( - input_resolution, dim=dim, out_dim=out_dim, activation=activation) + input_resolution, + dim=dim, + out_dim=out_dim, + activation=activation, + ) else: self.downsample = None @@ -445,6 +565,7 @@ def forward(self, x): def extra_repr(self) -> str: return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + class LayerNorm2d(nn.Module): def __init__(self, num_channels: int, eps: float = 1e-6) -> None: super().__init__() @@ -458,22 +579,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = (x - u) / torch.sqrt(s + self.eps) x = self.weight[:, None, None] * x + self.bias[:, None, None] return x + + class TinyViT(nn.Module): - def __init__(self, img_size=224, in_chans=3, num_classes=1000, - embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_sizes=[7, 7, 14, 7], - mlp_ratio=4., - drop_rate=0., - drop_path_rate=0.1, - use_checkpoint=False, - mbconv_expand_ratio=4.0, - local_conv_size=3, - layer_lr_decay=1.0, - ): + def __init__( + self, + img_size=224, + in_chans=3, + num_classes=1000, + embed_dims=[96, 192, 384, 768], + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_sizes=[7, 7, 14, 7], + mlp_ratio=4.0, + drop_rate=0.0, + drop_path_rate=0.1, + use_checkpoint=False, + mbconv_expand_ratio=4.0, + local_conv_size=3, + layer_lr_decay=1.0, + ): super().__init__() - self.img_size=img_size - #import pdb;pdb.set_trace() + self.img_size = img_size + # import pdb;pdb.set_trace() self.num_classes = num_classes self.depths = depths self.num_layers = len(depths) @@ -481,35 +609,45 @@ def __init__(self, img_size=224, in_chans=3, num_classes=1000, activation = nn.GELU - self.patch_embed = PatchEmbed(in_chans=in_chans, - embed_dim=embed_dims[0], - resolution=img_size, - activation=activation) + self.patch_embed = PatchEmbed( + in_chans=in_chans, + embed_dim=embed_dims[0], + resolution=img_size, + activation=activation, + ) patches_resolution = self.patch_embed.patches_resolution self.patches_resolution = patches_resolution # stochastic depth - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, - 
sum(depths))] # stochastic depth decay rule + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in range(self.num_layers): - kwargs = dict(dim=embed_dims[i_layer], - input_resolution=(patches_resolution[0] // (2 ** (i_layer-1 if i_layer == 3 else i_layer)), - patches_resolution[1] // (2 ** (i_layer-1 if i_layer == 3 else i_layer))), - # input_resolution=(patches_resolution[0] // (2 ** i_layer), - # patches_resolution[1] // (2 ** i_layer)), - depth=depths[i_layer], - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - downsample=PatchMerging if ( - i_layer < self.num_layers - 1) else None, - use_checkpoint=use_checkpoint, - out_dim=embed_dims[min( - i_layer + 1, len(embed_dims) - 1)], - activation=activation, - ) + kwargs = dict( + dim=embed_dims[i_layer], + input_resolution=( + patches_resolution[0] + // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)), + patches_resolution[1] + // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)), + ), + # input_resolution=(patches_resolution[0] // (2 ** i_layer), + # patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + drop_path=dpr[ + sum(depths[:i_layer]) : sum(depths[: i_layer + 1]) + ], + downsample=( + PatchMerging if (i_layer < self.num_layers - 1) else None + ), + use_checkpoint=use_checkpoint, + out_dim=embed_dims[min(i_layer + 1, len(embed_dims) - 1)], + activation=activation, + ) if i_layer == 0: layer = ConvLayer( conv_expand_ratio=mbconv_expand_ratio, @@ -522,13 +660,17 @@ def __init__(self, img_size=224, in_chans=3, num_classes=1000, mlp_ratio=self.mlp_ratio, drop=drop_rate, local_conv_size=local_conv_size, - **kwargs) + **kwargs, + ) self.layers.append(layer) # Classifier head self.norm_head = nn.LayerNorm(embed_dims[-1]) - self.head = nn.Linear( - embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity() + self.head = ( + nn.Linear(embed_dims[-1], num_classes) + if num_classes > 0 + else torch.nn.Identity() + ) # init weights self.apply(self._init_weights) @@ -550,13 +692,14 @@ def __init__(self, img_size=224, in_chans=3, num_classes=1000, ), LayerNorm2d(256), ) + def set_layer_lr_decay(self, layer_lr_decay): decay_rate = layer_lr_decay # layers -> blocks (depth) depth = sum(self.depths) lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)] - #print("LR SCALES:", lr_scales) + # print("LR SCALES:", lr_scales) def _set_lr_scale(m, scale): for p in m.parameters(): @@ -570,7 +713,8 @@ def _set_lr_scale(m, scale): i += 1 if layer.downsample is not None: layer.downsample.apply( - lambda x: _set_lr_scale(x, lr_scales[i - 1])) + lambda x: _set_lr_scale(x, lr_scales[i - 1]) + ) assert i == depth for m in [self.norm_head, self.head]: m.apply(lambda x: _set_lr_scale(x, lr_scales[-1])) @@ -580,13 +724,13 @@ def _set_lr_scale(m, scale): def _check_lr_scale(m): for p in m.parameters(): - assert hasattr(p, 'lr_scale'), p.param_name + assert hasattr(p, "lr_scale"), p.param_name self.apply(_check_lr_scale) def _init_weights(self, m): if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=.02) + trunc_normal_(m.weight, std=0.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): @@ -595,7 +739,7 @@ def _init_weights(self, m): @torch.jit.ignore def no_weight_decay_keywords(self): - return {'attention_biases'} + return {"attention_biases"} def forward_features(self, x): # x: (N, C, H, W) @@ -607,52 +751,57 @@ def 
forward_features(self, x): for i in range(start_i, len(self.layers)): layer = self.layers[i] x = layer(x) - B,_,C=x.size() + B, _, C = x.size() x = x.view(B, 64, 64, C) - x=x.permute(0, 3, 1, 2) - x=self.neck(x) + x = x.permute(0, 3, 1, 2) + x = self.neck(x) return x def forward(self, x): x = self.forward_features(x) - #x = self.norm_head(x) - #x = self.head(x) + # x = self.norm_head(x) + # x = self.head(x) return x -_checkpoint_url_format = \ - 'https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth' +_checkpoint_url_format = "https://github.com/wkcn/TinyViT-model-zoo/releases/download/checkpoints/{}.pth" _provided_checkpoints = { - 'tiny_vit_5m_224': 'tiny_vit_5m_22kto1k_distill', - 'tiny_vit_11m_224': 'tiny_vit_11m_22kto1k_distill', - 'tiny_vit_21m_224': 'tiny_vit_21m_22kto1k_distill', - 'tiny_vit_21m_384': 'tiny_vit_21m_22kto1k_384_distill', - 'tiny_vit_21m_512': 'tiny_vit_21m_22kto1k_512_distill', + "tiny_vit_5m_224": "tiny_vit_5m_22kto1k_distill", + "tiny_vit_11m_224": "tiny_vit_11m_22kto1k_distill", + "tiny_vit_21m_224": "tiny_vit_21m_22kto1k_distill", + "tiny_vit_21m_384": "tiny_vit_21m_22kto1k_384_distill", + "tiny_vit_21m_512": "tiny_vit_21m_22kto1k_512_distill", } def register_tiny_vit_model(fn): - '''Register a TinyViT model + """Register a TinyViT model It is a wrapper of `register_model` with loading the pretrained checkpoint. - ''' + """ + def fn_wrapper(pretrained=False, **kwargs): model = fn() if pretrained: model_name = fn.__name__ - assert model_name in _provided_checkpoints, \ - f'Sorry that the checkpoint `{model_name}` is not provided yet.' + assert ( + model_name in _provided_checkpoints + ), f"Sorry that the checkpoint `{model_name}` is not provided yet." url = _checkpoint_url_format.format( - _provided_checkpoints[model_name]) + _provided_checkpoints[model_name] + ) checkpoint = torch.hub.load_state_dict_from_url( url=url, - map_location='cpu', check_hash=False, + map_location="cpu", + check_hash=False, ) - model.load_state_dict(checkpoint['model']) + model.load_state_dict(checkpoint["model"]) return model # rename the name of fn_wrapper fn_wrapper.__name__ = fn.__name__ + if fn_wrapper.__name__ in list_models(): + return return register_model(fn_wrapper)
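
A minimal sketch (not part of the patch) of the duplicate-registration guard this commit adds, assuming timm's `register_model`/`list_models` registry API, both of which the patch imports from `timm.models.registry`; the names `safe_register` and `my_model` below are hypothetical and used only for illustration:

from timm.models.registry import register_model, list_models

def safe_register(fn):
    # Skip timm registration when a model with this name is already in the
    # registry, e.g. when the defining module is executed a second time in
    # the same session (the "re-registering model" case the commit fixes).
    if fn.__name__ in list_models():
        return fn
    return register_model(fn)

@safe_register
def my_model(pretrained=False, **kwargs):
    ...

In this sketch, returning `fn` when the name is already registered keeps the decorated module-level symbol bound to a callable even though registration is skipped; the guard otherwise mirrors the `if fn_wrapper.__name__ in list_models(): return` check added at the end of `register_tiny_vit_model` above.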