
Commit 37917e0

Make ViT and UNETR TorchScript compatible (#7937)
Fixes #7936

### Description

- Pre-define `self.causal_mask = torch.Tensor()` before registering the buffer
- Move `norm_cross_attn` and `cross_attn` out of the `if` block

### Types of changes

- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

---------

Signed-off-by: YunLiu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 7a8680e commit 37917e0
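
For orientation, a minimal sketch (not part of the commit) of what the change enables: scripting both networks with `torch.jit.script`. The constructor arguments below are illustrative defaults, not values taken from the MONAI tests, and the calls are only expected to succeed with this commit applied.

```python
# Hedged sketch: checking TorchScript compatibility of ViT and UNETR.
# Argument values are illustrative; only the class names come from MONAI.
import torch
from monai.networks.nets import UNETR, ViT

vit = ViT(in_channels=1, img_size=(96, 96, 96), patch_size=(16, 16, 16))
unetr = UNETR(in_channels=1, out_channels=2, img_size=(96, 96, 96))

# Expected to succeed once this fix is in place.
scripted_vit = torch.jit.script(vit)
scripted_unetr = torch.jit.script(unetr)

x = torch.randn(1, 1, 96, 96, 96)
with torch.no_grad():
    vit_out, hidden_states = scripted_vit(x)  # ViT returns (tokens, hidden states)
    seg = scripted_unetr(x)
print(vit_out.shape, seg.shape)
```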

4 files changed (+17, -12 lines)

monai/networks/blocks/crossattention.py

Lines changed: 3 additions & 1 deletion
@@ -109,6 +109,8 @@ def __init__(
                 torch.tril(torch.ones(sequence_length, sequence_length)).view(1, 1, sequence_length, sequence_length),
             )
             self.causal_mask: torch.Tensor
+        else:
+            self.causal_mask = torch.Tensor()
 
         self.att_mat = torch.Tensor()
         self.rel_positional_embedding = (
@@ -118,7 +120,7 @@ def __init__(
         )
         self.input_size = input_size
 
-    def forward(self, x: torch.Tensor, context: torch.Tensor | None = None):
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None):
         """
         Args:
             x (torch.Tensor): input tensor. B x (s_dim_1 * ... * s_dim_n) x C
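
The added `else` branch matters because `torch.jit.script` compiles `forward` for both the causal and non-causal configurations and requires every attribute it touches to exist with a known type. A toy module illustrating the same pattern follows; the names are invented for illustration and this is not MONAI code.

```python
# Toy illustration (hypothetical module): TorchScript needs `causal_mask` to exist
# on every construction path, so the non-causal branch assigns a placeholder tensor.
import torch
import torch.nn as nn


class CausalToy(nn.Module):
    def __init__(self, causal: bool = False, sequence_length: int = 4):
        super().__init__()
        self.causal = causal
        if causal:
            self.register_buffer(
                "causal_mask",
                torch.tril(torch.ones(sequence_length, sequence_length)).view(1, 1, sequence_length, sequence_length),
            )
            self.causal_mask: torch.Tensor
        else:
            self.causal_mask = torch.Tensor()  # placeholder so the attribute always exists

    def forward(self, att: torch.Tensor) -> torch.Tensor:
        if self.causal:
            att = att.masked_fill(self.causal_mask[:, :, : att.size(-2), : att.size(-1)] == 0, float("-inf"))
        return att


# Both configurations should script once the else branch defines the attribute.
torch.jit.script(CausalToy(causal=False))
torch.jit.script(CausalToy(causal=True))
```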

monai/networks/blocks/selfattention.py

Lines changed: 2 additions & 0 deletions
@@ -105,6 +105,8 @@ def __init__(
                 torch.tril(torch.ones(sequence_length, sequence_length)).view(1, 1, sequence_length, sequence_length),
             )
             self.causal_mask: torch.Tensor
+        else:
+            self.causal_mask = torch.Tensor()
 
         self.rel_positional_embedding = (
             get_rel_pos_embedding_layer(rel_pos_embedding, input_size, self.dim_head, self.num_heads)

monai/networks/blocks/transformerblock.py

Lines changed: 7 additions & 6 deletions
@@ -11,6 +11,8 @@
 
 from __future__ import annotations
 
+from typing import Optional
+
 import torch
 import torch.nn as nn
 
@@ -68,13 +70,12 @@ def __init__(
         self.norm2 = nn.LayerNorm(hidden_size)
         self.with_cross_attention = with_cross_attention
 
-        if self.with_cross_attention:
-            self.norm_cross_attn = nn.LayerNorm(hidden_size)
-            self.cross_attn = CrossAttentionBlock(
-                hidden_size=hidden_size, num_heads=num_heads, dropout_rate=dropout_rate, qkv_bias=qkv_bias, causal=False
-            )
+        self.norm_cross_attn = nn.LayerNorm(hidden_size)
+        self.cross_attn = CrossAttentionBlock(
+            hidden_size=hidden_size, num_heads=num_heads, dropout_rate=dropout_rate, qkv_bias=qkv_bias, causal=False
+        )
 
-    def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None) -> torch.Tensor:
         x = x + self.attn(self.norm1(x))
         if self.with_cross_attention:
             x = x + self.cross_attn(self.norm_cross_attn(x), context=context)
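
Moving the cross-attention submodules out of the `if` block addresses a related TorchScript constraint: `forward` references `self.cross_attn` and `self.norm_cross_attn` inside a branch, and the compiler resolves those attributes even when `with_cross_attention` is `False`. A hedged toy sketch of the same idea, with invented names and `nn.Linear` standing in for `CrossAttentionBlock`:

```python
# Toy illustration (hypothetical module): torch.jit.script resolves every attribute
# referenced in forward, including those used only inside a conditional branch,
# so the submodules must always exist.
import torch
import torch.nn as nn


class ToyBlock(nn.Module):
    def __init__(self, hidden_size: int = 8, with_cross_attention: bool = False):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size)
        self.with_cross_attention = with_cross_attention
        # Built unconditionally, mirroring the change above; unused when the flag is False.
        self.norm_cross_attn = nn.LayerNorm(hidden_size)
        self.cross_attn = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.norm1(x)
        if self.with_cross_attention:
            x = x + self.cross_attn(self.norm_cross_attn(x))
        return x


# Scripts cleanly; if the submodules were created only under the flag, scripting
# would fail with roughly "module has no attribute 'cross_attn'".
scripted = torch.jit.script(ToyBlock(with_cross_attention=False))
print(scripted(torch.randn(2, 4, 8)).shape)
```

The trade-off is a few parameters that go unused when cross-attention is disabled, in exchange for a module that scripts in every configuration.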

tests/test_vit.py

Lines changed: 5 additions & 5 deletions
@@ -30,7 +30,7 @@
                         for mlp_dim in [3072]:
                             for num_layers in [4]:
                                 for num_classes in [8]:
-                                    for pos_embed in ["conv", "perceptron"]:
+                                    for proj_type in ["conv", "perceptron"]:
                                         for classification in [False, True]:
                                             for nd in (2, 3):
                                                 test_case = [
@@ -42,7 +42,7 @@
                                                         "mlp_dim": mlp_dim,
                                                         "num_layers": num_layers,
                                                         "num_heads": num_heads,
-                                                        "pos_embed": pos_embed,
+                                                        "proj_type": proj_type,
                                                         "classification": classification,
                                                         "num_classes": num_classes,
                                                         "dropout_rate": dropout_rate,
@@ -87,7 +87,7 @@ def test_ill_arg(
         mlp_dim,
         num_layers,
         num_heads,
-        pos_embed,
+        proj_type,
         classification,
         dropout_rate,
     ):
@@ -100,12 +100,12 @@ def test_ill_arg(
                 mlp_dim=mlp_dim,
                 num_layers=num_layers,
                 num_heads=num_heads,
-                pos_embed=pos_embed,
+                proj_type=proj_type,
                 classification=classification,
                 dropout_rate=dropout_rate,
             )
 
-    @parameterized.expand(TEST_CASE_Vit)
+    @parameterized.expand(TEST_CASE_Vit[:1])
     @SkipIfBeforePyTorchVersion((1, 9))
     def test_script(self, input_param, input_shape, _):
         net = ViT(**(input_param))