
Commit 42db7c0

Refactoring to be easier to follow
Signed-off-by: Lucas Robinet <[email protected]>
1 parent c869d69 commit 42db7c0

2 files changed: +18 additions, -24 deletions


monai/networks/blocks/selfattention.py

Lines changed: 5 additions & 14 deletions
@@ -159,7 +159,7 @@ def forward(self, x, attn_mask: torch.Tensor | None = None):
         Args:
             x (torch.Tensor): input tensor. B x (s_dim_1 * ... * s_dim_n) x C
             attn_mask (torch.Tensor, optional): mask to apply to the attention matrix.
-                Defaults to None. B x N_heads x (s_dim_1 * ... * s_dim_n) x (s_dim_1 * ... * s_dim_n).
+                B x (s_dim_1 * ... * s_dim_n). Defaults to None.
 
         Return:
             torch.Tensor: B x (s_dim_1 * ... * s_dim_n) x C
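
The docstring change above narrows the mask contract: instead of a full per-head attention mask of shape B x N_heads x N x N, callers now pass a key-padding mask of shape B x N, one boolean per token. A minimal shape sketch in plain PyTorch with hypothetical sizes (not MONAI code; the internal expansion is the forward() hunk below):

import torch

B, N_heads, N = 2, 4, 16
# previous contract: full attention mask, one entry per (head, query, key)
old_style_mask = torch.ones(B, N_heads, N, N, dtype=torch.bool)
# new contract: key-padding mask, one flag per token
new_style_mask = torch.ones(B, N, dtype=torch.bool)
new_style_mask[:, -3:] = False  # e.g. the last three tokens are padding
# the block expands it internally before masking (see the next hunk)
expanded = new_style_mask.unsqueeze(1).unsqueeze(2).expand(-1, N_heads, -1, -1)
print(expanded.shape)  # torch.Size([2, 4, 1, 16]), broadcast over queries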
@@ -194,14 +194,15 @@ def forward(self, x, attn_mask: torch.Tensor | None = None):
             att_mat = self.rel_positional_embedding(x, att_mat, q)
 
         if self.causal:
+            assert attn_mask is None, "Causal attention does not support attention masks."
             att_mat = att_mat.masked_fill(self.causal_mask[:, :, : x.shape[-2], : x.shape[-2]] == 0, float("-inf"))
 
         if attn_mask is not None:
-            attn_mask = attn_mask[:, None, :, None] * attn_mask[:, None, None, :]
-            att_mat.masked_fill_(~attn_mask, torch.finfo(att_mat.dtype).min)
+            attn_mask = attn_mask.unsqueeze(1).unsqueeze(2)
+            attn_mask = attn_mask.expand(-1, self.num_heads, -1, -1)
+            att_mat = att_mat.masked_fill(attn_mask == 0, float("-inf"))
 
         att_mat = att_mat.softmax(dim=-1)
-
         if self.save_attn:
             # no gradients and new tensor;
             # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
@@ -215,13 +216,3 @@ def forward(self, x, attn_mask: torch.Tensor | None = None):
         x = self.out_proj(x)
         x = self.drop_output(x)
         return x
-
-
-if __name__ == "__main__":
-    sa = SABlock(128, 1)
-    x = torch.randn(1, 6, 128)
-    mask = torch.ones((1, 6), dtype=torch.bool)
-    mask[0][2] = False
-    print(mask)
-    out = sa(x, attn_mask=mask)
-    print(out.shape)
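
The deleted __main__ demo above still maps onto the new API. Here is the equivalent as a hedged usage sketch, assuming only what this diff shows (SABlock imported from monai/networks/blocks/selfattention.py, save_attn=True to expose block.att_mat, as in the tests below):

import torch
from monai.networks.blocks.selfattention import SABlock

block = SABlock(hidden_size=128, num_heads=1, save_attn=True)
x = torch.randn(1, 6, 128)
mask = torch.ones(1, 6, dtype=torch.bool)
mask[0, 2] = False                     # mask out the third token
out = block(x, attn_mask=mask)         # shape (1, 6, 128), i.e. B x N x C
# masked key columns were filled with -inf before the softmax,
# so the saved attention matrix is exactly zero there
att_mat = block.att_mat                # B x num_heads x N x N
masked_cols = att_mat[..., ~mask[0]]
assert torch.allclose(masked_cols, torch.zeros_like(masked_cols))
print(out.shape)                       # torch.Size([1, 6, 128])

Note the behavioral change versus the old two-sided mask: a masked query row is no longer forced to a uniform distribution; it simply attends over the unmasked keys, while masked key columns go to exact zero.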

tests/test_selfattention.py

Lines changed: 13 additions & 10 deletions
@@ -123,19 +123,22 @@ def test_causal(self):
         assert torch.triu(block.att_mat, diagonal=1).sum() == 0
 
     def test_masked_selfattention(self):
-        n = 4
+        n = 64
         block = SABlock(hidden_size=128, num_heads=1, dropout_rate=0.1, sequence_length=16, save_attn=True)
         input_shape = (1, n, 128)
-        mask = torch.tensor([[1, 1, 1, 0]]).bool()
+        # generate a mask randomly with zeros and ones of shape (1, n)
+        mask = torch.randint(0, 2, (1, n)).bool()
         block(torch.randn(input_shape), attn_mask=mask)
-        att_mat = block.att_mat.squeeze(1)
-        # get the masked row and the remaining ones based on mask 0 values
-        rows_true = att_mat[mask, :]
-        rows_false = att_mat[~mask, :]
-        # check that in false rows every element is equal to 1/4
-        assert torch.allclose(rows_false, torch.ones_like(rows_false) / n)
-        # check that in true rows the mask column is zero
-        assert torch.allclose(rows_true[:, -1], torch.zeros_like(rows_true[:, -1]))
+        att_mat = block.att_mat.squeeze()
+        # ensure all masked columns are zeros
+        assert torch.allclose(att_mat[:, ~mask.squeeze(0)], torch.zeros_like(att_mat[:, ~mask.squeeze(0)]))
+
+    def test_causal_and_mask(self):
+        with self.assertRaises(AssertionError):
+            block = SABlock(hidden_size=128, num_heads=1, causal=True, sequence_length=64)
+            inputs = torch.randn(2, 64, 128)
+            mask = torch.randint(0, 2, (2, 64)).bool()
+            block(inputs, attn_mask=mask)
 
     @skipUnless(has_einops, "Requires einops")
     def test_access_attn_matrix(self):
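
The new test_causal_and_mask pins down the assert added to forward(): causal=True and an explicit attn_mask are mutually exclusive. The same check, sketched interactively with the values from the test:

import torch
from monai.networks.blocks.selfattention import SABlock

block = SABlock(hidden_size=128, num_heads=1, causal=True, sequence_length=64)
inputs = torch.randn(2, 64, 128)
mask = torch.randint(0, 2, (2, 64)).bool()
try:
    block(inputs, attn_mask=mask)
except AssertionError as err:
    print(err)  # Causal attention does not support attention masks.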
