Skip to content

Commit d493934

Browse files
committed
Add with_attn_mask flag to conditionally apply attention mask
1 parent e4009f1 commit d493934

File tree

2 files changed

+29
-9
lines changed

2 files changed

+29
-9
lines changed

src/ntops/kernels/scaled_dot_product_attention.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ def arrangement(
1919
attn_mask,
2020
scale,
2121
output,
22+
with_attn_mask,
2223
with_kv_cache,
2324
BLOCK_SIZE_M=BLOCK_SIZE_M,
2425
BLOCK_SIZE_N=BLOCK_SIZE_N,
@@ -68,6 +69,7 @@ def arrange_attn_mask(input):
6869
attn_mask_arranged = arrange_attn_mask(attn_mask)
6970
scale_arranged = scale
7071
output_arranged = arrange_query_or_output(output)
72+
with_attn_mask_arranged = with_attn_mask
7173

7274
if with_kv_cache:
7375
return (
@@ -81,6 +83,7 @@ def arrange_attn_mask(input):
8183
attn_mask_arranged,
8284
scale_arranged,
8385
output_arranged,
86+
with_attn_mask_arranged,
8487
)
8588

8689
return (
@@ -90,6 +93,7 @@ def arrange_attn_mask(input):
9093
attn_mask_arranged,
9194
scale_arranged,
9295
output_arranged,
96+
with_attn_mask_arranged,
9397
)
9498

9599

@@ -104,14 +108,19 @@ def application_with_kv_cache(
104108
attn_mask,
105109
scale,
106110
output,
111+
with_attn_mask,
107112
):
108113
present_key_slot = present_key # noqa: F841
109114
present_value_slot = present_value # noqa: F841
110115

111-
application_without_kv_cache(query, key, value, attn_mask, scale, output)
116+
application_without_kv_cache(
117+
query, key, value, attn_mask, scale, output, with_attn_mask
118+
)
112119

113120

114-
def application_without_kv_cache(query, key, value, attn_mask, scale, output):
121+
def application_without_kv_cache(
122+
query, key, value, attn_mask, scale, output, with_attn_mask
123+
):
115124
for i in range(query.shape[0]):
116125
query_i = (1.4426950408889634 * scale * query[i]).to(query[i].dtype)
117126

@@ -120,9 +129,12 @@ def application_without_kv_cache(query, key, value, attn_mask, scale, output):
120129
max = ntl.full((query_i.shape[-2],), float("-inf"), dtype=ntl.float32)
121130

122131
for j in range(key.shape[0]):
123-
qk = ntl.dot(query_i, ntl.trans(key[j])) + attn_mask[j]
132+
qk = ntl.dot(query_i, ntl.trans(key[j]))
124133
qk = ntl.where(key[j].offsets(-2) < key.source.shape[-2], qk, float("-inf"))
125134

135+
if with_attn_mask:
136+
qk += attn_mask[j]
137+
126138
next_max = ntl.maximum(max, ntl.max(qk, 1))
127139
stable_qk = ntl.exp2(qk - next_max[:, None])
128140

@@ -156,6 +168,7 @@ def make(with_kv_cache):
156168
for _ in range(4)
157169
)
158170
scale = Tensor(0)
171+
with_attn_mask = Tensor(0, constexpr=True)
159172

160173
if with_kv_cache:
161174
application = application_with_kv_cache
@@ -173,6 +186,7 @@ def make(with_kv_cache):
173186
attn_mask,
174187
scale,
175188
output,
189+
with_attn_mask,
176190
)
177191

178192
return ninetoothed.make(

src/ntops/torch.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,12 +342,17 @@ def scaled_dot_product_attention(
342342

343343
mask_shape = query.shape[:-1] + (key.shape[-2],)
344344

345-
if attn_mask is None:
346-
attn_mask = torch.zeros(mask_shape, dtype=query.dtype, device=query.device)
347-
elif attn_mask.dtype == torch.bool:
348-
attn_mask = torch.where(attn_mask, 0, float("-inf"))
345+
if attn_mask is not None:
346+
with_attn_mask = True
349347

350-
attn_mask = attn_mask.expand(mask_shape)
348+
if attn_mask.dtype == torch.bool:
349+
attn_mask = torch.where(attn_mask, 0, float("-inf"))
350+
351+
attn_mask = attn_mask.expand(mask_shape)
352+
else:
353+
with_attn_mask = False
354+
355+
attn_mask = torch.empty(mask_shape, device="meta")
351356

352357
if scale is None:
353358
scale = 1 / math.sqrt(query.shape[-1])
@@ -373,9 +378,10 @@ def scaled_dot_product_attention(
373378
attn_mask,
374379
scale,
375380
output,
381+
with_attn_mask,
376382
)
377383
else:
378-
kernel(query, key, value, attn_mask, scale, output)
384+
kernel(query, key, value, attn_mask, scale, output, with_attn_mask)
379385

380386
return output
381387

0 commit comments

Comments (0)