Skip to content

Commit 57119f0

Browse files
committed
Add support for is_causal
1 parent d493934 commit 57119f0

File tree

3 files changed

+30
-9
lines changed

3 files changed

+30
-9
lines changed

src/ntops/kernels/scaled_dot_product_attention.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def arrangement(
1717
present_key_slot,
1818
present_value_slot,
1919
attn_mask,
20+
is_causal,
2021
scale,
2122
output,
2223
with_attn_mask,
@@ -67,6 +68,7 @@ def arrange_attn_mask(input):
6768
present_value_slot
6869
)
6970
attn_mask_arranged = arrange_attn_mask(attn_mask)
71+
is_causal_arranged = is_causal
7072
scale_arranged = scale
7173
output_arranged = arrange_query_or_output(output)
7274
with_attn_mask_arranged = with_attn_mask
@@ -81,6 +83,7 @@ def arrange_attn_mask(input):
8183
present_key_slot_arranged,
8284
present_value_slot_arranged,
8385
attn_mask_arranged,
86+
is_causal_arranged,
8487
scale_arranged,
8588
output_arranged,
8689
with_attn_mask_arranged,
@@ -91,6 +94,7 @@ def arrange_attn_mask(input):
9194
key_arranged,
9295
value_arranged,
9396
attn_mask_arranged,
97+
is_causal_arranged,
9498
scale_arranged,
9599
output_arranged,
96100
with_attn_mask_arranged,
@@ -106,6 +110,7 @@ def application_with_kv_cache(
106110
present_key_slot,
107111
present_value_slot,
108112
attn_mask,
113+
is_causal,
109114
scale,
110115
output,
111116
with_attn_mask,
@@ -114,12 +119,12 @@ def application_with_kv_cache(
114119
present_value_slot = present_value # noqa: F841
115120

116121
application_without_kv_cache(
117-
query, key, value, attn_mask, scale, output, with_attn_mask
122+
query, key, value, attn_mask, is_causal, scale, output, with_attn_mask
118123
)
119124

120125

121126
def application_without_kv_cache(
122-
query, key, value, attn_mask, scale, output, with_attn_mask
127+
query, key, value, attn_mask, is_causal, scale, output, with_attn_mask
123128
):
124129
for i in range(query.shape[0]):
125130
query_i = (1.4426950408889634 * scale * query[i]).to(query[i].dtype)
@@ -135,6 +140,10 @@ def application_without_kv_cache(
135140
if with_attn_mask:
136141
qk += attn_mask[j]
137142

143+
if is_causal:
144+
mask = query[i].offsets(-2)[:, None] >= key[j].offsets(-2)[None, :]
145+
qk = ntl.where(mask, qk, float("-inf"))
146+
138147
next_max = ntl.maximum(max, ntl.max(qk, 1))
139148
stable_qk = ntl.exp2(qk - next_max[:, None])
140149

@@ -168,7 +177,7 @@ def make(with_kv_cache):
168177
for _ in range(4)
169178
)
170179
scale = Tensor(0)
171-
with_attn_mask = Tensor(0, constexpr=True)
180+
is_causal, with_attn_mask = (Tensor(0, constexpr=True) for _ in range(2))
172181

173182
if with_kv_cache:
174183
application = application_with_kv_cache
@@ -184,6 +193,7 @@ def make(with_kv_cache):
184193
present_key_slot,
185194
present_value_slot,
186195
attn_mask,
196+
is_causal,
187197
scale,
188198
output,
189199
with_attn_mask,

src/ntops/torch.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,10 +336,12 @@ def scaled_dot_product_attention(
336336
):
337337
# TODO: Support `dropout_p`.
338338
assert dropout_p == 0, "`dropout_p` is not supported yet."
339-
# TODO: Support `is_causal`.
340-
assert not is_causal, "`is_causal` is not supported yet."
341339
assert enable_gqa, "GQA must be enabled for now."
342340

341+
assert attn_mask is None or not is_causal, (
342+
"Cannot use `attn_mask` and `is_causal` together."
343+
)
344+
343345
mask_shape = query.shape[:-1] + (key.shape[-2],)
344346

345347
if attn_mask is not None:
@@ -376,12 +378,13 @@ def scaled_dot_product_attention(
376378
present_key_slot,
377379
present_value_slot,
378380
attn_mask,
381+
is_causal,
379382
scale,
380383
output,
381384
with_attn_mask,
382385
)
383386
else:
384-
kernel(query, key, value, attn_mask, scale, output, with_attn_mask)
387+
kernel(query, key, value, attn_mask, is_causal, scale, output, with_attn_mask)
385388

386389
return output
387390

tests/test_scaled_dot_product_attention.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,17 @@ def _generate_random_size():
1717
arguments = []
1818

1919
attn_mask_types = (None, torch.bool, torch.float32)
20+
is_causal_values = (False, True)
2021
scales = (None, random.uniform(0.05, 0.5))
2122
dtypes = (torch.float32, torch.float16)
2223
with_kv_cache_values = (False, True)
2324

24-
for attn_mask_type, scale, dtype, with_kv_cache in itertools.product(
25-
attn_mask_types, scales, dtypes, with_kv_cache_values
25+
for attn_mask_type, is_causal, scale, dtype, with_kv_cache in itertools.product(
26+
attn_mask_types, is_causal_values, scales, dtypes, with_kv_cache_values
2627
):
28+
if attn_mask_type is not None and is_causal:
29+
continue
30+
2731
batch_size = random.randint(1, 4)
2832
num_heads_q = 2 ** random.randint(1, 5)
2933
seq_len_q = _generate_random_size()
@@ -49,6 +53,7 @@ def _generate_random_size():
4953
num_heads_kv,
5054
seq_len_kv,
5155
attn_mask_type,
56+
is_causal,
5257
scale,
5358
enable_gqa,
5459
with_kv_cache,
@@ -59,7 +64,7 @@ def _generate_random_size():
5964
)
6065

6166
return (
62-
"batch_size, num_heads_q, seq_len_q, head_dim, num_heads_kv, seq_len_kv, attn_mask_type, scale, enable_gqa, with_kv_cache, dtype, atol, rtol",
67+
"batch_size, num_heads_q, seq_len_q, head_dim, num_heads_kv, seq_len_kv, attn_mask_type, is_causal, scale, enable_gqa, with_kv_cache, dtype, atol, rtol",
6368
arguments,
6469
)
6570

@@ -74,6 +79,7 @@ def test_cuda(
7479
num_heads_kv,
7580
seq_len_kv,
7681
attn_mask_type,
82+
is_causal,
7783
scale,
7884
enable_gqa,
7985
with_kv_cache,
@@ -129,6 +135,7 @@ def _generate_present_and_slot(tensor):
129135
key,
130136
value,
131137
attn_mask=attn_mask,
138+
is_causal=is_causal,
132139
scale=scale,
133140
enable_gqa=enable_gqa,
134141
present_key=present_key,
@@ -141,6 +148,7 @@ def _generate_present_and_slot(tensor):
141148
key_cloned,
142149
value_cloned,
143150
attn_mask=attn_mask,
151+
is_causal=is_causal,
144152
scale=scale,
145153
enable_gqa=enable_gqa,
146154
)

0 commit comments

Comments (0)