Take present_key into account when generating mask_shape

voltjia · voltjia · commit 5a2541740c53 · 2025-08-01T16:10:52.000+08:00
diff --git a/src/ntops/torch.py b/src/ntops/torch.py
@@ -473,7 +473,9 @@ def scaled_dot_product_attention(
             "Number of heads in `query` must be divisible by number of heads in `key` and `value` when GQA is enabled."
         )
 
-    mask_shape = query.shape[:-1] + (key.shape[-2],)
+    mask_shape = query.shape[:-1] + (
+        key.shape[-2] if present_key is None else key.shape[-2] + present_key.shape[-2],
+    )
 
     if attn_mask is not None:
         with_attn_mask = True

Original file line number	Diff line number	Diff line change
`@@ -473,7 +473,9 @@ def scaled_dot_product_attention(`
`473`	`473`	"Number of heads in `query` must be divisible by number of heads in `key` and `value` when GQA is enabled."
`474`	`474`	`)`
`475`	`475`
`476`		`- mask_shape = query.shape[:-1] + (key.shape[-2],)`
	`476`	`+ mask_shape = query.shape[:-1] + (`
	`477`	`+ key.shape[-2] if present_key is None else key.shape[-2] + present_key.shape[-2],`
	`478`	`+ )`
`477`	`479`
`478`	`480`	`if attn_mask is not None:`
`479`	`481`	`with_attn_mask = True`