
Commit bb68a8c

take care of vlm key / values having a different number of attention heads than the main model
1 parent 322f2fe commit bb68a8c

3 files changed: +10 / -4 lines changed

pyproject.toml (1 addition, 1 deletion)

@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.6.2"
+version = "2.6.3"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_x_transformers.py (2 additions, 2 deletions)

@@ -1228,8 +1228,8 @@ def test_external_key_values():
     seq = torch.randint(0, 20000, (3, 1024))

     key_values = [
-        (torch.randn(3, 8, 32, 16), torch.randn(3, 8, 32, 16)),
-        (torch.randn(3, 8, 32, 16), torch.randn(3, 8, 32, 16)),
+        (torch.randn(3, 2, 32, 16), torch.randn(3, 2, 32, 16)),
+        (torch.randn(3, 2, 32, 16), torch.randn(3, 2, 32, 16)),
     ]

     additional_kv_mask = torch.randint(0, 2, (3, 32)).bool()
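The test now feeds external key / value tensors with only 2 heads, so it exercises the new head-expansion path. A minimal sketch of the shape relationship the change relies on; the 8 query heads are an assumption about the test model, inferred from the original tensors:

import torch

# external key tensor as in the updated test: batch 3, 2 kv heads, 32 positions, 16 dims per head
added_k = torch.randn(3, 2, 32, 16)

query_heads = 8                      # assumed query head count of the test model
added_kv_heads = added_k.shape[1]    # 2

# the patch requires the query head count to be divisible by the external kv head count
assert query_heads % added_kv_heads == 0
repeat_factor = query_heads // added_kv_heads   # each external kv head is repeated 4 times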

x_transformers/x_transformers.py (7 additions, 1 deletion)

@@ -1795,14 +1795,20 @@ def forward(
         seq_len = k.shape[-2]

         added_k, added_v = additional_key_values
+        added_kv_heads, added_kv_len = added_k.shape[1], added_k.shape[-2]
+
+        # take care of expanding to query heads if mismatch between key / value heads with the ones coming from vlm
+
+        if added_kv_heads != kv_h:
+            assert divisible_by(h, added_kv_heads)
+            k, v, added_k, added_v = tuple(repeat(t, 'b h ... -> b (r h) ...', r = h // t.shape[1]) for t in (k, v, added_k, added_v))

         k = cat((added_k, k), dim = -2)
         v = cat((added_v, v), dim = -2)

         if (exists(input_mask) or exists(additional_key_value_mask)):

             if not exists(additional_key_value_mask):
-                added_kv_len = added_k.shape[-2]
                 input_mask = pad_at_dim(input_mask, (added_kv_len, 0), dim = -1, value = True)
             elif not exists(input_mask):
                 input_mask = pad_at_dim(additional_key_value_mask, (0, seq_len), dim = -1, value = True)
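For reference, a self-contained sketch of what the patched block does, written against plain torch and einops; the function name, the example shapes, and the inline shape comments are assumptions for illustration, not part of the library's API:

import torch
from torch import cat
from einops import repeat

def merge_additional_key_values(k, v, added_k, added_v, query_heads):
    # k, v:              (batch, kv_heads, seq_len, dim_head) from the current attention layer
    # added_k, added_v:  (batch, added_kv_heads, added_len, dim_head), e.g. coming from a VLM
    kv_heads, added_kv_heads = k.shape[1], added_k.shape[1]

    if added_kv_heads != kv_heads:
        # expand every tensor up to the query head count; requires divisibility
        assert query_heads % added_kv_heads == 0
        k, v, added_k, added_v = tuple(
            repeat(t, 'b h ... -> b (r h) ...', r = query_heads // t.shape[1])
            for t in (k, v, added_k, added_v)
        )

    # prepend the external keys / values along the sequence dimension
    k = cat((added_k, k), dim = -2)
    v = cat((added_v, v), dim = -2)
    return k, v

# example matching the updated test: 8 query heads, external key / values with 2 heads
k = torch.randn(3, 8, 1024, 16)
v = torch.randn(3, 8, 1024, 16)
added_k = torch.randn(3, 2, 32, 16)
added_v = torch.randn(3, 2, 32, 16)

k, v = merge_additional_key_values(k, v, added_k, added_v, query_heads = 8)
print(k.shape)  # torch.Size([3, 8, 1056, 16])

The expansion mirrors grouped-query attention: when the external key / values carry fewer heads than the model's queries, each head is repeated until the counts match, and only then are the external entries concatenated in front of the layer's own keys and values.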
