
Commit fc1db87

complete the qk clip on transformer wrapper / attention layers for muon training
1 parent 349f030 commit fc1db87

File tree

pyproject.toml
tests/test_x_transformers.py
x_transformers/x_transformers.py

3 files changed: +41 −2 lines


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "x-transformers"
-version = "2.7.4"
+version = "2.7.6"
 description = "X-Transformers"
 authors = [
     { name = "Phil Wang", email = "[email protected]" }

tests/test_x_transformers.py

Lines changed: 16 additions & 1 deletion
@@ -1315,7 +1315,7 @@ def test_simple_mdlm(
     loss = nar(seq)
     loss.loss.backward()
 
-def test_qk_clip():
+def test_qk_clip_attn():
     from x_transformers import Attention
 
     x = torch.randn(1, 1024, 512)
@@ -1325,3 +1325,18 @@ def test_qk_clip():
     out, intermediates = attn(x, return_intermediates = True)
 
     attn.qk_clip_(intermediates, tau = 100)
+
+def test_qk_clip_attn_layers():
+    from x_transformers import TransformerWrapper, Decoder
+
+    model = TransformerWrapper(
+        num_tokens = 256,
+        max_seq_len = 1024,
+        attn_layers = Decoder(dim = 512, depth = 2)
+    )
+
+    seq = torch.randint(0, 256, (1, 1024))
+
+    out, intermediates = model(seq, return_intermediates = True)
+
+    model.attn_qk_clip_(intermediates)
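
The new test_qk_clip_attn_layers above exercises the hook in isolation. Below is a minimal sketch, not part of this commit, of how the same call might slot into a training step. The AdamW optimizer, learning rate, batch shape, and loss are stand-ins for illustration, since no Muon implementation ships in this diff; the relevant pattern is a forward pass with return_intermediates = True, backward and optimizer step, then the in-place clip.

import torch
import torch.nn.functional as F

from x_transformers import TransformerWrapper, Decoder

model = TransformerWrapper(
    num_tokens = 256,
    max_seq_len = 1024,
    attn_layers = Decoder(dim = 512, depth = 2)
)

# stand-in optimizer for this sketch; a Muon optimizer would be used in practice
optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)

seq = torch.randint(0, 256, (2, 1024))

# keep the intermediates so the attention logits are available for clipping
logits, intermediates = model(seq, return_intermediates = True)

# next-token prediction loss
loss = F.cross_entropy(
    logits[:, :-1].reshape(-1, logits.shape[-1]),
    seq[:, 1:].reshape(-1)
)

loss.backward()
optimizer.step()
optimizer.zero_grad()

# after the weight update, rescale query / key projections in place where needed
model.attn_qk_clip_(intermediates, tau = 100.)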

x_transformers/x_transformers.py

Lines changed: 24 additions & 0 deletions
@@ -2462,6 +2462,23 @@ def __init__(
 
         self.can_cache_kv = all([module.can_cache_kv for module in self.modules() if isinstance(module, Attention)])
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        # pairs up the attention intermediates with each attention module and does qk clip proposed by kimi team
+
+        layer_and_layer_types = (self.layers, self.layer_types)
+
+        attn_layers = [layer for (_, layer, _), layer_type in zip(self.layers, self.layer_types) if layer_type in ('a', 'c')]
+        attn_intermeds = intermediates.attn_intermediates
+
+        assert len(attn_layers) == len(attn_intermeds)
+
+        for attn_layer, attn_inter in zip(attn_layers, attn_intermeds):
+            attn_layer.qk_clip_(attn_inter, tau = tau)
+
     def forward(
         self,
         x,
@@ -3192,6 +3209,13 @@ def init_(self):
         if not isinstance(self.pos_emb, always):
             nn.init.normal_(self.pos_emb.emb.weight, std = 1e-5)
 
+    def attn_qk_clip_(
+        self,
+        intermediates: LayerIntermediates,
+        tau = 100.
+    ):
+        self.attn_layers.attn_qk_clip_(intermediates, tau = tau)
+
     def forward(
         self,
         x,
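
For context, Attention.qk_clip_ itself was added in an earlier commit, so its body does not appear in this diff; what attn_qk_clip_ ultimately delegates to, per attention module, is the QK-Clip rule credited to the Kimi team in the code comment above. The following is a standalone sketch of that rule as it is generally described, not the library's implementation, using hypothetical per-head weight tensors w_q and w_k.

import torch

@torch.no_grad()
def qk_clip_sketch(w_q, w_k, s_max, tau = 100.):
    # w_q, w_k: hypothetical query / key projection weights for one attention head
    # s_max: maximum pre-softmax attention logit observed for that head

    if s_max <= tau:
        return

    # scale both projections by sqrt(tau / s_max) so recomputed logits shrink by
    # a factor of tau / s_max, bringing them back under the threshold
    scale = (tau / s_max) ** 0.5
    w_q.mul_(scale)
    w_k.mul_(scale)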
