@@ -4,8 +4,8 @@
 from typing import Tuple, Callable
 
 import torch
-from torch.nn import Module
-from torch import nn, einsum, Tensor
+from torch.nn import Module, Parameter
+from torch import cat, nn, einsum, Tensor
 
 import torch.nn.functional as F
 
 from collections import namedtuple
@@ -176,6 +176,7 @@ def __init__(
         softclamp_logits = False,
         logit_softclamp_value = 50.,
         add_zero_kv = False,
+        head_learned_sink = False,
         selective = False,
         hard = False,
         cope = None,
@@ -254,6 +255,13 @@ def __init__(
 
         self.add_zero_kv = add_zero_kv
 
+        # learned sink concatted pre-softmax, working solution from gpt-oss
+
+        assert not (head_learned_sink and flash), 'not supported for flash attention yet'
+
+        self.head_learned_sink = head_learned_sink
+        self.head_attn_sink = Parameter(torch.zeros(heads)) if head_learned_sink else None
+
         # soft clamp attention logit value
 
         if softclamp_logits:
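
For readers who have not met attention sinks before: as the comment notes, the trick is borrowed from gpt-oss. Each head gets one extra learned logit that takes part in the softmax normalization but whose weight is discarded afterwards, so a head can shrink the total probability mass it spreads over real positions. The assert above reflects that a fused flash kernel computes its softmax internally, leaving no exposed logits matrix to concatenate a sink column onto (my reading; the diff only states it is unsupported). A minimal single-head sketch of the behavior, illustrative only and not this module's code:

import torch
import torch.nn.functional as F

# one row of attention logits for a single head, plus one learned sink logit
logits = torch.randn(6)                          # scores against 6 real positions
sink = torch.zeros(1, requires_grad = True)      # learned scalar, zero-initialized as in the diff

attn = F.softmax(torch.cat((sink, logits)), dim = -1)  # softmax over sink + real positions
attn = attn[1:]                                  # drop the sink weight before aggregating values

print(attn.sum())                                # < 1: the head can park mass on the sink
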
@@ -315,10 +323,10 @@ def flash_attn(
         if self.l2_distance:
             k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
             k = F.pad(k, (0, 1), value = -1.)
-            k = torch.cat((k, k_norm_sq), dim = -1)
+            k = cat((k, k_norm_sq), dim = -1)
 
             q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
-            q = torch.cat((2 * q, q_norm_sq), dim = -1)
+            q = cat((2 * q, q_norm_sq), dim = -1)
             q = F.pad(q, (0, 1), value = -1.)
 
         # handle scale - by default they scale by dim_head ** -0.5, but need to take care if using cosine sim attention
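
A side note on the l2_distance branch touched here: the padding and concatenation encode the identity dot(q_aug, k_aug) = 2 q·k - |q|^2 - |k|^2 = -|q - k|^2, which lets a plain dot-product (flash) kernel compute negative squared L2 distances. A quick standalone check of that identity, with made-up shapes:

import torch
import torch.nn.functional as F

q = torch.randn(2, 16, 8)                        # (batch, seq, dim_head), sizes are illustrative
k = torch.randn(2, 16, 8)

# same augmentation as in the hunk above
k_norm_sq = k.norm(dim = -1, keepdim = True) ** 2
k_aug = torch.cat((F.pad(k, (0, 1), value = -1.), k_norm_sq), dim = -1)

q_norm_sq = q.norm(dim = -1, keepdim = True) ** 2
q_aug = F.pad(torch.cat((2 * q, q_norm_sq), dim = -1), (0, 1), value = -1.)

sim = torch.einsum('b i d, b j d -> b i j', q_aug, k_aug)

assert torch.allclose(sim, -torch.cdist(q, k) ** 2, atol = 1e-4)
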
@@ -509,6 +517,11 @@ def forward(
         if self.selective:
             sim = selective_attn(sim)
 
+        if self.head_learned_sink:
+            # add learned attention sink
+            attn_sink = repeat(self.head_attn_sink, 'h -> b h i 1', b = sim.shape[0], i = sim.shape[2])
+            sim = cat((attn_sink, sim), dim = -1)
+
         pre_softmax_attn = sim
 
         attn = self.attn_fn(sim)
@@ -517,6 +530,10 @@ def forward(
 
         post_softmax_attn = attn
 
+        if self.head_learned_sink:
+            # remove attention sink
+            attn = attn[..., 1:]
+
         attn = self.attn_dropout(attn)
 
         if exists(self.post_softmax_talking_heads):
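
Putting the two forward hunks together: the per-head sink is broadcast to an extra leftmost column of logits, concatenated before self.attn_fn (the softmax), and that column is sliced off once the weights are normalized. A hedged end-to-end sketch with made-up sizes, omitting the library's masking, dropout, and talking-heads steps (repeat is presumably already imported from einops elsewhere in the file, since the hunk does not add it):

import torch
from torch.nn import Parameter
from einops import repeat

b, h, i, j, d = 2, 4, 16, 16, 32                 # illustrative sizes
head_attn_sink = Parameter(torch.zeros(h))       # one learned sink logit per head

q = torch.randn(b, h, i, d)
k = torch.randn(b, h, j, d)
v = torch.randn(b, h, j, d)

sim = torch.einsum('b h i d, b h j d -> b h i j', q, k) * d ** -0.5

# add learned attention sink as an extra leftmost column of the logits
attn_sink = repeat(head_attn_sink, 'h -> b h i 1', b = b, i = i)
sim = torch.cat((attn_sink, sim), dim = -1)      # (b, h, i, j + 1)

attn = sim.softmax(dim = -1)
attn = attn[..., 1:]                             # remove the sink column after normalizing

out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)

print(attn.sum(dim = -1).max())                  # strictly < 1: mass parked on the sink is discarded
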