
Commit b5835f1

Chidera Ibe authored and claude committed
Revert DyT to RMSNorm + SGD momentum SLOT (novel eval improvement)
DyT hurt on the sliding eval (1.1307 vs 1.1263), so revert to RMSNorm. Novel change: replace AdamW with SGD + momentum(0.9) for SLOT optimization. PR openai#995 showed SGD + momentum beats AdamW for TTT by 0.036 BPB; nobody has tried SGD for SLOT specifically.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ee34ab6 · commit b5835f1

1 file changed: train_gpt.py (4 additions, 3 deletions)
@@ -540,8 +540,8 @@ def __init__(
         dtg: bool = False,
     ):
         super().__init__()
-        self.attn_norm = DyT(dim)  # DyT replaces RMSNorm (arXiv:2503.10622)
-        self.mlp_norm = DyT(dim)
+        self.attn_norm = RMSNorm()
+        self.mlp_norm = RMSNorm()
         self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init)
         self.mlp = MLP(dim, mlp_mult)
         self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32))
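The hunk above swaps DyT back for a parameter-free RMSNorm (the diff calls `RMSNorm()` with no arguments). A minimal runnable sketch of what such a module could look like, under the assumption that the repo's variant has no learnable gain; this is illustrative, not the repo's actual implementation:

```python
import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    """Sketch of a parameter-free RMSNorm (assumed variant; no learnable gain)."""

    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale each vector by the reciprocal of its root-mean-square
        # over the last (feature) dimension.
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)


torch.manual_seed(0)
x = torch.randn(2, 4, 8)
y = RMSNorm()(x)
```

Unlike LayerNorm, RMSNorm does not subtract the mean, and this no-gain variant adds zero parameters, which is why the diff can construct it without passing `dim`.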
@@ -874,7 +874,8 @@ def eval_val_sliding_slot(
         valid_count = mask.sum()
         delta = torch.zeros(bsz, 1, hidden_f.size(-1), device=device, dtype=torch.float32, requires_grad=True)
         logit_bias = torch.zeros(bsz, 1, proj_w.size(0), device=device, dtype=torch.float32, requires_grad=True)
-        slot_opt = torch.optim.AdamW([delta, logit_bias], lr=slot_lr)
+        # SGD+momentum for SLOT (inspired by PR #995: SGD beats AdamW for TTT)
+        slot_opt = torch.optim.SGD([delta, logit_bias], lr=slot_lr, momentum=0.9)
         targets_flat = y_batch.reshape(-1)
         for _step in range(slot_steps):
             _lr = slot_lr_min + 0.5 * (slot_lr - slot_lr_min) * (1 + math.cos(math.pi * _step / slot_steps))
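To make the change above concrete, here is a self-contained sketch of a SLOT-style inner loop with the new `torch.optim.SGD(momentum=0.9)` optimizer and the cosine learning-rate decay visible in the hunk. Shapes and the loss computation (`hidden_f + delta` projected through a frozen `proj_w`, plus `logit_bias`) are illustrative stand-ins for the real `eval_val_sliding_slot` locals, not the repo's exact code:

```python
import math

import torch
import torch.nn.functional as F

torch.manual_seed(0)

# Illustrative stand-in shapes and hyperparameters (not the repo's values).
bsz, seq, hidden, vocab = 2, 5, 16, 32
slot_lr, slot_lr_min, slot_steps = 1e-2, 1e-3, 3

hidden_f = torch.randn(bsz, seq, hidden)   # frozen final hidden states
proj_w = torch.randn(vocab, hidden)        # frozen output projection
targets_flat = torch.randint(0, vocab, (bsz * seq,))

# Test-time parameters: a per-sequence hidden-state delta and a logit bias,
# broadcast across the sequence dimension.
delta = torch.zeros(bsz, 1, hidden, requires_grad=True)
logit_bias = torch.zeros(bsz, 1, vocab, requires_grad=True)
slot_opt = torch.optim.SGD([delta, logit_bias], lr=slot_lr, momentum=0.9)

for _step in range(slot_steps):
    # Cosine decay from slot_lr down to slot_lr_min, as in the diff.
    _lr = slot_lr_min + 0.5 * (slot_lr - slot_lr_min) * (
        1 + math.cos(math.pi * _step / slot_steps)
    )
    for group in slot_opt.param_groups:
        group["lr"] = _lr

    logits = (hidden_f + delta) @ proj_w.T + logit_bias
    loss = F.cross_entropy(logits.reshape(-1, vocab), targets_flat)
    slot_opt.zero_grad()
    loss.backward()
    slot_opt.step()
```

Since only two small tensors are optimized for a handful of steps, SGD's momentum buffer is the sole optimizer state, whereas AdamW would also track per-element second moments; the commit's claim is that in this few-step test-time regime the simpler update works better.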

0 commit comments